diff --git a/client/client.go b/client/client.go index 50dc712..a8c69b4 100644 --- a/client/client.go +++ b/client/client.go @@ -1,10 +1,12 @@ package client import ( + "bytes" "context" "github.com/chromedp/cdproto/dom" "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" + "github.com/musabgultekin/chardet" "github.com/pkg/errors" "golang.org/x/net/html/charset" "io" @@ -26,6 +28,9 @@ type Client struct { *http.Client } +const DefaultUserAgent = "Geziyor 1.0" +const DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB + // NewClient creates http.Client with modified values for typical web scraper func NewClient() *Client { client := &http.Client{ @@ -70,11 +75,11 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD // Limit response body reading bodyReader := io.LimitReader(resp.Body, maxBodySize) - // Start reading body and determine encoding - if !charsetDetectDisabled && resp.Request.Method != "HEAD" { - bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type")) + // Convert response if encoding provided + if req.Encoding != "" && resp.Request.Method != "HEAD" { + bodyReader, err = charset.NewReader(bodyReader, "text/html; charset="+req.Encoding) if err != nil { - return nil, errors.Wrap(err, "Determine encoding error") + return nil, errors.Wrap(err, "Reading provided encoding error") } } @@ -83,6 +88,27 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD return nil, errors.Wrap(err, "Reading body error") } + // Decoding body + if req.Encoding == "" && resp.Request.Method != "HEAD" { + contentType := resp.Header.Get("Content-Type") + // Charset detection + // If enabled and charset not provided in content-type + if !charsetDetectDisabled && !strings.Contains(contentType, "charset") { + if res, err := chardet.NewHtmlDetector().DetectBest(body); err == nil { + contentType = "text/html; charset=" + res.Charset + } + } + convertedReader, err := charset.NewReader(bytes.NewReader(body), contentType) + if err != nil { + return nil, errors.Wrap(err, "Determine encoding error") + } + convertedBody, err := ioutil.ReadAll(convertedReader) + if err != nil { + return nil, errors.Wrap(err, "Determine encoding error") + } + body = convertedBody + } + response := Response{ Response: resp, Body: body, @@ -137,7 +163,8 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) { return nil, errors.Wrap(err, "Request getting rendered error") } - // Set new URL in case of redirection + // Update changed data + req.Header = ConvertMapToHeader(res.RequestHeaders) req.URL, _ = url.Parse(res.URL) response := Response{ diff --git a/client/client_test.go b/client/client_test.go index 2e2731c..b5ddc0d 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -1,7 +1,9 @@ package client import ( + "fmt" "net/http" + "net/http/httptest" "reflect" "testing" ) @@ -89,3 +91,49 @@ func TestConvertMapToHeader(t *testing.T) { }) } } + +func TestCharsetFromHeaders(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain; charset=iso-8859-9") + fmt.Fprint(w, "G\xfcltekin") + })) + defer ts.Close() + + req, _ := NewRequest("GET", ts.URL, nil) + res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false) + + if string(res.Body) != "Gültekin" { + t.Fatal(string(res.Body)) + } +} + +func TestCharsetFromBody(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + fmt.Fprint(w, "G\xfcltekin") + })) + defer ts.Close() + + req, _ := NewRequest("GET", ts.URL, nil) + res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false) + + if string(res.Body) != "Gültekin" { + t.Fatal(string(res.Body)) + } +} + +func TestCharsetProvidedWithRequest(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + fmt.Fprint(w, "G\xfcltekin") + })) + defer ts.Close() + + req, _ := NewRequest("GET", ts.URL, nil) + req.Encoding = "windows-1254" + res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false) + + if string(res.Body) != "Gültekin" { + t.Fatal(string(res.Body)) + } +} diff --git a/client/request.go b/client/request.go index 6b3872d..4dc6892 100644 --- a/client/request.go +++ b/client/request.go @@ -12,6 +12,7 @@ type Request struct { Synchronized bool Rendered bool Cancelled bool + Encoding string } // Cancel request diff --git a/geziyor.go b/geziyor.go index a60bfec..8b10c86 100644 --- a/geziyor.go +++ b/geziyor.go @@ -53,10 +53,10 @@ func NewGeziyor(opt *Options) *Geziyor { } if opt.UserAgent == "" { - geziyor.Opt.UserAgent = "Geziyor 1.0" + geziyor.Opt.UserAgent = client.DefaultUserAgent } if opt.MaxBodySize == 0 { - geziyor.Opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB + geziyor.Opt.MaxBodySize = client.DefaultMaxBody } if opt.Cache != nil { geziyor.Client.Transport = &httpcache.Transport{ diff --git a/geziyor_test.go b/geziyor_test.go index 4bfbe01..21a9d3c 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -13,7 +13,6 @@ import ( "net/http" "net/http/httptest" "testing" - "unicode/utf8" ) func TestSimple(t *testing.T) { @@ -175,33 +174,6 @@ func TestExtractor(t *testing.T) { }).Start() } -func TestCharsetDetection(t *testing.T) { - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - fmt.Fprint(w, "\xf0ültekin") - })) - defer ts.Close() - - geziyor.NewGeziyor(&geziyor.Options{ - StartURLs: []string{ts.URL}, - ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { - if !utf8.Valid(r.Body) { - t.Fatal() - } - }, - CharsetDetectDisabled: false, - }).Start() - - geziyor.NewGeziyor(&geziyor.Options{ - StartURLs: []string{ts.URL}, - ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { - if utf8.Valid(r.Body) { - t.Fatal() - } - }, - CharsetDetectDisabled: true, - }).Start() -} - func TestRedirect(t *testing.T) { defer leaktest.Check(t)() geziyor.NewGeziyor(&geziyor.Options{ diff --git a/go.mod b/go.mod index ebb52bc..ab036d5 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/fortytw2/leaktest v1.3.0 github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 github.com/go-kit/kit v0.8.0 + github.com/musabgultekin/chardet v0.0.0-20190703142329-3f8ab18f5ee7 github.com/pkg/errors v0.8.1 github.com/prometheus/client_golang v1.0.0 github.com/stretchr/testify v1.3.0 diff --git a/go.sum b/go.sum index 8ccae9f..daa19c9 100644 --- a/go.sum +++ b/go.sum @@ -13,6 +13,8 @@ github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54 h1:2NlKweNkC3yy6I github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54/go.mod h1:5NWqr1Ri5aJB5uSvUXfVpbBslleS+eMjspUWv2Lcaow= github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05 h1:5iy45UjpWvkgTcd7GrGQSPr7sifrp9nNweI/eAsMjGE= github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05/go.mod h1:MsTqWB2yT7cErDFnF1F3y0PN8i/a/qQj+0GXKLW/I3s= +github.com/chshawkn-pub/chardet v0.0.0-20160202204651-99815dcde191 h1:3+K6ySWX+ur+IziS7YE1D0Us8HQkHjBoTWzmcnVcws4= +github.com/chshawkn-pub/chardet v0.0.0-20160202204651-99815dcde191/go.mod h1:IKsHWTi5UkZBZJJtaVIk18w/Geisj1vFG2wV7zFRi9I= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -46,6 +48,8 @@ github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0j github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/musabgultekin/chardet v0.0.0-20190703142329-3f8ab18f5ee7 h1:btpAkst4HX1a4UgexN/LASOwvtycli7+TEUZ3ovb9cQ= +github.com/musabgultekin/chardet v0.0.0-20190703142329-3f8ab18f5ee7/go.mod h1:IwGQg7OmA3BFgV3X+Ww2W5JT6kh5Ua4/gRIKZBt7gWs= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= @@ -69,6 +73,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/zhl-home1/chardet v0.0.0-20160202204651-99815dcde191 h1:CXfTd0yQDeEhscRudH7YUSJSu1RkJhRLswIMfiKyZic= +github.com/zhl-home1/chardet v0.0.0-20160202204651-99815dcde191/go.mod h1:pEa4IVfMX0hSsE/jpJ0vKsZFXZjL6oSwtKvRiBoMimg= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=