Charset detection heuristics added with chardet lib.
This commit is contained in:
parent
b355a566cf
commit
33238bc875
@ -1,10 +1,12 @@
|
|||||||
package client
|
package client
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"github.com/chromedp/cdproto/dom"
|
"github.com/chromedp/cdproto/dom"
|
||||||
"github.com/chromedp/cdproto/network"
|
"github.com/chromedp/cdproto/network"
|
||||||
"github.com/chromedp/chromedp"
|
"github.com/chromedp/chromedp"
|
||||||
|
"github.com/musabgultekin/chardet"
|
||||||
"github.com/pkg/errors"
|
"github.com/pkg/errors"
|
||||||
"golang.org/x/net/html/charset"
|
"golang.org/x/net/html/charset"
|
||||||
"io"
|
"io"
|
||||||
@ -26,6 +28,9 @@ type Client struct {
|
|||||||
*http.Client
|
*http.Client
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const DefaultUserAgent = "Geziyor 1.0"
|
||||||
|
const DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
|
||||||
|
|
||||||
// NewClient creates http.Client with modified values for typical web scraper
|
// NewClient creates http.Client with modified values for typical web scraper
|
||||||
func NewClient() *Client {
|
func NewClient() *Client {
|
||||||
client := &http.Client{
|
client := &http.Client{
|
||||||
@ -70,11 +75,11 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
|
|||||||
// Limit response body reading
|
// Limit response body reading
|
||||||
bodyReader := io.LimitReader(resp.Body, maxBodySize)
|
bodyReader := io.LimitReader(resp.Body, maxBodySize)
|
||||||
|
|
||||||
// Start reading body and determine encoding
|
// Convert response if encoding provided
|
||||||
if !charsetDetectDisabled && resp.Request.Method != "HEAD" {
|
if req.Encoding != "" && resp.Request.Method != "HEAD" {
|
||||||
bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
|
bodyReader, err = charset.NewReader(bodyReader, "text/html; charset="+req.Encoding)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, errors.Wrap(err, "Determine encoding error")
|
return nil, errors.Wrap(err, "Reading provided encoding error")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -83,6 +88,27 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
|
|||||||
return nil, errors.Wrap(err, "Reading body error")
|
return nil, errors.Wrap(err, "Reading body error")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Decoding body
|
||||||
|
if req.Encoding == "" && resp.Request.Method != "HEAD" {
|
||||||
|
contentType := resp.Header.Get("Content-Type")
|
||||||
|
// Charset detection
|
||||||
|
// If enabled and charset not provided in content-type
|
||||||
|
if !charsetDetectDisabled && !strings.Contains(contentType, "charset") {
|
||||||
|
if res, err := chardet.NewHtmlDetector().DetectBest(body); err == nil {
|
||||||
|
contentType = "text/html; charset=" + res.Charset
|
||||||
|
}
|
||||||
|
}
|
||||||
|
convertedReader, err := charset.NewReader(bytes.NewReader(body), contentType)
|
||||||
|
if err != nil {
|
||||||
|
return nil, errors.Wrap(err, "Determine encoding error")
|
||||||
|
}
|
||||||
|
convertedBody, err := ioutil.ReadAll(convertedReader)
|
||||||
|
if err != nil {
|
||||||
|
return nil, errors.Wrap(err, "Determine encoding error")
|
||||||
|
}
|
||||||
|
body = convertedBody
|
||||||
|
}
|
||||||
|
|
||||||
response := Response{
|
response := Response{
|
||||||
Response: resp,
|
Response: resp,
|
||||||
Body: body,
|
Body: body,
|
||||||
@ -137,7 +163,8 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
|
|||||||
return nil, errors.Wrap(err, "Request getting rendered error")
|
return nil, errors.Wrap(err, "Request getting rendered error")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set new URL in case of redirection
|
// Update changed data
|
||||||
|
req.Header = ConvertMapToHeader(res.RequestHeaders)
|
||||||
req.URL, _ = url.Parse(res.URL)
|
req.URL, _ = url.Parse(res.URL)
|
||||||
|
|
||||||
response := Response{
|
response := Response{
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
package client
|
package client
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
"reflect"
|
"reflect"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
@ -89,3 +91,49 @@ func TestConvertMapToHeader(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCharsetFromHeaders(t *testing.T) {
|
||||||
|
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "text/plain; charset=iso-8859-9")
|
||||||
|
fmt.Fprint(w, "G\xfcltekin")
|
||||||
|
}))
|
||||||
|
defer ts.Close()
|
||||||
|
|
||||||
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
|
res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false)
|
||||||
|
|
||||||
|
if string(res.Body) != "Gültekin" {
|
||||||
|
t.Fatal(string(res.Body))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCharsetFromBody(t *testing.T) {
|
||||||
|
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "text/plain")
|
||||||
|
fmt.Fprint(w, "G\xfcltekin")
|
||||||
|
}))
|
||||||
|
defer ts.Close()
|
||||||
|
|
||||||
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
|
res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false)
|
||||||
|
|
||||||
|
if string(res.Body) != "Gültekin" {
|
||||||
|
t.Fatal(string(res.Body))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCharsetProvidedWithRequest(t *testing.T) {
|
||||||
|
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "text/plain")
|
||||||
|
fmt.Fprint(w, "G\xfcltekin")
|
||||||
|
}))
|
||||||
|
defer ts.Close()
|
||||||
|
|
||||||
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
|
req.Encoding = "windows-1254"
|
||||||
|
res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false)
|
||||||
|
|
||||||
|
if string(res.Body) != "Gültekin" {
|
||||||
|
t.Fatal(string(res.Body))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -12,6 +12,7 @@ type Request struct {
|
|||||||
Synchronized bool
|
Synchronized bool
|
||||||
Rendered bool
|
Rendered bool
|
||||||
Cancelled bool
|
Cancelled bool
|
||||||
|
Encoding string
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cancel request
|
// Cancel request
|
||||||
|
@ -53,10 +53,10 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if opt.UserAgent == "" {
|
if opt.UserAgent == "" {
|
||||||
geziyor.Opt.UserAgent = "Geziyor 1.0"
|
geziyor.Opt.UserAgent = client.DefaultUserAgent
|
||||||
}
|
}
|
||||||
if opt.MaxBodySize == 0 {
|
if opt.MaxBodySize == 0 {
|
||||||
geziyor.Opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB
|
geziyor.Opt.MaxBodySize = client.DefaultMaxBody
|
||||||
}
|
}
|
||||||
if opt.Cache != nil {
|
if opt.Cache != nil {
|
||||||
geziyor.Client.Transport = &httpcache.Transport{
|
geziyor.Client.Transport = &httpcache.Transport{
|
||||||
|
@ -13,7 +13,6 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"testing"
|
"testing"
|
||||||
"unicode/utf8"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestSimple(t *testing.T) {
|
func TestSimple(t *testing.T) {
|
||||||
@ -175,33 +174,6 @@ func TestExtractor(t *testing.T) {
|
|||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCharsetDetection(t *testing.T) {
|
|
||||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
||||||
fmt.Fprint(w, "\xf0ültekin")
|
|
||||||
}))
|
|
||||||
defer ts.Close()
|
|
||||||
|
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
|
||||||
StartURLs: []string{ts.URL},
|
|
||||||
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
|
||||||
if !utf8.Valid(r.Body) {
|
|
||||||
t.Fatal()
|
|
||||||
}
|
|
||||||
},
|
|
||||||
CharsetDetectDisabled: false,
|
|
||||||
}).Start()
|
|
||||||
|
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
|
||||||
StartURLs: []string{ts.URL},
|
|
||||||
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
|
||||||
if utf8.Valid(r.Body) {
|
|
||||||
t.Fatal()
|
|
||||||
}
|
|
||||||
},
|
|
||||||
CharsetDetectDisabled: true,
|
|
||||||
}).Start()
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestRedirect(t *testing.T) {
|
func TestRedirect(t *testing.T) {
|
||||||
defer leaktest.Check(t)()
|
defer leaktest.Check(t)()
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
|
1
go.mod
1
go.mod
@ -10,6 +10,7 @@ require (
|
|||||||
github.com/fortytw2/leaktest v1.3.0
|
github.com/fortytw2/leaktest v1.3.0
|
||||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
||||||
github.com/go-kit/kit v0.8.0
|
github.com/go-kit/kit v0.8.0
|
||||||
|
github.com/musabgultekin/chardet v0.0.0-20190703142329-3f8ab18f5ee7
|
||||||
github.com/pkg/errors v0.8.1
|
github.com/pkg/errors v0.8.1
|
||||||
github.com/prometheus/client_golang v1.0.0
|
github.com/prometheus/client_golang v1.0.0
|
||||||
github.com/stretchr/testify v1.3.0
|
github.com/stretchr/testify v1.3.0
|
||||||
|
6
go.sum
6
go.sum
@ -13,6 +13,8 @@ github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54 h1:2NlKweNkC3yy6I
|
|||||||
github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54/go.mod h1:5NWqr1Ri5aJB5uSvUXfVpbBslleS+eMjspUWv2Lcaow=
|
github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54/go.mod h1:5NWqr1Ri5aJB5uSvUXfVpbBslleS+eMjspUWv2Lcaow=
|
||||||
github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05 h1:5iy45UjpWvkgTcd7GrGQSPr7sifrp9nNweI/eAsMjGE=
|
github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05 h1:5iy45UjpWvkgTcd7GrGQSPr7sifrp9nNweI/eAsMjGE=
|
||||||
github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05/go.mod h1:MsTqWB2yT7cErDFnF1F3y0PN8i/a/qQj+0GXKLW/I3s=
|
github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05/go.mod h1:MsTqWB2yT7cErDFnF1F3y0PN8i/a/qQj+0GXKLW/I3s=
|
||||||
|
github.com/chshawkn-pub/chardet v0.0.0-20160202204651-99815dcde191 h1:3+K6ySWX+ur+IziS7YE1D0Us8HQkHjBoTWzmcnVcws4=
|
||||||
|
github.com/chshawkn-pub/chardet v0.0.0-20160202204651-99815dcde191/go.mod h1:IKsHWTi5UkZBZJJtaVIk18w/Geisj1vFG2wV7zFRi9I=
|
||||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
@ -46,6 +48,8 @@ github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0j
|
|||||||
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
|
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
|
||||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||||
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
|
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
|
||||||
|
github.com/musabgultekin/chardet v0.0.0-20190703142329-3f8ab18f5ee7 h1:btpAkst4HX1a4UgexN/LASOwvtycli7+TEUZ3ovb9cQ=
|
||||||
|
github.com/musabgultekin/chardet v0.0.0-20190703142329-3f8ab18f5ee7/go.mod h1:IwGQg7OmA3BFgV3X+Ww2W5JT6kh5Ua4/gRIKZBt7gWs=
|
||||||
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
|
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
|
||||||
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||||
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
|
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
|
||||||
@ -69,6 +73,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
|
|||||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
|
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
|
||||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||||
|
github.com/zhl-home1/chardet v0.0.0-20160202204651-99815dcde191 h1:CXfTd0yQDeEhscRudH7YUSJSu1RkJhRLswIMfiKyZic=
|
||||||
|
github.com/zhl-home1/chardet v0.0.0-20160202204651-99815dcde191/go.mod h1:pEa4IVfMX0hSsE/jpJ0vKsZFXZjL6oSwtKvRiBoMimg=
|
||||||
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
||||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||||
|
Loading…
x
Reference in New Issue
Block a user