Charset detection heuristics added with chardet lib.
This commit is contained in:
@ -1,10 +1,12 @@
|
||||
package client
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"github.com/chromedp/cdproto/dom"
|
||||
"github.com/chromedp/cdproto/network"
|
||||
"github.com/chromedp/chromedp"
|
||||
"github.com/musabgultekin/chardet"
|
||||
"github.com/pkg/errors"
|
||||
"golang.org/x/net/html/charset"
|
||||
"io"
|
||||
@ -26,6 +28,9 @@ type Client struct {
|
||||
*http.Client
|
||||
}
|
||||
|
||||
// DefaultUserAgent is the user agent string declared for this client package.
// NOTE(review): presumably sent as the User-Agent header when a request does
// not set one — confirm against the request construction code.
const DefaultUserAgent = "Geziyor 1.0"
|
||||
// DefaultMaxBody is a response body size limit in bytes (1024^3); the tests
// pass it as the maxBodySize argument of DoRequestClient, which uses that
// argument to cap body reads via io.LimitReader.
const DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
|
||||
|
||||
// NewClient creates http.Client with modified values for typical web scraper
|
||||
func NewClient() *Client {
|
||||
client := &http.Client{
|
||||
@ -70,11 +75,11 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
|
||||
// Limit response body reading
|
||||
bodyReader := io.LimitReader(resp.Body, maxBodySize)
|
||||
|
||||
// Start reading body and determine encoding
|
||||
if !charsetDetectDisabled && resp.Request.Method != "HEAD" {
|
||||
bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
|
||||
// Convert response if encoding provided
|
||||
if req.Encoding != "" && resp.Request.Method != "HEAD" {
|
||||
bodyReader, err = charset.NewReader(bodyReader, "text/html; charset="+req.Encoding)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Determine encoding error")
|
||||
return nil, errors.Wrap(err, "Reading provided encoding error")
|
||||
}
|
||||
}
|
||||
|
||||
@ -83,6 +88,27 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
|
||||
return nil, errors.Wrap(err, "Reading body error")
|
||||
}
|
||||
|
||||
// Decoding body
|
||||
if req.Encoding == "" && resp.Request.Method != "HEAD" {
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
// Charset detection
|
||||
// If enabled and charset not provided in content-type
|
||||
if !charsetDetectDisabled && !strings.Contains(contentType, "charset") {
|
||||
if res, err := chardet.NewHtmlDetector().DetectBest(body); err == nil {
|
||||
contentType = "text/html; charset=" + res.Charset
|
||||
}
|
||||
}
|
||||
convertedReader, err := charset.NewReader(bytes.NewReader(body), contentType)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Determine encoding error")
|
||||
}
|
||||
convertedBody, err := ioutil.ReadAll(convertedReader)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Determine encoding error")
|
||||
}
|
||||
body = convertedBody
|
||||
}
|
||||
|
||||
response := Response{
|
||||
Response: resp,
|
||||
Body: body,
|
||||
@ -137,7 +163,8 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
|
||||
return nil, errors.Wrap(err, "Request getting rendered error")
|
||||
}
|
||||
|
||||
// Set new URL in case of redirection
|
||||
// Update changed data
|
||||
req.Header = ConvertMapToHeader(res.RequestHeaders)
|
||||
req.URL, _ = url.Parse(res.URL)
|
||||
|
||||
response := Response{
|
||||
|
@ -1,7 +1,9 @@
|
||||
package client
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
@ -89,3 +91,49 @@ func TestConvertMapToHeader(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetFromHeaders(t *testing.T) {
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/plain; charset=iso-8859-9")
|
||||
fmt.Fprint(w, "G\xfcltekin")
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
req, _ := NewRequest("GET", ts.URL, nil)
|
||||
res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false)
|
||||
|
||||
if string(res.Body) != "Gültekin" {
|
||||
t.Fatal(string(res.Body))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetFromBody(t *testing.T) {
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
fmt.Fprint(w, "G\xfcltekin")
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
req, _ := NewRequest("GET", ts.URL, nil)
|
||||
res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false)
|
||||
|
||||
if string(res.Body) != "Gültekin" {
|
||||
t.Fatal(string(res.Body))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCharsetProvidedWithRequest(t *testing.T) {
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
fmt.Fprint(w, "G\xfcltekin")
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
req, _ := NewRequest("GET", ts.URL, nil)
|
||||
req.Encoding = "windows-1254"
|
||||
res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false)
|
||||
|
||||
if string(res.Body) != "Gültekin" {
|
||||
t.Fatal(string(res.Body))
|
||||
}
|
||||
}
|
||||
|
@ -12,6 +12,7 @@ type Request struct {
|
||||
Synchronized bool
|
||||
Rendered bool
|
||||
Cancelled bool
|
||||
Encoding string
|
||||
}
|
||||
|
||||
// Cancel request
|
||||
|
Reference in New Issue
Block a user