Charset detection heuristics added with chardet lib.

This commit is contained in:
Musab Gültekin
2019-07-03 18:08:28 +03:00
parent b355a566cf
commit 33238bc875
7 changed files with 90 additions and 35 deletions

View File

@ -1,10 +1,12 @@
package client
import (
"bytes"
"context"
"github.com/chromedp/cdproto/dom"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
"github.com/musabgultekin/chardet"
"github.com/pkg/errors"
"golang.org/x/net/html/charset"
"io"
@ -26,6 +28,9 @@ type Client struct {
*http.Client
}
// Package-level defaults for the scraping client.
const (
	// DefaultUserAgent is the default user agent string.
	DefaultUserAgent = "Geziyor 1.0"

	// DefaultMaxBody is the default response body size limit in bytes
	// (1 GiB). It is the value passed as maxBodySize to DoRequestClient.
	DefaultMaxBody int64 = 1 << 30
)
// NewClient creates http.Client with modified values for typical web scraper
func NewClient() *Client {
client := &http.Client{
@ -70,11 +75,11 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
// Limit response body reading
bodyReader := io.LimitReader(resp.Body, maxBodySize)
// Start reading body and determine encoding
if !charsetDetectDisabled && resp.Request.Method != "HEAD" {
bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
// Convert response if encoding provided
if req.Encoding != "" && resp.Request.Method != "HEAD" {
bodyReader, err = charset.NewReader(bodyReader, "text/html; charset="+req.Encoding)
if err != nil {
return nil, errors.Wrap(err, "Determine encoding error")
return nil, errors.Wrap(err, "Reading provided encoding error")
}
}
@ -83,6 +88,27 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
return nil, errors.Wrap(err, "Reading body error")
}
// Decoding body
if req.Encoding == "" && resp.Request.Method != "HEAD" {
contentType := resp.Header.Get("Content-Type")
// Charset detection
// If enabled and charset not provided in content-type
if !charsetDetectDisabled && !strings.Contains(contentType, "charset") {
if res, err := chardet.NewHtmlDetector().DetectBest(body); err == nil {
contentType = "text/html; charset=" + res.Charset
}
}
convertedReader, err := charset.NewReader(bytes.NewReader(body), contentType)
if err != nil {
return nil, errors.Wrap(err, "Determine encoding error")
}
convertedBody, err := ioutil.ReadAll(convertedReader)
if err != nil {
return nil, errors.Wrap(err, "Determine encoding error")
}
body = convertedBody
}
response := Response{
Response: resp,
Body: body,
@ -137,7 +163,8 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
return nil, errors.Wrap(err, "Request getting rendered error")
}
// Set new URL in case of redirection
// Update changed data
req.Header = ConvertMapToHeader(res.RequestHeaders)
req.URL, _ = url.Parse(res.URL)
response := Response{

View File

@ -1,7 +1,9 @@
package client
import (
"fmt"
"net/http"
"net/http/httptest"
"reflect"
"testing"
)
@ -89,3 +91,49 @@ func TestConvertMapToHeader(t *testing.T) {
})
}
}
// TestCharsetFromHeaders checks that a body served with an explicit
// "charset=iso-8859-9" Content-Type is converted to UTF-8: the single byte
// 0xFC sent by the server must come back as "ü".
func TestCharsetFromHeaders(t *testing.T) {
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/plain; charset=iso-8859-9")
		fmt.Fprint(w, "G\xfcltekin")
	}))
	defer ts.Close()

	req, err := NewRequest("GET", ts.URL, nil)
	if err != nil {
		t.Fatalf("creating request: %v", err)
	}

	// Fail with a clear message instead of dereferencing a nil response
	// when the request itself errors out.
	res, err := NewClient().DoRequestClient(req, DefaultMaxBody, false)
	if err != nil {
		t.Fatalf("doing request: %v", err)
	}

	const want = "Gültekin"
	if got := string(res.Body); got != want {
		t.Fatalf("body = %q, want %q", got, want)
	}
}
// TestCharsetFromBody checks that a body served without any charset in the
// Content-Type header is still converted to UTF-8. The request is made with
// the charset-detection-disabled argument set to false, so the encoding has
// to be inferred from the body bytes themselves.
func TestCharsetFromBody(t *testing.T) {
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/plain")
		fmt.Fprint(w, "G\xfcltekin")
	}))
	defer ts.Close()

	req, err := NewRequest("GET", ts.URL, nil)
	if err != nil {
		t.Fatalf("creating request: %v", err)
	}

	// Fail with a clear message instead of dereferencing a nil response
	// when the request itself errors out.
	res, err := NewClient().DoRequestClient(req, DefaultMaxBody, false)
	if err != nil {
		t.Fatalf("doing request: %v", err)
	}

	const want = "Gültekin"
	if got := string(res.Body); got != want {
		t.Fatalf("body = %q, want %q", got, want)
	}
}
// TestCharsetProvidedWithRequest checks that an encoding set explicitly on
// the request (req.Encoding = "windows-1254") is used to convert the body to
// UTF-8 when the server's Content-Type header carries no charset.
func TestCharsetProvidedWithRequest(t *testing.T) {
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/plain")
		fmt.Fprint(w, "G\xfcltekin")
	}))
	defer ts.Close()

	req, err := NewRequest("GET", ts.URL, nil)
	if err != nil {
		t.Fatalf("creating request: %v", err)
	}
	req.Encoding = "windows-1254"

	// Fail with a clear message instead of dereferencing a nil response
	// when the request itself errors out.
	res, err := NewClient().DoRequestClient(req, DefaultMaxBody, false)
	if err != nil {
		t.Fatalf("doing request: %v", err)
	}

	const want = "Gültekin"
	if got := string(res.Body); got != want {
		t.Fatalf("body = %q, want %q", got, want)
	}
}

View File

@ -12,6 +12,7 @@ type Request struct {
Synchronized bool
Rendered bool
Cancelled bool
Encoding string
}
// Cancel request