Refactored client options

Fixed default User-Agent string not being set.
This commit is contained in:
Musab Gültekin 2019-08-05 15:42:30 +03:00
parent 0e5230eac8
commit 85597219e6
4 changed files with 61 additions and 39 deletions

View File

@ -27,13 +27,19 @@ var (
// Client is a small wrapper around *http.Client to provide new methods. // Client is a small wrapper around *http.Client to provide new methods.
type Client struct { type Client struct {
*http.Client *http.Client
maxBodySize int64 opt *Options
charsetDetectDisabled bool
retryTimes int
retryHTTPCodes []int
remoteAllocatorURL string
} }
// Options holds the settings a Client is built with. NewClient stores the
// pointer it is given on the returned Client and reads these fields while
// performing requests.
type Options struct {
// MaxBodySize is the byte limit handed to io.LimitReader when a response
// body is read in DoRequestClient.
MaxBodySize int64
// CharsetDetectDisabled, when true, skips the charset.NewReader step in
// DoRequestClient.
CharsetDetectDisabled bool
// RetryTimes bounds retries: DoRequest retries a request only while the
// request's retry counter is below this value.
RetryTimes int
// RetryHTTPCodes lists the response status codes that make DoRequest retry
// (still subject to RetryTimes).
RetryHTTPCodes []int
// RemoteAllocatorURL, when non-empty, is passed to
// chromedp.NewRemoteAllocator in DoRequestChrome.
RemoteAllocatorURL string
}
// Default values for client
const ( const (
DefaultUserAgent = "Geziyor 1.0" DefaultUserAgent = "Geziyor 1.0"
DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
@ -45,7 +51,7 @@ var (
) )
// NewClient creates http.Client with modified values for typical web scraper // NewClient creates http.Client with modified values for typical web scraper
func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int, remoteAllocatorURL string) *Client { func NewClient(opt *Options) *Client {
httpClient := &http.Client{ httpClient := &http.Client{
Transport: &http.Transport{ Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment, Proxy: http.ProxyFromEnvironment,
@ -64,17 +70,22 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re
} }
client := Client{ client := Client{
Client: httpClient, Client: httpClient,
maxBodySize: maxBodySize, opt: opt,
charsetDetectDisabled: charsetDetectDisabled,
retryTimes: retryTimes,
retryHTTPCodes: retryHTTPCodes,
remoteAllocatorURL: remoteAllocatorURL,
} }
return &client return &client
} }
// newClientDefault returns a Client built by NewClient from the package
// defaults: DefaultMaxBody, DefaultRetryTimes and DefaultRetryHTTPCodes.
// The remaining Options fields are left at their zero values.
func newClientDefault() *Client {
	defaults := Options{
		MaxBodySize:    DefaultMaxBody,
		RetryTimes:     DefaultRetryTimes,
		RetryHTTPCodes: DefaultRetryHTTPCodes,
	}
	return NewClient(&defaults)
}
// DoRequest selects appropriate request handler, client or Chrome // DoRequest selects appropriate request handler, client or Chrome
func (c *Client) DoRequest(req *Request) (resp *Response, err error) { func (c *Client) DoRequest(req *Request) (resp *Response, err error) {
if req.Rendered { if req.Rendered {
@ -85,7 +96,7 @@ func (c *Client) DoRequest(req *Request) (resp *Response, err error) {
// Retry on Error // Retry on Error
if err != nil { if err != nil {
if req.retryCounter < c.retryTimes { if req.retryCounter < c.opt.RetryTimes {
req.retryCounter++ req.retryCounter++
log.Println("Retrying:", req.URL.String()) log.Println("Retrying:", req.URL.String())
return c.DoRequest(req) return c.DoRequest(req)
@ -94,8 +105,8 @@ func (c *Client) DoRequest(req *Request) (resp *Response, err error) {
} }
// Retry on http status codes // Retry on http status codes
for _, statusCode := range c.retryHTTPCodes { for _, statusCode := range c.opt.RetryHTTPCodes {
if req.retryCounter < c.retryTimes { if req.retryCounter < c.opt.RetryTimes {
if resp.StatusCode == statusCode { if resp.StatusCode == statusCode {
req.retryCounter++ req.retryCounter++
log.Println("Retrying:", req.URL.String(), resp.StatusCode) log.Println("Retrying:", req.URL.String(), resp.StatusCode)
@ -121,7 +132,7 @@ func (c *Client) DoRequestClient(req *Request) (*Response, error) {
} }
// Limit response body reading // Limit response body reading
bodyReader := io.LimitReader(resp.Body, c.maxBodySize) bodyReader := io.LimitReader(resp.Body, c.opt.MaxBodySize)
// Decode response // Decode response
if resp.Request.Method != "HEAD" && resp.ContentLength > 0 { if resp.Request.Method != "HEAD" && resp.ContentLength > 0 {
@ -130,7 +141,7 @@ func (c *Client) DoRequestClient(req *Request) (*Response, error) {
bodyReader = transform.NewReader(bodyReader, enc.NewDecoder()) bodyReader = transform.NewReader(bodyReader, enc.NewDecoder())
} }
} else { } else {
if !c.charsetDetectDisabled { if !c.opt.CharsetDetectDisabled {
bodyReader, err = charset.NewReader(bodyReader, req.Header.Get("Content-Type")) bodyReader, err = charset.NewReader(bodyReader, req.Header.Get("Content-Type"))
if err != nil { if err != nil {
return nil, errors.Wrap(err, "Reading determined encoding error") return nil, errors.Wrap(err, "Reading determined encoding error")
@ -159,8 +170,8 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
var res *network.Response var res *network.Response
ctx := context.Background() ctx := context.Background()
if c.remoteAllocatorURL != "" { if c.opt.RemoteAllocatorURL != "" {
ctx, _ = chromedp.NewRemoteAllocator(ctx, c.remoteAllocatorURL) ctx, _ = chromedp.NewRemoteAllocator(ctx, c.opt.RemoteAllocatorURL)
} }
ctx, cancel := chromedp.NewContext(ctx) ctx, cancel := chromedp.NewContext(ctx)
defer cancel() defer cancel()

View File

@ -101,7 +101,7 @@ func TestCharsetFromHeaders(t *testing.T) {
defer ts.Close() defer ts.Close()
req, _ := NewRequest("GET", ts.URL, nil) req, _ := NewRequest("GET", ts.URL, nil)
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) res, _ := newClientDefault().DoRequest(req)
if string(res.Body) != "Gültekin" { if string(res.Body) != "Gültekin" {
t.Fatal(string(res.Body)) t.Fatal(string(res.Body))
@ -116,7 +116,7 @@ func TestCharsetFromBody(t *testing.T) {
defer ts.Close() defer ts.Close()
req, _ := NewRequest("GET", ts.URL, nil) req, _ := NewRequest("GET", ts.URL, nil)
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) res, _ := newClientDefault().DoRequest(req)
if string(res.Body) != "Gültekin" { if string(res.Body) != "Gültekin" {
t.Fatal(string(res.Body)) t.Fatal(string(res.Body))
@ -132,7 +132,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) {
req, _ := NewRequest("GET", ts.URL, nil) req, _ := NewRequest("GET", ts.URL, nil)
req.Encoding = "windows-1254" req.Encoding = "windows-1254"
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) res, _ := newClientDefault().DoRequest(req)
if string(res.Body) != "Gültekin" { if string(res.Body) != "Gültekin" {
t.Fatal(string(res.Body)) t.Fatal(string(res.Body))
@ -141,7 +141,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) {
func TestRetry(t *testing.T) { func TestRetry(t *testing.T) {
req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil) req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil)
res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) res, err := newClientDefault().DoRequest(req)
assert.Nil(t, res) assert.Nil(t, res)
assert.Error(t, err) assert.Error(t, err)
} }

View File

@ -36,6 +36,21 @@ type Geziyor struct {
// NewGeziyor creates new Geziyor with default values. // NewGeziyor creates new Geziyor with default values.
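// If options are provided, unset UserAgent, MaxBodySize, RetryTimes and RetryHTTPCodes fall back to the client defaults. // If options are provided, unset UserAgent, MaxBodySize, RetryTimes and RetryHTTPCodes fall back to the client defaults.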
func NewGeziyor(opt *Options) *Geziyor { func NewGeziyor(opt *Options) *Geziyor {
// Default Options
if opt.UserAgent == "" {
opt.UserAgent = client.DefaultUserAgent
}
if opt.MaxBodySize == 0 {
opt.MaxBodySize = client.DefaultMaxBody
}
if opt.RetryTimes == 0 {
opt.RetryTimes = client.DefaultRetryTimes
}
if len(opt.RetryHTTPCodes) == 0 {
opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes
}
geziyor := &Geziyor{ geziyor := &Geziyor{
Opt: opt, Opt: opt,
Exports: make(chan interface{}, 1), Exports: make(chan interface{}, 1),
@ -52,22 +67,14 @@ func NewGeziyor(opt *Options) *Geziyor {
metrics: metrics.NewMetrics(opt.MetricsType), metrics: metrics.NewMetrics(opt.MetricsType),
} }
// Default
if opt.UserAgent == "" {
opt.UserAgent = client.DefaultUserAgent
}
if opt.MaxBodySize == 0 {
opt.MaxBodySize = client.DefaultMaxBody
}
if opt.RetryTimes == 0 {
opt.RetryTimes = client.DefaultRetryTimes
}
if len(opt.RetryHTTPCodes) == 0 {
opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes
}
// Client // Client
geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes, opt.BrowserEndpoint) geziyor.Client = client.NewClient(&client.Options{
MaxBodySize: opt.MaxBodySize,
CharsetDetectDisabled: opt.CharsetDetectDisabled,
RetryTimes: opt.RetryTimes,
RetryHTTPCodes: opt.RetryHTTPCodes,
RemoteAllocatorURL: opt.BrowserEndpoint,
})
if opt.Cache != nil { if opt.Cache != nil {
geziyor.Client.Transport = &cache.Transport{ geziyor.Client.Transport = &cache.Transport{
Policy: opt.CachePolicy, Policy: opt.CachePolicy,

View File

@ -220,7 +220,11 @@ func BenchmarkRequests(b *testing.B) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, "Hello, client") fmt.Fprint(w, "Hello, client")
})) }))
ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes, "").Transport ts.Client().Transport = client.NewClient(&client.Options{
MaxBodySize: client.DefaultMaxBody,
RetryTimes: client.DefaultRetryTimes,
RetryHTTPCodes: client.DefaultRetryHTTPCodes,
}).Transport
defer ts.Close() defer ts.Close()
// As we don't benchmark creating a server, reset timer. // As we don't benchmark creating a server, reset timer.