diff --git a/client/client.go b/client/client.go index ac2d91d..b56b8b9 100644 --- a/client/client.go +++ b/client/client.go @@ -27,13 +27,19 @@ var ( // Client is a small wrapper around *http.Client to provide new methods. type Client struct { *http.Client - maxBodySize int64 - charsetDetectDisabled bool - retryTimes int - retryHTTPCodes []int - remoteAllocatorURL string + opt *Options } +// Options holds custom options for the HTTP client +type Options struct { + MaxBodySize int64 + CharsetDetectDisabled bool + RetryTimes int + RetryHTTPCodes []int + RemoteAllocatorURL string +} + +// Default values for client const ( DefaultUserAgent = "Geziyor 1.0" DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB @@ -45,7 +51,7 @@ var ( ) // NewClient creates http.Client with modified values for typical web scraper -func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int, remoteAllocatorURL string) *Client { +func NewClient(opt *Options) *Client { httpClient := &http.Client{ Transport: &http.Transport{ Proxy: http.ProxyFromEnvironment, @@ -64,17 +70,22 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re } client := Client{ - Client: httpClient, - maxBodySize: maxBodySize, - charsetDetectDisabled: charsetDetectDisabled, - retryTimes: retryTimes, - retryHTTPCodes: retryHTTPCodes, - remoteAllocatorURL: remoteAllocatorURL, + Client: httpClient, + opt: opt, } return &client } +// newClientDefault creates a new client with default options +func newClientDefault() *Client { + return NewClient(&Options{ + MaxBodySize: DefaultMaxBody, + RetryTimes: DefaultRetryTimes, + RetryHTTPCodes: DefaultRetryHTTPCodes, + }) +} + // DoRequest selects appropriate request handler, client or Chrome func (c *Client) DoRequest(req *Request) (resp *Response, err error) { if req.Rendered { @@ -85,7 +96,7 @@ func (c *Client) DoRequest(req *Request) (resp *Response, err error) { // Retry on Error if err != nil { - if req.retryCounter < 
c.retryTimes { + if req.retryCounter < c.opt.RetryTimes { req.retryCounter++ log.Println("Retrying:", req.URL.String()) return c.DoRequest(req) @@ -94,8 +105,8 @@ func (c *Client) DoRequest(req *Request) (resp *Response, err error) { } // Retry on http status codes - for _, statusCode := range c.retryHTTPCodes { - if req.retryCounter < c.retryTimes { + for _, statusCode := range c.opt.RetryHTTPCodes { + if req.retryCounter < c.opt.RetryTimes { if resp.StatusCode == statusCode { req.retryCounter++ log.Println("Retrying:", req.URL.String(), resp.StatusCode) @@ -121,7 +132,7 @@ func (c *Client) DoRequestClient(req *Request) (*Response, error) { } // Limit response body reading - bodyReader := io.LimitReader(resp.Body, c.maxBodySize) + bodyReader := io.LimitReader(resp.Body, c.opt.MaxBodySize) // Decode response if resp.Request.Method != "HEAD" && resp.ContentLength > 0 { @@ -130,7 +141,7 @@ func (c *Client) DoRequestClient(req *Request) (*Response, error) { bodyReader = transform.NewReader(bodyReader, enc.NewDecoder()) } } else { - if !c.charsetDetectDisabled { + if !c.opt.CharsetDetectDisabled { bodyReader, err = charset.NewReader(bodyReader, req.Header.Get("Content-Type")) if err != nil { return nil, errors.Wrap(err, "Reading determined encoding error") @@ -159,8 +170,8 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) { var res *network.Response ctx := context.Background() - if c.remoteAllocatorURL != "" { - ctx, _ = chromedp.NewRemoteAllocator(ctx, c.remoteAllocatorURL) + if c.opt.RemoteAllocatorURL != "" { + ctx, _ = chromedp.NewRemoteAllocator(ctx, c.opt.RemoteAllocatorURL) } ctx, cancel := chromedp.NewContext(ctx) defer cancel() diff --git a/client/client_test.go b/client/client_test.go index 0c1fb40..7c2d4ba 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -101,7 +101,7 @@ func TestCharsetFromHeaders(t *testing.T) { defer ts.Close() req, _ := NewRequest("GET", ts.URL, nil) - res, _ := NewClient(DefaultMaxBody, false, 
DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) + res, _ := newClientDefault().DoRequest(req) if string(res.Body) != "Gültekin" { t.Fatal(string(res.Body)) @@ -116,7 +116,7 @@ func TestCharsetFromBody(t *testing.T) { defer ts.Close() req, _ := NewRequest("GET", ts.URL, nil) - res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) + res, _ := newClientDefault().DoRequest(req) if string(res.Body) != "Gültekin" { t.Fatal(string(res.Body)) @@ -132,7 +132,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) { req, _ := NewRequest("GET", ts.URL, nil) req.Encoding = "windows-1254" - res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) + res, _ := newClientDefault().DoRequest(req) if string(res.Body) != "Gültekin" { t.Fatal(string(res.Body)) @@ -141,7 +141,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) { func TestRetry(t *testing.T) { req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil) - res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) + res, err := newClientDefault().DoRequest(req) assert.Nil(t, res) assert.Error(t, err) } diff --git a/geziyor.go b/geziyor.go index ed2a540..d19768c 100644 --- a/geziyor.go +++ b/geziyor.go @@ -36,6 +36,21 @@ type Geziyor struct { // NewGeziyor creates new Geziyor with default values. 
// If options provided, options func NewGeziyor(opt *Options) *Geziyor { + + // Default Options + if opt.UserAgent == "" { + opt.UserAgent = client.DefaultUserAgent + } + if opt.MaxBodySize == 0 { + opt.MaxBodySize = client.DefaultMaxBody + } + if opt.RetryTimes == 0 { + opt.RetryTimes = client.DefaultRetryTimes + } + if len(opt.RetryHTTPCodes) == 0 { + opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes + } + geziyor := &Geziyor{ Opt: opt, Exports: make(chan interface{}, 1), @@ -52,22 +67,14 @@ func NewGeziyor(opt *Options) *Geziyor { metrics: metrics.NewMetrics(opt.MetricsType), } - // Default - if opt.UserAgent == "" { - opt.UserAgent = client.DefaultUserAgent - } - if opt.MaxBodySize == 0 { - opt.MaxBodySize = client.DefaultMaxBody - } - if opt.RetryTimes == 0 { - opt.RetryTimes = client.DefaultRetryTimes - } - if len(opt.RetryHTTPCodes) == 0 { - opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes - } - // Client - geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes, opt.BrowserEndpoint) + geziyor.Client = client.NewClient(&client.Options{ + MaxBodySize: opt.MaxBodySize, + CharsetDetectDisabled: opt.CharsetDetectDisabled, + RetryTimes: opt.RetryTimes, + RetryHTTPCodes: opt.RetryHTTPCodes, + RemoteAllocatorURL: opt.BrowserEndpoint, + }) if opt.Cache != nil { geziyor.Client.Transport = &cache.Transport{ Policy: opt.CachePolicy, diff --git a/geziyor_test.go b/geziyor_test.go index d2d7a62..c0a5046 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -220,7 +220,11 @@ func BenchmarkRequests(b *testing.B) { ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { fmt.Fprint(w, "Hello, client") })) - ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes, "").Transport + ts.Client().Transport = client.NewClient(&client.Options{ + MaxBodySize: client.DefaultMaxBody, + RetryTimes: client.DefaultRetryTimes, + 
RetryHTTPCodes: client.DefaultRetryHTTPCodes, + }).Transport defer ts.Close() // As we don't benchmark creating a server, reset timer.