Refactored client options
Fixed default User-Agent string not being set.
This commit is contained in:
parent
0e5230eac8
commit
85597219e6
@ -27,13 +27,19 @@ var (
|
|||||||
// Client is a small wrapper around *http.Client to provide new methods.
|
// Client is a small wrapper around *http.Client to provide new methods.
|
||||||
type Client struct {
|
type Client struct {
|
||||||
*http.Client
|
*http.Client
|
||||||
maxBodySize int64
|
opt *Options
|
||||||
charsetDetectDisabled bool
|
|
||||||
retryTimes int
|
|
||||||
retryHTTPCodes []int
|
|
||||||
remoteAllocatorURL string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Options holds the custom options for the HTTP client.
|
||||||
|
type Options struct {
|
||||||
|
MaxBodySize int64
|
||||||
|
CharsetDetectDisabled bool
|
||||||
|
RetryTimes int
|
||||||
|
RetryHTTPCodes []int
|
||||||
|
RemoteAllocatorURL string
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default values for client
|
||||||
const (
|
const (
|
||||||
DefaultUserAgent = "Geziyor 1.0"
|
DefaultUserAgent = "Geziyor 1.0"
|
||||||
DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
|
DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
|
||||||
@ -45,7 +51,7 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// NewClient creates http.Client with modified values for typical web scraper
|
// NewClient creates http.Client with modified values for typical web scraper
|
||||||
func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int, remoteAllocatorURL string) *Client {
|
func NewClient(opt *Options) *Client {
|
||||||
httpClient := &http.Client{
|
httpClient := &http.Client{
|
||||||
Transport: &http.Transport{
|
Transport: &http.Transport{
|
||||||
Proxy: http.ProxyFromEnvironment,
|
Proxy: http.ProxyFromEnvironment,
|
||||||
@ -64,17 +70,22 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re
|
|||||||
}
|
}
|
||||||
|
|
||||||
client := Client{
|
client := Client{
|
||||||
Client: httpClient,
|
Client: httpClient,
|
||||||
maxBodySize: maxBodySize,
|
opt: opt,
|
||||||
charsetDetectDisabled: charsetDetectDisabled,
|
|
||||||
retryTimes: retryTimes,
|
|
||||||
retryHTTPCodes: retryHTTPCodes,
|
|
||||||
remoteAllocatorURL: remoteAllocatorURL,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &client
|
return &client
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// newClientDefault creates new client with default options
|
||||||
|
func newClientDefault() *Client {
|
||||||
|
return NewClient(&Options{
|
||||||
|
MaxBodySize: DefaultMaxBody,
|
||||||
|
RetryTimes: DefaultRetryTimes,
|
||||||
|
RetryHTTPCodes: DefaultRetryHTTPCodes,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
// DoRequest selects appropriate request handler, client or Chrome
|
// DoRequest selects appropriate request handler, client or Chrome
|
||||||
func (c *Client) DoRequest(req *Request) (resp *Response, err error) {
|
func (c *Client) DoRequest(req *Request) (resp *Response, err error) {
|
||||||
if req.Rendered {
|
if req.Rendered {
|
||||||
@ -85,7 +96,7 @@ func (c *Client) DoRequest(req *Request) (resp *Response, err error) {
|
|||||||
|
|
||||||
// Retry on Error
|
// Retry on Error
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if req.retryCounter < c.retryTimes {
|
if req.retryCounter < c.opt.RetryTimes {
|
||||||
req.retryCounter++
|
req.retryCounter++
|
||||||
log.Println("Retrying:", req.URL.String())
|
log.Println("Retrying:", req.URL.String())
|
||||||
return c.DoRequest(req)
|
return c.DoRequest(req)
|
||||||
@ -94,8 +105,8 @@ func (c *Client) DoRequest(req *Request) (resp *Response, err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Retry on http status codes
|
// Retry on http status codes
|
||||||
for _, statusCode := range c.retryHTTPCodes {
|
for _, statusCode := range c.opt.RetryHTTPCodes {
|
||||||
if req.retryCounter < c.retryTimes {
|
if req.retryCounter < c.opt.RetryTimes {
|
||||||
if resp.StatusCode == statusCode {
|
if resp.StatusCode == statusCode {
|
||||||
req.retryCounter++
|
req.retryCounter++
|
||||||
log.Println("Retrying:", req.URL.String(), resp.StatusCode)
|
log.Println("Retrying:", req.URL.String(), resp.StatusCode)
|
||||||
@ -121,7 +132,7 @@ func (c *Client) DoRequestClient(req *Request) (*Response, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Limit response body reading
|
// Limit response body reading
|
||||||
bodyReader := io.LimitReader(resp.Body, c.maxBodySize)
|
bodyReader := io.LimitReader(resp.Body, c.opt.MaxBodySize)
|
||||||
|
|
||||||
// Decode response
|
// Decode response
|
||||||
if resp.Request.Method != "HEAD" && resp.ContentLength > 0 {
|
if resp.Request.Method != "HEAD" && resp.ContentLength > 0 {
|
||||||
@ -130,7 +141,7 @@ func (c *Client) DoRequestClient(req *Request) (*Response, error) {
|
|||||||
bodyReader = transform.NewReader(bodyReader, enc.NewDecoder())
|
bodyReader = transform.NewReader(bodyReader, enc.NewDecoder())
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if !c.charsetDetectDisabled {
|
if !c.opt.CharsetDetectDisabled {
|
||||||
bodyReader, err = charset.NewReader(bodyReader, req.Header.Get("Content-Type"))
|
bodyReader, err = charset.NewReader(bodyReader, req.Header.Get("Content-Type"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, errors.Wrap(err, "Reading determined encoding error")
|
return nil, errors.Wrap(err, "Reading determined encoding error")
|
||||||
@ -159,8 +170,8 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
|
|||||||
var res *network.Response
|
var res *network.Response
|
||||||
|
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
if c.remoteAllocatorURL != "" {
|
if c.opt.RemoteAllocatorURL != "" {
|
||||||
ctx, _ = chromedp.NewRemoteAllocator(ctx, c.remoteAllocatorURL)
|
ctx, _ = chromedp.NewRemoteAllocator(ctx, c.opt.RemoteAllocatorURL)
|
||||||
}
|
}
|
||||||
ctx, cancel := chromedp.NewContext(ctx)
|
ctx, cancel := chromedp.NewContext(ctx)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
@ -101,7 +101,7 @@ func TestCharsetFromHeaders(t *testing.T) {
|
|||||||
defer ts.Close()
|
defer ts.Close()
|
||||||
|
|
||||||
req, _ := NewRequest("GET", ts.URL, nil)
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
|
res, _ := newClientDefault().DoRequest(req)
|
||||||
|
|
||||||
if string(res.Body) != "Gültekin" {
|
if string(res.Body) != "Gültekin" {
|
||||||
t.Fatal(string(res.Body))
|
t.Fatal(string(res.Body))
|
||||||
@ -116,7 +116,7 @@ func TestCharsetFromBody(t *testing.T) {
|
|||||||
defer ts.Close()
|
defer ts.Close()
|
||||||
|
|
||||||
req, _ := NewRequest("GET", ts.URL, nil)
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
|
res, _ := newClientDefault().DoRequest(req)
|
||||||
|
|
||||||
if string(res.Body) != "Gültekin" {
|
if string(res.Body) != "Gültekin" {
|
||||||
t.Fatal(string(res.Body))
|
t.Fatal(string(res.Body))
|
||||||
@ -132,7 +132,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) {
|
|||||||
|
|
||||||
req, _ := NewRequest("GET", ts.URL, nil)
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
req.Encoding = "windows-1254"
|
req.Encoding = "windows-1254"
|
||||||
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
|
res, _ := newClientDefault().DoRequest(req)
|
||||||
|
|
||||||
if string(res.Body) != "Gültekin" {
|
if string(res.Body) != "Gültekin" {
|
||||||
t.Fatal(string(res.Body))
|
t.Fatal(string(res.Body))
|
||||||
@ -141,7 +141,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) {
|
|||||||
|
|
||||||
func TestRetry(t *testing.T) {
|
func TestRetry(t *testing.T) {
|
||||||
req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil)
|
req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil)
|
||||||
res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
|
res, err := newClientDefault().DoRequest(req)
|
||||||
assert.Nil(t, res)
|
assert.Nil(t, res)
|
||||||
assert.Error(t, err)
|
assert.Error(t, err)
|
||||||
}
|
}
|
||||||
|
37
geziyor.go
37
geziyor.go
@ -36,6 +36,21 @@ type Geziyor struct {
|
|||||||
// NewGeziyor creates new Geziyor with default values.
|
// NewGeziyor creates new Geziyor with default values.
|
||||||
// If options are provided, their non-zero fields override the default values.
|
// If options are provided, their non-zero fields override the default values.
|
||||||
func NewGeziyor(opt *Options) *Geziyor {
|
func NewGeziyor(opt *Options) *Geziyor {
|
||||||
|
|
||||||
|
// Default Options
|
||||||
|
if opt.UserAgent == "" {
|
||||||
|
opt.UserAgent = client.DefaultUserAgent
|
||||||
|
}
|
||||||
|
if opt.MaxBodySize == 0 {
|
||||||
|
opt.MaxBodySize = client.DefaultMaxBody
|
||||||
|
}
|
||||||
|
if opt.RetryTimes == 0 {
|
||||||
|
opt.RetryTimes = client.DefaultRetryTimes
|
||||||
|
}
|
||||||
|
if len(opt.RetryHTTPCodes) == 0 {
|
||||||
|
opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes
|
||||||
|
}
|
||||||
|
|
||||||
geziyor := &Geziyor{
|
geziyor := &Geziyor{
|
||||||
Opt: opt,
|
Opt: opt,
|
||||||
Exports: make(chan interface{}, 1),
|
Exports: make(chan interface{}, 1),
|
||||||
@ -52,22 +67,14 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
metrics: metrics.NewMetrics(opt.MetricsType),
|
metrics: metrics.NewMetrics(opt.MetricsType),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Default
|
|
||||||
if opt.UserAgent == "" {
|
|
||||||
opt.UserAgent = client.DefaultUserAgent
|
|
||||||
}
|
|
||||||
if opt.MaxBodySize == 0 {
|
|
||||||
opt.MaxBodySize = client.DefaultMaxBody
|
|
||||||
}
|
|
||||||
if opt.RetryTimes == 0 {
|
|
||||||
opt.RetryTimes = client.DefaultRetryTimes
|
|
||||||
}
|
|
||||||
if len(opt.RetryHTTPCodes) == 0 {
|
|
||||||
opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes
|
|
||||||
}
|
|
||||||
|
|
||||||
// Client
|
// Client
|
||||||
geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes, opt.BrowserEndpoint)
|
geziyor.Client = client.NewClient(&client.Options{
|
||||||
|
MaxBodySize: opt.MaxBodySize,
|
||||||
|
CharsetDetectDisabled: opt.CharsetDetectDisabled,
|
||||||
|
RetryTimes: opt.RetryTimes,
|
||||||
|
RetryHTTPCodes: opt.RetryHTTPCodes,
|
||||||
|
RemoteAllocatorURL: opt.BrowserEndpoint,
|
||||||
|
})
|
||||||
if opt.Cache != nil {
|
if opt.Cache != nil {
|
||||||
geziyor.Client.Transport = &cache.Transport{
|
geziyor.Client.Transport = &cache.Transport{
|
||||||
Policy: opt.CachePolicy,
|
Policy: opt.CachePolicy,
|
||||||
|
@ -220,7 +220,11 @@ func BenchmarkRequests(b *testing.B) {
|
|||||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
fmt.Fprint(w, "Hello, client")
|
fmt.Fprint(w, "Hello, client")
|
||||||
}))
|
}))
|
||||||
ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes, "").Transport
|
ts.Client().Transport = client.NewClient(&client.Options{
|
||||||
|
MaxBodySize: client.DefaultMaxBody,
|
||||||
|
RetryTimes: client.DefaultRetryTimes,
|
||||||
|
RetryHTTPCodes: client.DefaultRetryHTTPCodes,
|
||||||
|
}).Transport
|
||||||
defer ts.Close()
|
defer ts.Close()
|
||||||
|
|
||||||
// As we don't benchmark creating a server, reset timer.
|
// As we don't benchmark creating a server, reset timer.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user