From 9adff755097b74d75b969456012f4cf9b7f61ce2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Thu, 4 Jul 2019 13:36:10 +0300 Subject: [PATCH] Retry requests support implemented for client. --- client/client.go | 58 +++++++++++++++++++++++++++++++++++-------- client/client_test.go | 14 ++++++++--- client/request.go | 2 ++ export/csv.go | 4 +-- export/json.go | 2 +- geziyor.go | 23 ++++++++++++----- geziyor_test.go | 2 +- internal/strings.go | 20 +++++++-------- middleware.go | 4 +++ options.go | 13 +++++++++- 10 files changed, 108 insertions(+), 34 deletions(-) diff --git a/client/client.go b/client/client.go index 2be7c28..601bffd 100644 --- a/client/client.go +++ b/client/client.go @@ -10,6 +10,7 @@ import ( "golang.org/x/text/transform" "io" "io/ioutil" + "log" "net" "net/http" "net/url" @@ -25,14 +26,25 @@ var ( // Client is a small wrapper around *http.Client to provide new methods. type Client struct { *http.Client + maxBodySize int64 + charsetDetectDisabled bool + retryTimes int + retryHTTPCodes []int } -const DefaultUserAgent = "Geziyor 1.0" -const DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB +const ( + DefaultUserAgent = "Geziyor 1.0" + DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB + DefaultRetryTimes = 2 +) + +var ( + DefaultRetryHTTPCodes = []int{500, 502, 503, 504, 522, 524, 408} +) // NewClient creates http.Client with modified values for typical web scraper -func NewClient() *Client { - client := &http.Client{ +func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int) *Client { + httpClient := &http.Client{ Transport: &http.Transport{ Proxy: http.ProxyFromEnvironment, DialContext: (&net.Dialer{ @@ -48,31 +60,57 @@ func NewClient() *Client { }, Timeout: time.Second * 180, // Google's timeout } - return &Client{Client: client} + + client := Client{ + Client: httpClient, + maxBodySize: maxBodySize, + charsetDetectDisabled: charsetDetectDisabled, + retryTimes: retryTimes, + retryHTTPCodes: retryHTTPCodes, + } + + return &client } // DoRequest selects appropriate request handler, client or Chrome -func (c *Client) DoRequest(req *Request, maxBodySize int64, charsetDetectDisabled bool) (*Response, error) { +func (c *Client) DoRequest(req *Request) (*Response, error) { if !req.Rendered { - return c.DoRequestClient(req, maxBodySize, charsetDetectDisabled) + return c.DoRequestClient(req) } else { return c.DoRequestChrome(req) } } // DoRequestClient is a simple wrapper to read response according to options. -func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectDisabled bool) (*Response, error) { +func (c *Client) DoRequestClient(req *Request) (*Response, error) { // Do request resp, err := c.Do(req.Request) if resp != nil { defer resp.Body.Close() } if err != nil { + // Retry on Error + if req.retryCounter < c.retryTimes { + req.retryCounter++ + log.Println("Retrying:", req.URL.String()) + return c.DoRequestClient(req) + } return nil, errors.Wrap(err, "Response error") } + // Checks status code to retry + if req.retryCounter < c.retryTimes { + for _, statusCode := range c.retryHTTPCodes { + if resp.StatusCode == statusCode { + req.retryCounter++ + log.Println("Retrying:", req.URL.String(), resp.StatusCode) + return c.DoRequestClient(req) + } + } + } + // Limit response body reading - bodyReader := io.LimitReader(resp.Body, maxBodySize) + bodyReader := io.LimitReader(resp.Body, c.maxBodySize) // Decode response if resp.Request.Method != "HEAD" { @@ -81,7 +119,7 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD bodyReader = transform.NewReader(bodyReader, enc.NewDecoder()) } } else { - if !charsetDetectDisabled { + if !c.charsetDetectDisabled { bodyReader, err = charset.NewReader(bodyReader, req.Header.Get("Content-Type")) if err != nil { return nil, errors.Wrap(err, "Reading determined encoding error") diff --git a/client/client_test.go b/client/client_test.go index b5ddc0d..bd98829 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -2,6 +2,7 @@ package client import ( "fmt" + "github.com/stretchr/testify/assert" "net/http" "net/http/httptest" "reflect" @@ -100,7 +101,7 @@ func TestCharsetFromHeaders(t *testing.T) { defer ts.Close() req, _ := NewRequest("GET", ts.URL, nil) - res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false) + res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequestClient(req) if string(res.Body) != "Gültekin" { t.Fatal(string(res.Body)) @@ -115,7 +116,7 @@ func TestCharsetFromBody(t *testing.T) { defer ts.Close() req, _ := NewRequest("GET", ts.URL, nil) - res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false) + res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequestClient(req) if string(res.Body) != "Gültekin" { t.Fatal(string(res.Body)) @@ -131,9 +132,16 @@ func TestCharsetProvidedWithRequest(t *testing.T) { req, _ := NewRequest("GET", ts.URL, nil) req.Encoding = "windows-1254" - res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false) + res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequestClient(req) if string(res.Body) != "Gültekin" { t.Fatal(string(res.Body)) } } + +func TestRetry(t *testing.T) { + req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil) + res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequestClient(req) + assert.Nil(t, res) + assert.Error(t, err) +} diff --git a/client/request.go b/client/request.go index 7298d49..9b00322 100644 --- a/client/request.go +++ b/client/request.go @@ -26,6 +26,8 @@ type Request struct { // Set this true to cancel requests. Should be used on middlewares. Cancelled bool + + retryCounter int } // Cancel request diff --git a/export/csv.go b/export/csv.go index e0e01c2..893c4e0 100644 --- a/export/csv.go +++ b/export/csv.go @@ -21,7 +21,7 @@ type CSV struct { func (e *CSV) Export(exports chan interface{}) { // Create or append file - file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) if err != nil { log.Printf("Output file creation error: %v\n", err) return @@ -29,7 +29,7 @@ func (e *CSV) Export(exports chan interface{}) { defer file.Close() writer := csv.NewWriter(file) - writer.Comma = internal.PreferFirstRune(e.Comma, ',') + writer.Comma = internal.DefaultRune(e.Comma, ',') writer.UseCRLF = e.UseCRLF // Export data as responses came diff --git a/export/json.go b/export/json.go index 3c8d3d0..55cab72 100644 --- a/export/json.go +++ b/export/json.go @@ -19,7 +19,7 @@ type JSON struct { func (e *JSON) Export(exports chan interface{}) { // Create or append file - file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) if err != nil { log.Printf("Output file creation error: %v\n", err) return diff --git a/geziyor.go b/geziyor.go index 8b10c86..341e21e 100644 --- a/geziyor.go +++ b/geziyor.go @@ -33,7 +33,6 @@ type Geziyor struct { // If options provided, options func NewGeziyor(opt *Options) *Geziyor { geziyor := &Geziyor{ - Client: client.NewClient(), Opt: opt, Exports: make(chan interface{}, 1), requestMiddlewares: []RequestMiddleware{ @@ -52,12 +51,21 @@ func NewGeziyor(opt *Options) *Geziyor { metrics: metrics.NewMetrics(opt.MetricsType), } + // Default if opt.UserAgent == "" { - geziyor.Opt.UserAgent = client.DefaultUserAgent + opt.UserAgent = client.DefaultUserAgent } if opt.MaxBodySize == 0 { - geziyor.Opt.MaxBodySize = client.DefaultMaxBody + opt.MaxBodySize = client.DefaultMaxBody } + if opt.RetryTimes == 0 { + opt.RetryTimes = client.DefaultRetryTimes + } + if len(opt.RetryHTTPCodes) == 0 { + opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes + } + // Client + geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes) if opt.Cache != nil { geziyor.Client.Transport = &httpcache.Transport{ Transport: geziyor.Client.Transport, Cache: opt.Cache, MarkCachedResponses: true} @@ -71,6 +79,7 @@ func NewGeziyor(opt *Options) *Geziyor { if opt.MaxRedirect != 0 { geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect) } + // Concurrency if opt.ConcurrentRequests != 0 { geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests) } @@ -80,11 +89,13 @@ func NewGeziyor(opt *Options) *Geziyor { hostSems map[string]chan struct{} }{hostSems: make(map[string]chan struct{})} } + // Middlewares + geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...) + geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...) + // Logging if opt.LogDisabled { log.SetOutput(ioutil.Discard) } - geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...) - geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...) return geziyor } @@ -191,7 +202,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re } } - res, err := g.Client.DoRequest(req, g.Opt.MaxBodySize, g.Opt.CharsetDetectDisabled) + res, err := g.Client.DoRequest(req) if err != nil { log.Println(err) return diff --git a/geziyor_test.go b/geziyor_test.go index 3418ad3..e79d14b 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -211,7 +211,7 @@ func BenchmarkRequests(b *testing.B) { ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { fmt.Fprint(w, "Hello, client") })) - ts.Client().Transport = client.NewClient().Transport + ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes).Transport defer ts.Close() // As we don't benchmark creating a server, reset timer. diff --git a/internal/strings.go b/internal/strings.go index f05be13..8be1945 100644 --- a/internal/strings.go +++ b/internal/strings.go @@ -1,19 +1,19 @@ package internal -// PreferFirst returns first non-empty string -func PreferFirst(first string, second string) string { - if first != "" { - return first +// DefaultString returns first non-empty string +func DefaultString(val string, valDefault string) string { + if val != "" { + return val } - return second + return valDefault } -// PreferFirstRune returns first non-empty rune -func PreferFirstRune(first rune, second rune) rune { - if first != 0 { - return first +// DefaultRune returns first non-empty rune +func DefaultRune(val rune, valDefault rune) rune { + if val != 0 { + return val } - return second + return valDefault } // Contains checks whether []string Contains string diff --git a/middleware.go b/middleware.go index b8ae850..4bffb5a 100644 --- a/middleware.go +++ b/middleware.go @@ -26,6 +26,8 @@ func init() { rand.Seed(time.Now().UnixNano()) } +// ---* REQUEST MIDDLEWARES *--- + // recoverMiddleware recovers scraping being crashed. // Logs error and stack trace func recoverMiddleware(g *Geziyor, r *client.Request) { @@ -86,6 +88,8 @@ func metricsRequestMiddleware(g *Geziyor, r *client.Request) { g.metrics.RequestCounter.With("method", r.Method).Add(1) } +// ---* RESPONSE MIDDLEWARES *--- + // parseHTMLMiddleware parses response if response is HTML func parseHTMLMiddleware(g *Geziyor, r *client.Response) { if !g.Opt.ParseHTMLDisabled && r.IsHTML() { diff --git a/options.go b/options.go index fa6c140..d56cdf9 100644 --- a/options.go +++ b/options.go @@ -40,7 +40,8 @@ type Options struct { // Concurrent requests per domain limit ConcurrentRequestsPerDomain int - // User Agent. Default: "Geziyor 1.0" + // User Agent. + // Default: "Geziyor 1.0" UserAgent string // Request delays @@ -69,6 +70,16 @@ type Options struct { // Charset Detection disable CharsetDetectDisabled bool + // Maximum number of times to retry, in addition to the first download. + // Set -1 to disable retrying + // Default: 2 + RetryTimes int + + // Which HTTP response codes to retry. + // Other errors (DNS lookup issues, connections lost, etc) are always retried. + // Default: []int{500, 502, 503, 504, 522, 524, 408} + RetryHTTPCodes []int + // If true, HTML parsing is disabled to improve performance. ParseHTMLDisabled bool