Implemented request retry support for the client.
This commit is contained in:
parent
da03567fae
commit
9adff75509
@ -10,6 +10,7 @@ import (
|
|||||||
"golang.org/x/text/transform"
|
"golang.org/x/text/transform"
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
|
"log"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
@ -25,14 +26,25 @@ var (
|
|||||||
// Client is a small wrapper around *http.Client to provide new methods.
|
// Client is a small wrapper around *http.Client to provide new methods.
|
||||||
type Client struct {
|
type Client struct {
|
||||||
*http.Client
|
*http.Client
|
||||||
|
maxBodySize int64
|
||||||
|
charsetDetectDisabled bool
|
||||||
|
retryTimes int
|
||||||
|
retryHTTPCodes []int
|
||||||
}
|
}
|
||||||
|
|
||||||
const DefaultUserAgent = "Geziyor 1.0"
|
const (
|
||||||
const DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
|
DefaultUserAgent = "Geziyor 1.0"
|
||||||
|
DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
|
||||||
|
DefaultRetryTimes = 2
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
DefaultRetryHTTPCodes = []int{500, 502, 503, 504, 522, 524, 408}
|
||||||
|
)
|
||||||
|
|
||||||
// NewClient creates http.Client with modified values for typical web scraper
|
// NewClient creates http.Client with modified values for typical web scraper
|
||||||
func NewClient() *Client {
|
func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int) *Client {
|
||||||
client := &http.Client{
|
httpClient := &http.Client{
|
||||||
Transport: &http.Transport{
|
Transport: &http.Transport{
|
||||||
Proxy: http.ProxyFromEnvironment,
|
Proxy: http.ProxyFromEnvironment,
|
||||||
DialContext: (&net.Dialer{
|
DialContext: (&net.Dialer{
|
||||||
@ -48,31 +60,57 @@ func NewClient() *Client {
|
|||||||
},
|
},
|
||||||
Timeout: time.Second * 180, // Google's timeout
|
Timeout: time.Second * 180, // Google's timeout
|
||||||
}
|
}
|
||||||
return &Client{Client: client}
|
|
||||||
|
client := Client{
|
||||||
|
Client: httpClient,
|
||||||
|
maxBodySize: maxBodySize,
|
||||||
|
charsetDetectDisabled: charsetDetectDisabled,
|
||||||
|
retryTimes: retryTimes,
|
||||||
|
retryHTTPCodes: retryHTTPCodes,
|
||||||
|
}
|
||||||
|
|
||||||
|
return &client
|
||||||
}
|
}
|
||||||
|
|
||||||
// DoRequest selects appropriate request handler, client or Chrome
|
// DoRequest selects appropriate request handler, client or Chrome
|
||||||
func (c *Client) DoRequest(req *Request, maxBodySize int64, charsetDetectDisabled bool) (*Response, error) {
|
func (c *Client) DoRequest(req *Request) (*Response, error) {
|
||||||
if !req.Rendered {
|
if !req.Rendered {
|
||||||
return c.DoRequestClient(req, maxBodySize, charsetDetectDisabled)
|
return c.DoRequestClient(req)
|
||||||
} else {
|
} else {
|
||||||
return c.DoRequestChrome(req)
|
return c.DoRequestChrome(req)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// DoRequestClient is a simple wrapper to read response according to options.
|
// DoRequestClient is a simple wrapper to read response according to options.
|
||||||
func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectDisabled bool) (*Response, error) {
|
func (c *Client) DoRequestClient(req *Request) (*Response, error) {
|
||||||
// Do request
|
// Do request
|
||||||
resp, err := c.Do(req.Request)
|
resp, err := c.Do(req.Request)
|
||||||
if resp != nil {
|
if resp != nil {
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// Retry on Error
|
||||||
|
if req.retryCounter < c.retryTimes {
|
||||||
|
req.retryCounter++
|
||||||
|
log.Println("Retrying:", req.URL.String())
|
||||||
|
return c.DoRequestClient(req)
|
||||||
|
}
|
||||||
return nil, errors.Wrap(err, "Response error")
|
return nil, errors.Wrap(err, "Response error")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks status code to retry
|
||||||
|
if req.retryCounter < c.retryTimes {
|
||||||
|
for _, statusCode := range c.retryHTTPCodes {
|
||||||
|
if resp.StatusCode == statusCode {
|
||||||
|
req.retryCounter++
|
||||||
|
log.Println("Retrying:", req.URL.String(), resp.StatusCode)
|
||||||
|
return c.DoRequestClient(req)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Limit response body reading
|
// Limit response body reading
|
||||||
bodyReader := io.LimitReader(resp.Body, maxBodySize)
|
bodyReader := io.LimitReader(resp.Body, c.maxBodySize)
|
||||||
|
|
||||||
// Decode response
|
// Decode response
|
||||||
if resp.Request.Method != "HEAD" {
|
if resp.Request.Method != "HEAD" {
|
||||||
@ -81,7 +119,7 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
|
|||||||
bodyReader = transform.NewReader(bodyReader, enc.NewDecoder())
|
bodyReader = transform.NewReader(bodyReader, enc.NewDecoder())
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if !charsetDetectDisabled {
|
if !c.charsetDetectDisabled {
|
||||||
bodyReader, err = charset.NewReader(bodyReader, req.Header.Get("Content-Type"))
|
bodyReader, err = charset.NewReader(bodyReader, req.Header.Get("Content-Type"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, errors.Wrap(err, "Reading determined encoding error")
|
return nil, errors.Wrap(err, "Reading determined encoding error")
|
||||||
|
@ -2,6 +2,7 @@ package client
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"reflect"
|
"reflect"
|
||||||
@ -100,7 +101,7 @@ func TestCharsetFromHeaders(t *testing.T) {
|
|||||||
defer ts.Close()
|
defer ts.Close()
|
||||||
|
|
||||||
req, _ := NewRequest("GET", ts.URL, nil)
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false)
|
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequestClient(req)
|
||||||
|
|
||||||
if string(res.Body) != "Gültekin" {
|
if string(res.Body) != "Gültekin" {
|
||||||
t.Fatal(string(res.Body))
|
t.Fatal(string(res.Body))
|
||||||
@ -115,7 +116,7 @@ func TestCharsetFromBody(t *testing.T) {
|
|||||||
defer ts.Close()
|
defer ts.Close()
|
||||||
|
|
||||||
req, _ := NewRequest("GET", ts.URL, nil)
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false)
|
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequestClient(req)
|
||||||
|
|
||||||
if string(res.Body) != "Gültekin" {
|
if string(res.Body) != "Gültekin" {
|
||||||
t.Fatal(string(res.Body))
|
t.Fatal(string(res.Body))
|
||||||
@ -131,9 +132,16 @@ func TestCharsetProvidedWithRequest(t *testing.T) {
|
|||||||
|
|
||||||
req, _ := NewRequest("GET", ts.URL, nil)
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
req.Encoding = "windows-1254"
|
req.Encoding = "windows-1254"
|
||||||
res, _ := NewClient().DoRequestClient(req, DefaultMaxBody, false)
|
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequestClient(req)
|
||||||
|
|
||||||
if string(res.Body) != "Gültekin" {
|
if string(res.Body) != "Gültekin" {
|
||||||
t.Fatal(string(res.Body))
|
t.Fatal(string(res.Body))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRetry(t *testing.T) {
|
||||||
|
req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil)
|
||||||
|
res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequestClient(req)
|
||||||
|
assert.Nil(t, res)
|
||||||
|
assert.Error(t, err)
|
||||||
|
}
|
||||||
|
@ -26,6 +26,8 @@ type Request struct {
|
|||||||
|
|
||||||
// Set this true to cancel requests. Should be used on middlewares.
|
// Set this true to cancel requests. Should be used on middlewares.
|
||||||
Cancelled bool
|
Cancelled bool
|
||||||
|
|
||||||
|
retryCounter int
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cancel request
|
// Cancel request
|
||||||
|
@ -21,7 +21,7 @@ type CSV struct {
|
|||||||
func (e *CSV) Export(exports chan interface{}) {
|
func (e *CSV) Export(exports chan interface{}) {
|
||||||
|
|
||||||
// Create or append file
|
// Create or append file
|
||||||
file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Output file creation error: %v\n", err)
|
log.Printf("Output file creation error: %v\n", err)
|
||||||
return
|
return
|
||||||
@ -29,7 +29,7 @@ func (e *CSV) Export(exports chan interface{}) {
|
|||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
writer := csv.NewWriter(file)
|
writer := csv.NewWriter(file)
|
||||||
writer.Comma = internal.PreferFirstRune(e.Comma, ',')
|
writer.Comma = internal.DefaultRune(e.Comma, ',')
|
||||||
writer.UseCRLF = e.UseCRLF
|
writer.UseCRLF = e.UseCRLF
|
||||||
|
|
||||||
// Export data as responses arrive
|
// Export data as responses arrive
|
||||||
|
@ -19,7 +19,7 @@ type JSON struct {
|
|||||||
func (e *JSON) Export(exports chan interface{}) {
|
func (e *JSON) Export(exports chan interface{}) {
|
||||||
|
|
||||||
// Create or append file
|
// Create or append file
|
||||||
file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Output file creation error: %v\n", err)
|
log.Printf("Output file creation error: %v\n", err)
|
||||||
return
|
return
|
||||||
|
23
geziyor.go
23
geziyor.go
@ -33,7 +33,6 @@ type Geziyor struct {
|
|||||||
// If options provided, options
|
// If options provided, options
|
||||||
func NewGeziyor(opt *Options) *Geziyor {
|
func NewGeziyor(opt *Options) *Geziyor {
|
||||||
geziyor := &Geziyor{
|
geziyor := &Geziyor{
|
||||||
Client: client.NewClient(),
|
|
||||||
Opt: opt,
|
Opt: opt,
|
||||||
Exports: make(chan interface{}, 1),
|
Exports: make(chan interface{}, 1),
|
||||||
requestMiddlewares: []RequestMiddleware{
|
requestMiddlewares: []RequestMiddleware{
|
||||||
@ -52,12 +51,21 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
metrics: metrics.NewMetrics(opt.MetricsType),
|
metrics: metrics.NewMetrics(opt.MetricsType),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Default
|
||||||
if opt.UserAgent == "" {
|
if opt.UserAgent == "" {
|
||||||
geziyor.Opt.UserAgent = client.DefaultUserAgent
|
opt.UserAgent = client.DefaultUserAgent
|
||||||
}
|
}
|
||||||
if opt.MaxBodySize == 0 {
|
if opt.MaxBodySize == 0 {
|
||||||
geziyor.Opt.MaxBodySize = client.DefaultMaxBody
|
opt.MaxBodySize = client.DefaultMaxBody
|
||||||
}
|
}
|
||||||
|
if opt.RetryTimes == 0 {
|
||||||
|
opt.RetryTimes = client.DefaultRetryTimes
|
||||||
|
}
|
||||||
|
if len(opt.RetryHTTPCodes) == 0 {
|
||||||
|
opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes
|
||||||
|
}
|
||||||
|
// Client
|
||||||
|
geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes)
|
||||||
if opt.Cache != nil {
|
if opt.Cache != nil {
|
||||||
geziyor.Client.Transport = &httpcache.Transport{
|
geziyor.Client.Transport = &httpcache.Transport{
|
||||||
Transport: geziyor.Client.Transport, Cache: opt.Cache, MarkCachedResponses: true}
|
Transport: geziyor.Client.Transport, Cache: opt.Cache, MarkCachedResponses: true}
|
||||||
@ -71,6 +79,7 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
if opt.MaxRedirect != 0 {
|
if opt.MaxRedirect != 0 {
|
||||||
geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect)
|
geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect)
|
||||||
}
|
}
|
||||||
|
// Concurrency
|
||||||
if opt.ConcurrentRequests != 0 {
|
if opt.ConcurrentRequests != 0 {
|
||||||
geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
|
geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
|
||||||
}
|
}
|
||||||
@ -80,11 +89,13 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
hostSems map[string]chan struct{}
|
hostSems map[string]chan struct{}
|
||||||
}{hostSems: make(map[string]chan struct{})}
|
}{hostSems: make(map[string]chan struct{})}
|
||||||
}
|
}
|
||||||
|
// Middlewares
|
||||||
|
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
|
||||||
|
geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)
|
||||||
|
// Logging
|
||||||
if opt.LogDisabled {
|
if opt.LogDisabled {
|
||||||
log.SetOutput(ioutil.Discard)
|
log.SetOutput(ioutil.Discard)
|
||||||
}
|
}
|
||||||
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
|
|
||||||
geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)
|
|
||||||
|
|
||||||
return geziyor
|
return geziyor
|
||||||
}
|
}
|
||||||
@ -191,7 +202,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
res, err := g.Client.DoRequest(req, g.Opt.MaxBodySize, g.Opt.CharsetDetectDisabled)
|
res, err := g.Client.DoRequest(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Println(err)
|
log.Println(err)
|
||||||
return
|
return
|
||||||
|
@ -211,7 +211,7 @@ func BenchmarkRequests(b *testing.B) {
|
|||||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
fmt.Fprint(w, "Hello, client")
|
fmt.Fprint(w, "Hello, client")
|
||||||
}))
|
}))
|
||||||
ts.Client().Transport = client.NewClient().Transport
|
ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes).Transport
|
||||||
defer ts.Close()
|
defer ts.Close()
|
||||||
|
|
||||||
// As we don't benchmark creating a server, reset timer.
|
// As we don't benchmark creating a server, reset timer.
|
||||||
|
@ -1,19 +1,19 @@
|
|||||||
package internal
|
package internal
|
||||||
|
|
||||||
// PreferFirst returns first non-empty string
|
// DefaultString returns val if it is non-empty, otherwise valDefault
|
||||||
func PreferFirst(first string, second string) string {
|
func DefaultString(val string, valDefault string) string {
|
||||||
if first != "" {
|
if val != "" {
|
||||||
return first
|
return val
|
||||||
}
|
}
|
||||||
return second
|
return valDefault
|
||||||
}
|
}
|
||||||
|
|
||||||
// PreferFirstRune returns first non-empty rune
|
// DefaultRune returns val if it is non-zero, otherwise valDefault
|
||||||
func PreferFirstRune(first rune, second rune) rune {
|
func DefaultRune(val rune, valDefault rune) rune {
|
||||||
if first != 0 {
|
if val != 0 {
|
||||||
return first
|
return val
|
||||||
}
|
}
|
||||||
return second
|
return valDefault
|
||||||
}
|
}
|
||||||
|
|
||||||
// Contains checks whether []string Contains string
|
// Contains checks whether []string Contains string
|
||||||
|
@ -26,6 +26,8 @@ func init() {
|
|||||||
rand.Seed(time.Now().UnixNano())
|
rand.Seed(time.Now().UnixNano())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---* REQUEST MIDDLEWARES *---
|
||||||
|
|
||||||
// recoverMiddleware recovers from a panic so that scraping does not crash.
|
// recoverMiddleware recovers from a panic so that scraping does not crash.
|
||||||
// Logs error and stack trace
|
// Logs error and stack trace
|
||||||
func recoverMiddleware(g *Geziyor, r *client.Request) {
|
func recoverMiddleware(g *Geziyor, r *client.Request) {
|
||||||
@ -86,6 +88,8 @@ func metricsRequestMiddleware(g *Geziyor, r *client.Request) {
|
|||||||
g.metrics.RequestCounter.With("method", r.Method).Add(1)
|
g.metrics.RequestCounter.With("method", r.Method).Add(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---* RESPONSE MIDDLEWARES *---
|
||||||
|
|
||||||
// parseHTMLMiddleware parses response if response is HTML
|
// parseHTMLMiddleware parses response if response is HTML
|
||||||
func parseHTMLMiddleware(g *Geziyor, r *client.Response) {
|
func parseHTMLMiddleware(g *Geziyor, r *client.Response) {
|
||||||
if !g.Opt.ParseHTMLDisabled && r.IsHTML() {
|
if !g.Opt.ParseHTMLDisabled && r.IsHTML() {
|
||||||
|
13
options.go
13
options.go
@ -40,7 +40,8 @@ type Options struct {
|
|||||||
// Concurrent requests per domain limit
|
// Concurrent requests per domain limit
|
||||||
ConcurrentRequestsPerDomain int
|
ConcurrentRequestsPerDomain int
|
||||||
|
|
||||||
// User Agent. Default: "Geziyor 1.0"
|
// User Agent.
|
||||||
|
// Default: "Geziyor 1.0"
|
||||||
UserAgent string
|
UserAgent string
|
||||||
|
|
||||||
// Request delays
|
// Request delays
|
||||||
@ -69,6 +70,16 @@ type Options struct {
|
|||||||
// Charset Detection disable
|
// Charset Detection disable
|
||||||
CharsetDetectDisabled bool
|
CharsetDetectDisabled bool
|
||||||
|
|
||||||
|
// Maximum number of times to retry, in addition to the first download.
|
||||||
|
// Set -1 to disable retrying
|
||||||
|
// Default: 2
|
||||||
|
RetryTimes int
|
||||||
|
|
||||||
|
// Which HTTP response codes to retry.
|
||||||
|
// Other errors (DNS lookup issues, connections lost, etc) are always retried.
|
||||||
|
// Default: []int{500, 502, 503, 504, 522, 524, 408}
|
||||||
|
RetryHTTPCodes []int
|
||||||
|
|
||||||
// If true, HTML parsing is disabled to improve performance.
|
// If true, HTML parsing is disabled to improve performance.
|
||||||
ParseHTMLDisabled bool
|
ParseHTMLDisabled bool
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user