Robots.txt support implemented

parent 2cab68d2ce
commit 42faa92ece
README.md
@@ -9,12 +9,11 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
 - 5.000+ Requests/Sec
 - JS Rendering
 - Caching (Memory/Disk)
-- Automatic Data Extracting (CSS Selectors)
 - Automatic Data Exporting (JSON, CSV, or custom)
 - Metrics (Prometheus, Expvar, or custom)
 - Limit Concurrency (Global/Per Domain)
 - Request Delays (Constant/Randomized)
-- Cookies and Middlewares
+- Cookies, Middlewares, robots.txt
 - Automatic response decoding to UTF-8

 See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
@@ -64,6 +63,8 @@ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for

 ### Installation

+Go 1.12 required
+
     go get github.com/geziyor/geziyor

 **NOTE**: macOS limits the maximum number of open file descriptors.
@@ -161,7 +162,6 @@ ok github.com/geziyor/geziyor 22.861s
 If you're interested in helping this project, please consider these features:

 - Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
-- Deploying Scrapers to Cloud
 - ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
 - Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
 - ~~Realtime metrics (Prometheus etc.)~~
client/client.go
@@ -74,11 +74,10 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re

 // DoRequest selects appropriate request handler, client or Chrome
 func (c *Client) DoRequest(req *Request) (*Response, error) {
-    if !req.Rendered {
-        return c.DoRequestClient(req)
-    } else {
+    if req.Rendered {
         return c.DoRequestChrome(req)
     }
+    return c.DoRequestClient(req)
 }

 // DoRequestClient is a simple wrapper to read response according to options.
geziyor.go
@@ -19,13 +19,13 @@ type Geziyor struct {
     Client  *client.Client
     Exports chan interface{}

     metrics *metrics.Metrics
-    requestMiddlewares  []middleware.RequestProcessor
-    responseMiddlewares []middleware.ResponseProcessor
+    reqMiddlewares []middleware.RequestProcessor
+    resMiddlewares []middleware.ResponseProcessor
     wgRequests  sync.WaitGroup
     wgExporters sync.WaitGroup
     semGlobal   chan struct{}
     semHosts    struct {
         sync.RWMutex
         hostSems map[string]chan struct{}
     }
@@ -37,23 +37,19 @@ func NewGeziyor(opt *Options) *Geziyor {
     geziyor := &Geziyor{
         Opt:     opt,
         Exports: make(chan interface{}, 1),
-        requestMiddlewares: []middleware.RequestProcessor{
+        reqMiddlewares: []middleware.RequestProcessor{
             &middleware.AllowedDomains{AllowedDomains: opt.AllowedDomains},
             &middleware.DuplicateRequests{RevisitEnabled: opt.URLRevisitEnabled},
             &middleware.Headers{UserAgent: opt.UserAgent},
             middleware.NewDelay(opt.RequestDelayRandomize, opt.RequestDelay),
         },
-        responseMiddlewares: []middleware.ResponseProcessor{
+        resMiddlewares: []middleware.ResponseProcessor{
             &middleware.ParseHTML{ParseHTMLDisabled: opt.ParseHTMLDisabled},
             &middleware.LogStats{LogDisabled: opt.LogDisabled},
         },
         metrics: metrics.NewMetrics(opt.MetricsType),
     }

-    metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
-    geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, metricsMiddleware)
-    geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, metricsMiddleware)
-
     // Default
     if opt.UserAgent == "" {
         opt.UserAgent = client.DefaultUserAgent
@@ -67,6 +63,7 @@ func NewGeziyor(opt *Options) *Geziyor {
     if len(opt.RetryHTTPCodes) == 0 {
         opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes
     }
+
     // Client
     geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes)
     if opt.Cache != nil {
@@ -82,6 +79,7 @@ func NewGeziyor(opt *Options) *Geziyor {
     if opt.MaxRedirect != 0 {
         geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect)
     }
+
     // Concurrency
     if opt.ConcurrentRequests != 0 {
         geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
@@ -92,9 +90,19 @@ func NewGeziyor(opt *Options) *Geziyor {
             hostSems map[string]chan struct{}
         }{hostSems: make(map[string]chan struct{})}
     }
-    // Middlewares
-    geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
-    geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)
+
+    // Base Middlewares
+    metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
+    geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, metricsMiddleware)
+    geziyor.resMiddlewares = append(geziyor.resMiddlewares, metricsMiddleware)
+
+    robotsMiddleware := middleware.NewRobotsTxt(geziyor.Client, opt.RobotsTxtDisabled)
+    geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, robotsMiddleware)
+
+    // Custom Middlewares
+    geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, opt.RequestMiddlewares...)
+    geziyor.resMiddlewares = append(geziyor.resMiddlewares, opt.ResponseMiddlewares...)
+
     // Logging
     if opt.LogDisabled {
         log.SetOutput(ioutil.Discard)
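With this reordering, the built-in middlewares (metrics, robots.txt) always run before any user-supplied ones, so a custom RequestMiddleware only ever sees requests that survived the robots.txt check. A minimal sketch of plugging one in, assuming client.Request embeds *http.Request (so r.Header is available, as the r.UserAgent() call elsewhere in this diff suggests); the middleware name and header are hypothetical:

```go
package main

import (
	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/client"
	"github.com/geziyor/geziyor/middleware"
)

// debugHeader is a hypothetical middleware used only for illustration.
type debugHeader struct{}

// ProcessRequest tags every outgoing request; because custom middlewares
// are appended after the RobotsTxt middleware, it never runs for requests
// that robots.txt already cancelled.
func (d *debugHeader) ProcessRequest(r *client.Request) {
	r.Header.Set("X-Debug", "1")
}

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs:          []string{"https://httpbin.org/get"},
		RequestMiddlewares: []middleware.RequestProcessor{&debugHeader{}},
	}).Start()
}
```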
@@ -200,7 +208,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
     }
     defer g.recoverMe()

-    for _, middlewareFunc := range g.requestMiddlewares {
+    for _, middlewareFunc := range g.reqMiddlewares {
         middlewareFunc.ProcessRequest(req)
         if req.Cancelled {
             return
@@ -213,7 +221,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
         return
     }

-    for _, middlewareFunc := range g.responseMiddlewares {
+    for _, middlewareFunc := range g.resMiddlewares {
         middlewareFunc.ProcessResponse(res)
     }

geziyor_test.go
@@ -187,6 +187,16 @@ func TestConcurrentRequests(t *testing.T) {
     }).Start()
 }

+func TestRobots(t *testing.T) {
+    defer leaktest.Check(t)()
+    geziyor.NewGeziyor(&geziyor.Options{
+        StartURLs: []string{"https://httpbin.org/deny"},
+        ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
+            t.Error("/deny should be blocked by robots.txt middleware")
+        },
+    }).Start()
+}
+
 // Make sure to increase open file descriptor limits before running
 func BenchmarkRequests(b *testing.B) {

go.mod
@@ -13,6 +13,7 @@ require (
     github.com/pkg/errors v0.8.1
     github.com/prometheus/client_golang v1.0.0
     github.com/stretchr/testify v1.3.0
+    github.com/temoto/robotstxt v1.1.1
     golang.org/x/net v0.0.0-20190522155817-f3200d17e092
     golang.org/x/text v0.3.2
 )
go.sum
@@ -69,6 +69,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
 golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
middleware/middleware.go
@@ -4,6 +4,7 @@ import (
     "github.com/geziyor/geziyor/client"
 )

+// RequestResponseProcessor interface is for middlewares that needs to process both requests and responses
 type RequestResponseProcessor interface {
     RequestProcessor
     ResponseProcessor
middleware/robotstxt.go (new file)
@@ -0,0 +1,63 @@
+package middleware
+
+import (
+    "github.com/geziyor/geziyor/client"
+    "github.com/temoto/robotstxt"
+    "log"
+    "sync"
+)
+
+// RobotsTxt middleware filters out requests forbidden by the robots.txt exclusion standard.
+type RobotsTxt struct {
+    robotsDisabled bool
+    client         *client.Client
+    mut            sync.RWMutex
+    robotsMap      map[string]*robotstxt.RobotsData
+}
+
+func NewRobotsTxt(client *client.Client, robotsDisabled bool) RequestProcessor {
+    return &RobotsTxt{
+        robotsDisabled: robotsDisabled,
+        client:         client,
+        robotsMap:      make(map[string]*robotstxt.RobotsData),
+    }
+}
+
+func (m *RobotsTxt) ProcessRequest(r *client.Request) {
+    if m.robotsDisabled {
+        return
+    }
+
+    // TODO: Locking like this improves performance but causes duplicate requests to robots.txt
+    m.mut.RLock()
+    robotsData, exists := m.robotsMap[r.Host]
+    m.mut.RUnlock()
+
+    if !exists {
+        // TODO: Disable retry
+        robotsReq, err := client.NewRequest("GET", r.URL.Scheme+"://"+r.Host+"/robots.txt", nil)
+        if err != nil {
+            return // Don't Do anything
+        }
+
+        robotsResp, err := m.client.DoRequestClient(robotsReq)
+        if err != nil {
+            return // Don't Do anything
+        }
+
+        robotsData, err = robotstxt.FromStatusAndBytes(robotsResp.StatusCode, robotsResp.Body)
+        if err != nil {
+            return // Don't Do anything
+        }
+
+        m.mut.Lock()
+        m.robotsMap[r.Host] = robotsData
+        m.mut.Unlock()
+    }
+
+    if !robotsData.TestAgent(r.URL.Path, r.UserAgent()) {
+        // TODO: Forbidden requests metrics
+        log.Println("Forbidden by robots.txt:", r.URL.String())
+        r.Cancel()
+    }
+}
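The first TODO above notes a trade-off: releasing the read lock before fetching lets several concurrent requests for the same new host each download robots.txt. One possible refinement, sketched here with golang.org/x/sync/singleflight (an assumption of this note; the commit itself does not use that package) to collapse those fetches per host:

```go
// Sketch only: robotsGroup would live as a field on RobotsTxt in practice.
// singleflight.Group guarantees that, per host key, only one fetch runs at
// a time; concurrent callers block on it and share its result.
var robotsGroup singleflight.Group // import "golang.org/x/sync/singleflight"

func (m *RobotsTxt) robotsData(scheme, host string) (*robotstxt.RobotsData, error) {
	v, err, _ := robotsGroup.Do(host, func() (interface{}, error) {
		m.mut.RLock()
		data, exists := m.robotsMap[host]
		m.mut.RUnlock()
		if exists {
			return data, nil // another caller already cached it
		}
		req, err := client.NewRequest("GET", scheme+"://"+host+"/robots.txt", nil)
		if err != nil {
			return nil, err
		}
		resp, err := m.client.DoRequestClient(req)
		if err != nil {
			return nil, err
		}
		data, err = robotstxt.FromStatusAndBytes(resp.StatusCode, resp.Body)
		if err != nil {
			return nil, err
		}
		m.mut.Lock()
		m.robotsMap[host] = data
		m.mut.Unlock()
		return data, nil
	})
	if err != nil {
		return nil, err
	}
	return v.(*robotstxt.RobotsData), nil
}
```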
options.go
@@ -15,48 +15,28 @@ type Options struct {
     // If empty, any domain is allowed
     AllowedDomains []string

-    // First requests will made to this url array. (Concurrently)
-    StartURLs []string
-
-    // StartRequestsFunc called on scraper start
-    StartRequestsFunc func(g *Geziyor)
-
-    // ParseFunc is callback of StartURLs response.
-    ParseFunc func(g *Geziyor, r *client.Response)
-
-    // Timeout is global request timeout
-    Timeout time.Duration
     // Set this to enable caching responses.
     // Memory Cache: httpcache.NewMemoryCache()
     // Disk Cache: diskcache.New(".cache")
     Cache httpcache.Cache

+    // Charset Detection disable
+    CharsetDetectDisabled bool
+
     // Concurrent requests limit
     ConcurrentRequests int

     // Concurrent requests per domain limit
     ConcurrentRequestsPerDomain int

-    // User Agent.
-    // Default: "Geziyor 1.0"
-    UserAgent string
-
-    // Request delays
-    RequestDelay time.Duration
-    // RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
-    RequestDelayRandomize bool
-
-    // Disable logging by setting this true
-    LogDisabled bool
+    // If set true, cookies won't send.
+    CookiesDisabled bool

     // For extracting data
     Exporters []export.Exporter

-    // Called before requests made to manipulate requests
-    RequestMiddlewares []middleware.RequestProcessor
-
-    // Called after response received
-    ResponseMiddlewares []middleware.ResponseProcessor
+    // Disable logging by setting this true
+    LogDisabled bool

     // Max body reading size in bytes. Default: 1GB
     MaxBodySize int64
@@ -64,27 +44,53 @@ type Options struct {
     // Maximum redirection time. Default: 10
     MaxRedirect int

-    // Charset Detection disable
-    CharsetDetectDisabled bool
+    // Scraper metrics exporting type. See metrics.Type
+    MetricsType metrics.Type

-    // Maximum number of times to retry, in addition to the first download.
-    // Set -1 to disable retrying
-    // Default: 2
-    RetryTimes int
+    // ParseFunc is callback of StartURLs response.
+    ParseFunc func(g *Geziyor, r *client.Response)
+
+    // If true, HTML parsing is disabled to improve performance.
+    ParseHTMLDisabled bool
+
+    // Request delays
+    RequestDelay time.Duration
+
+    // RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
+    RequestDelayRandomize bool
+
+    // Called before requests made to manipulate requests
+    RequestMiddlewares []middleware.RequestProcessor
+
+    // Called after response received
+    ResponseMiddlewares []middleware.ResponseProcessor

     // Which HTTP response codes to retry.
     // Other errors (DNS lookup issues, connections lost, etc) are always retried.
     // Default: []int{500, 502, 503, 504, 522, 524, 408}
     RetryHTTPCodes []int

-    // If true, HTML parsing is disabled to improve performance.
-    ParseHTMLDisabled bool
+    // Maximum number of times to retry, in addition to the first download.
+    // Set -1 to disable retrying
+    // Default: 2
+    RetryTimes int
+
+    // If true, disable robots.txt checks
+    RobotsTxtDisabled bool
+
+    // StartRequestsFunc called on scraper start
+    StartRequestsFunc func(g *Geziyor)
+
+    // First requests will made to this url array. (Concurrently)
+    StartURLs []string
+
+    // Timeout is global request timeout
+    Timeout time.Duration

     // Revisiting same URLs is disabled by default
     URLRevisitEnabled bool

-    // If set true, cookies won't send.
-    CookiesDisabled bool
-
-    MetricsType metrics.Type
+    // User Agent.
+    // Default: "Geziyor 1.0"
+    UserAgent string
 }
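With RobotsTxtDisabled defaulting to false, robots.txt enforcement is on unless explicitly switched off. A usage sketch assembled from the API this commit exercises in TestRobots (the URL and the printed field are illustrative only):

```go
package main

import (
	"fmt"

	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/client"
)

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		// httpbin's robots.txt disallows /deny, so the RobotsTxt middleware
		// cancels this request and ParseFunc never runs.
		StartURLs: []string{"https://httpbin.org/deny"},
		ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
			fmt.Println(string(r.Body))
		},
		// RobotsTxtDisabled: true, // uncomment to skip robots.txt checks
	}).Start()
}
```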