Robots.txt support implemented
parent 2cab68d2ce
commit 42faa92ece
README.md

@@ -9,12 +9,11 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
- 5.000+ Requests/Sec
- JS Rendering
- Caching (Memory/Disk)
- Automatic Data Extracting (CSS Selectors)
- Automatic Data Exporting (JSON, CSV, or custom)
- Metrics (Prometheus, Expvar, or custom)
- Limit Concurrency (Global/Per Domain)
- Request Delays (Constant/Randomized)
-- Cookies and Middlewares
+- Cookies, Middlewares, robots.txt
- Automatic response decoding to UTF-8

See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.

@@ -64,6 +63,8 @@ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for

### Installation

Go 1.12 required

    go get github.com/geziyor/geziyor

**NOTE**: macOS limits the maximum number of open file descriptors.

@@ -161,7 +162,6 @@ ok github.com/geziyor/geziyor 22.861s

If you're interested in helping this project, please consider these features:

- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
- Deploying Scrapers to Cloud
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
- ~~Realtime metrics (Prometheus etc.)~~
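The feature list above now advertises robots.txt handling: after this commit every request passes through the new RobotsTxt middleware before it is sent, and disallowed URLs are cancelled. Below is a minimal usage sketch built only from the API visible elsewhere in this diff (NewGeziyor, Options, StartURLs, ParseFunc, Start); the start URL is just an example.

    package main

    import (
        "fmt"

        "github.com/geziyor/geziyor"
        "github.com/geziyor/geziyor/client"
    )

    func main() {
        geziyor.NewGeziyor(&geziyor.Options{
            StartURLs: []string{"https://httpbin.org/"},
            ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
                // Requests forbidden by robots.txt are cancelled by the new
                // middleware before they are sent, so ParseFunc never sees them.
                fmt.Println(r.StatusCode)
            },
        }).Start()
    }

Setting RobotsTxtDisabled: true in Options (added further down in options.go) restores the old behaviour of fetching without checking robots.txt.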
client/client.go

@@ -74,11 +74,10 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re

// DoRequest selects appropriate request handler, client or Chrome
func (c *Client) DoRequest(req *Request) (*Response, error) {
-    if !req.Rendered {
-        return c.DoRequestClient(req)
-    } else {
+    if req.Rendered {
        return c.DoRequestChrome(req)
    }
+    return c.DoRequestClient(req)
}

// DoRequestClient is a simple wrapper to read response according to options.
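The refactored DoRequest above replaces the if/else with an early return: requests flagged Rendered go through Chrome, everything else through the plain HTTP client. A rough sketch of calling the client directly, with argument values assumed from the documented defaults (1 GB body limit, 2 retries) rather than taken from this diff, and a headless Chrome only needed for the Rendered path:

    package main

    import (
        "fmt"
        "log"

        "github.com/geziyor/geziyor/client"
    )

    func main() {
        // maxBodySize, charsetDetectDisabled, retryTimes, retryHTTPCodes
        c := client.NewClient(1024*1024*1024, false, 2, client.DefaultRetryHTTPCodes)

        req, err := client.NewRequest("GET", "https://httpbin.org/get", nil)
        if err != nil {
            log.Fatal(err)
        }

        // Rendered is false here, so DoRequest dispatches to DoRequestClient.
        // Setting req.Rendered = true would dispatch to DoRequestChrome instead.
        res, err := c.DoRequest(req)
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println(res.StatusCode, len(res.Body))
    }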
geziyor.go
@@ -19,13 +19,13 @@ type Geziyor struct {
    Client  *client.Client
    Exports chan interface{}

-    metrics             *metrics.Metrics
-    requestMiddlewares  []middleware.RequestProcessor
-    responseMiddlewares []middleware.ResponseProcessor
-    wgRequests          sync.WaitGroup
-    wgExporters         sync.WaitGroup
-    semGlobal           chan struct{}
-    semHosts            struct {
+    metrics        *metrics.Metrics
+    reqMiddlewares []middleware.RequestProcessor
+    resMiddlewares []middleware.ResponseProcessor
+    wgRequests     sync.WaitGroup
+    wgExporters    sync.WaitGroup
+    semGlobal      chan struct{}
+    semHosts       struct {
        sync.RWMutex
        hostSems map[string]chan struct{}
    }
@@ -37,23 +37,19 @@ func NewGeziyor(opt *Options) *Geziyor {
    geziyor := &Geziyor{
        Opt:     opt,
        Exports: make(chan interface{}, 1),
-        requestMiddlewares: []middleware.RequestProcessor{
+        reqMiddlewares: []middleware.RequestProcessor{
            &middleware.AllowedDomains{AllowedDomains: opt.AllowedDomains},
            &middleware.DuplicateRequests{RevisitEnabled: opt.URLRevisitEnabled},
            &middleware.Headers{UserAgent: opt.UserAgent},
            middleware.NewDelay(opt.RequestDelayRandomize, opt.RequestDelay),
        },
-        responseMiddlewares: []middleware.ResponseProcessor{
+        resMiddlewares: []middleware.ResponseProcessor{
            &middleware.ParseHTML{ParseHTMLDisabled: opt.ParseHTMLDisabled},
            &middleware.LogStats{LogDisabled: opt.LogDisabled},
        },
        metrics: metrics.NewMetrics(opt.MetricsType),
    }

-    metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
-    geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, metricsMiddleware)
-    geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, metricsMiddleware)
-
    // Default
    if opt.UserAgent == "" {
        opt.UserAgent = client.DefaultUserAgent
@@ -67,6 +63,7 @@ func NewGeziyor(opt *Options) *Geziyor {
    if len(opt.RetryHTTPCodes) == 0 {
        opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes
    }

+    // Client
    geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes)
    if opt.Cache != nil {
@@ -82,6 +79,7 @@ func NewGeziyor(opt *Options) *Geziyor {
    if opt.MaxRedirect != 0 {
        geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect)
    }

+    // Concurrency
    if opt.ConcurrentRequests != 0 {
        geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
@@ -92,9 +90,19 @@ func NewGeziyor(opt *Options) *Geziyor {
            hostSems map[string]chan struct{}
        }{hostSems: make(map[string]chan struct{})}
    }
-    // Middlewares
-    geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
-    geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)

+    // Base Middlewares
+    metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
+    geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, metricsMiddleware)
+    geziyor.resMiddlewares = append(geziyor.resMiddlewares, metricsMiddleware)

+    robotsMiddleware := middleware.NewRobotsTxt(geziyor.Client, opt.RobotsTxtDisabled)
+    geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, robotsMiddleware)

+    // Custom Middlewares
+    geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, opt.RequestMiddlewares...)
+    geziyor.resMiddlewares = append(geziyor.resMiddlewares, opt.ResponseMiddlewares...)

    // Logging
    if opt.LogDisabled {
        log.SetOutput(ioutil.Discard)
@@ -200,7 +208,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
    }
    defer g.recoverMe()

-    for _, middlewareFunc := range g.requestMiddlewares {
+    for _, middlewareFunc := range g.reqMiddlewares {
        middlewareFunc.ProcessRequest(req)
        if req.Cancelled {
            return
@@ -213,7 +221,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
        return
    }

-    for _, middlewareFunc := range g.responseMiddlewares {
+    for _, middlewareFunc := range g.resMiddlewares {
        middlewareFunc.ProcessResponse(res)
    }
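The wiring above also fixes middleware ordering: the built-in request middlewares (allowed domains, duplicate filtering, headers, delay, metrics and the new robots.txt check) are registered first, and anything supplied via Options.RequestMiddlewares or Options.ResponseMiddlewares runs after them. A sketch of plugging in a custom request middleware; the pathFilter type and its rule are invented for illustration, only the RequestProcessor interface and Request.Cancel come from this diff:

    package main

    import (
        "log"
        "strings"

        "github.com/geziyor/geziyor"
        "github.com/geziyor/geziyor/client"
        "github.com/geziyor/geziyor/middleware"
    )

    // pathFilter cancels requests whose URL path starts with a blocked prefix.
    // It runs after the built-in middlewares, including the new RobotsTxt one.
    type pathFilter struct{ blockedPrefix string }

    func (p *pathFilter) ProcessRequest(r *client.Request) {
        if strings.HasPrefix(r.URL.Path, p.blockedPrefix) {
            log.Println("Skipping:", r.URL.String())
            r.Cancel()
        }
    }

    func main() {
        geziyor.NewGeziyor(&geziyor.Options{
            StartURLs:          []string{"https://httpbin.org/"},
            RequestMiddlewares: []middleware.RequestProcessor{&pathFilter{blockedPrefix: "/deny"}},
            ParseFunc:          func(g *geziyor.Geziyor, r *client.Response) {},
        }).Start()
    }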
geziyor_test.go

@@ -187,6 +187,16 @@ func TestConcurrentRequests(t *testing.T) {
    }).Start()
}

+func TestRobots(t *testing.T) {
+    defer leaktest.Check(t)()
+    geziyor.NewGeziyor(&geziyor.Options{
+        StartURLs: []string{"https://httpbin.org/deny"},
+        ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
+            t.Error("/deny should be blocked by robots.txt middleware")
+        },
+    }).Start()
+}

// Make sure to increase open file descriptor limits before running
func BenchmarkRequests(b *testing.B) {
go.mod
@@ -13,6 +13,7 @@ require (
    github.com/pkg/errors v0.8.1
    github.com/prometheus/client_golang v1.0.0
    github.com/stretchr/testify v1.3.0
+    github.com/temoto/robotstxt v1.1.1
    golang.org/x/net v0.0.0-20190522155817-f3200d17e092
    golang.org/x/text v0.3.2
)
go.sum
@@ -69,6 +69,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
middleware/middleware.go

@@ -4,6 +4,7 @@ import (
    "github.com/geziyor/geziyor/client"
)

+// RequestResponseProcessor interface is for middlewares that needs to process both requests and responses
type RequestResponseProcessor interface {
    RequestProcessor
    ResponseProcessor
middleware/robotstxt.go (new file)
@@ -0,0 +1,63 @@
+package middleware
+
+import (
+    "github.com/geziyor/geziyor/client"
+    "github.com/temoto/robotstxt"
+    "log"
+    "sync"
+)
+
+// RobotsTxt middleware filters out requests forbidden by the robots.txt exclusion standard.
+type RobotsTxt struct {
+    robotsDisabled bool
+    client         *client.Client
+    mut            sync.RWMutex
+    robotsMap      map[string]*robotstxt.RobotsData
+}
+
+func NewRobotsTxt(client *client.Client, robotsDisabled bool) RequestProcessor {
+    return &RobotsTxt{
+        robotsDisabled: robotsDisabled,
+        client:         client,
+        robotsMap:      make(map[string]*robotstxt.RobotsData),
+    }
+}
+
+func (m *RobotsTxt) ProcessRequest(r *client.Request) {
+    if m.robotsDisabled {
+        return
+    }
+
+    // TODO: Locking like this improves performance but causes duplicate requests to robots.txt
+    m.mut.RLock()
+    robotsData, exists := m.robotsMap[r.Host]
+    m.mut.RUnlock()
+
+    if !exists {
+        // TODO: Disable retry
+        robotsReq, err := client.NewRequest("GET", r.URL.Scheme+"://"+r.Host+"/robots.txt", nil)
+        if err != nil {
+            return // Don't Do anything
+        }
+
+        robotsResp, err := m.client.DoRequestClient(robotsReq)
+        if err != nil {
+            return // Don't Do anything
+        }
+
+        robotsData, err = robotstxt.FromStatusAndBytes(robotsResp.StatusCode, robotsResp.Body)
+        if err != nil {
+            return // Don't Do anything
+        }
+
+        m.mut.Lock()
+        m.robotsMap[r.Host] = robotsData
+        m.mut.Unlock()
+    }
+
+    if !robotsData.TestAgent(r.URL.Path, r.UserAgent()) {
+        // TODO: Forbidden requests metrics
+        log.Println("Forbidden by robots.txt:", r.URL.String())
+        r.Cancel()
+    }
+}
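The allow/deny decision above is delegated to github.com/temoto/robotstxt: the middleware parses each host's robots.txt once, caches the resulting RobotsData, and asks TestAgent whether the request path is allowed for the current user agent. A standalone sketch of that call; the sample rules are invented, and the comment on the status code only describes how the middleware feeds the library:

    package main

    import (
        "fmt"

        "github.com/temoto/robotstxt"
    )

    func main() {
        body := []byte("User-agent: *\nDisallow: /deny\n")

        // The middleware passes the real status code of the robots.txt fetch;
        // 200 means "apply the rules found in the body".
        robotsData, err := robotstxt.FromStatusAndBytes(200, body)
        if err != nil {
            panic(err)
        }

        fmt.Println(robotsData.TestAgent("/deny", "Geziyor 1.0"))     // false: forbidden
        fmt.Println(robotsData.TestAgent("/anything", "Geziyor 1.0")) // true: allowed
    }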
options.go
@@ -15,48 +15,28 @@ type Options struct {
    // If empty, any domain is allowed
    AllowedDomains []string

    // First requests will made to this url array. (Concurrently)
    StartURLs []string

    // StartRequestsFunc called on scraper start
    StartRequestsFunc func(g *Geziyor)

    // ParseFunc is callback of StartURLs response.
    ParseFunc func(g *Geziyor, r *client.Response)

    // Timeout is global request timeout
    Timeout time.Duration

    // Set this to enable caching responses.
    // Memory Cache: httpcache.NewMemoryCache()
    // Disk Cache: diskcache.New(".cache")
    Cache httpcache.Cache

    // Charset Detection disable
    CharsetDetectDisabled bool

    // Concurrent requests limit
    ConcurrentRequests int

    // Concurrent requests per domain limit
    ConcurrentRequestsPerDomain int

    // User Agent.
    // Default: "Geziyor 1.0"
    UserAgent string

    // Request delays
    RequestDelay time.Duration
    // RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
    RequestDelayRandomize bool

    // Disable logging by setting this true
    LogDisabled bool
    // If set true, cookies won't send.
    CookiesDisabled bool

    // For extracting data
    Exporters []export.Exporter

    // Called before requests made to manipulate requests
    RequestMiddlewares []middleware.RequestProcessor

    // Called after response received
    ResponseMiddlewares []middleware.ResponseProcessor
    // Disable logging by setting this true
    LogDisabled bool

    // Max body reading size in bytes. Default: 1GB
    MaxBodySize int64
@@ -64,27 +44,53 @@ type Options struct {
    // Maximum redirection time. Default: 10
    MaxRedirect int

    // Charset Detection disable
    CharsetDetectDisabled bool
    // Scraper metrics exporting type. See metrics.Type
    MetricsType metrics.Type

    // Maximum number of times to retry, in addition to the first download.
    // Set -1 to disable retrying
    // Default: 2
    RetryTimes int
    // ParseFunc is callback of StartURLs response.
    ParseFunc func(g *Geziyor, r *client.Response)

    // If true, HTML parsing is disabled to improve performance.
    ParseHTMLDisabled bool

    // Request delays
    RequestDelay time.Duration

    // RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
    RequestDelayRandomize bool

    // Called before requests made to manipulate requests
    RequestMiddlewares []middleware.RequestProcessor

    // Called after response received
    ResponseMiddlewares []middleware.ResponseProcessor

    // Which HTTP response codes to retry.
    // Other errors (DNS lookup issues, connections lost, etc) are always retried.
    // Default: []int{500, 502, 503, 504, 522, 524, 408}
    RetryHTTPCodes []int

    // If true, HTML parsing is disabled to improve performance.
    ParseHTMLDisabled bool
    // Maximum number of times to retry, in addition to the first download.
    // Set -1 to disable retrying
    // Default: 2
    RetryTimes int

+    // If true, disable robots.txt checks
+    RobotsTxtDisabled bool

    // StartRequestsFunc called on scraper start
    StartRequestsFunc func(g *Geziyor)

    // First requests will made to this url array. (Concurrently)
    StartURLs []string

    // Timeout is global request timeout
    Timeout time.Duration

    // Revisiting same URLs is disabled by default
    URLRevisitEnabled bool

    // If set true, cookies won't send.
    CookiesDisabled bool

    MetricsType metrics.Type
    // User Agent.
    // Default: "Geziyor 1.0"
    UserAgent string
}
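Putting the new option together with the rest of the struct: robots.txt checking is on by default, and opting out is a single field. A hedged sketch; the URL and delay are arbitrary examples, not defaults:

    package main

    import (
        "time"

        "github.com/geziyor/geziyor"
        "github.com/geziyor/geziyor/client"
    )

    func main() {
        geziyor.NewGeziyor(&geziyor.Options{
            StartURLs:         []string{"https://httpbin.org/deny"},
            RobotsTxtDisabled: true, // skip the RobotsTxt middleware entirely
            RequestDelay:      time.Second,
            ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
                // With robots.txt checks disabled, /deny is fetched and reaches ParseFunc;
                // with the default (false), the request is cancelled before it is sent.
            },
        }).Start()
    }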