From 42faa92ece6475e224ce029385aae7292c7e1396 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Musab=20G=C3=BCltekin?=
Date: Sat, 6 Jul 2019 16:18:03 +0300
Subject: [PATCH] Robots.txt support implemented

---
 README.md                |  6 +--
 client/client.go         |  5 +--
 geziyor.go               | 44 +++++++++++---------
 geziyor_test.go          | 10 +++++
 go.mod                   |  1 +
 go.sum                   |  2 +
 middleware/middleware.go |  1 +
 middleware/robotstxt.go  | 63 +++++++++++++++++++++++++++++
 options.go               | 86 +++++++++++++++++++++-------------------
 9 files changed, 154 insertions(+), 64 deletions(-)
 create mode 100644 middleware/robotstxt.go

diff --git a/README.md b/README.md
index ebc09f8..026a894 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,11 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
 - 5.000+ Requests/Sec
 - JS Rendering
 - Caching (Memory/Disk)
-- Automatic Data Extracting (CSS Selectors)
 - Automatic Data Exporting (JSON, CSV, or custom)
 - Metrics (Prometheus, Expvar, or custom)
 - Limit Concurrency (Global/Per Domain)
 - Request Delays (Constant/Randomized)
-- Cookies and Middlewares
+- Cookies, Middlewares, robots.txt
 - Automatic response decoding to UTF-8
 
 See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
@@ -64,6 +63,8 @@ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for
 
 ### Installation
 
+Go 1.12 required
+
     go get github.com/geziyor/geziyor
 
 **NOTE**: macOS limits the maximum number of open file descriptors.
@@ -161,7 +162,6 @@ ok github.com/geziyor/geziyor 22.861s
 If you're interested in helping this project, please consider these features:
 
 - Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
-- Deploying Scrapers to Cloud
 - ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
 - Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
 - ~~Realtime metrics (Prometheus etc.)~~
diff --git a/client/client.go b/client/client.go
index 601bffd..cfcc1e1 100644
--- a/client/client.go
+++ b/client/client.go
@@ -74,11 +74,10 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re
 
 // DoRequest selects appropriate request handler, client or Chrome
 func (c *Client) DoRequest(req *Request) (*Response, error) {
-	if !req.Rendered {
-		return c.DoRequestClient(req)
-	} else {
+	if req.Rendered {
 		return c.DoRequestChrome(req)
 	}
+	return c.DoRequestClient(req)
 }
 
 // DoRequestClient is a simple wrapper to read response according to options.
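The robots.txt checks introduced by this patch are on by default and are controlled through the existing Options API (see the RobotsTxtDisabled field added to options.go further down). A minimal usage sketch, with an illustrative start URL and callback that are not part of the patch:

    // Sketch only: RobotsTxtDisabled defaults to false, so requests that
    // robots.txt disallows are cancelled before they are sent.
    package main

    import (
        "log"

        "github.com/geziyor/geziyor"
        "github.com/geziyor/geziyor/client"
    )

    func main() {
        geziyor.NewGeziyor(&geziyor.Options{
            StartURLs:         []string{"https://example.com/"}, // illustrative URL
            RobotsTxtDisabled: false,                            // set true to skip robots.txt checks
            ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
                log.Println(r.StatusCode)
            },
        }).Start()
    }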
diff --git a/geziyor.go b/geziyor.go
index 702e9fa..8e5e74a 100644
--- a/geziyor.go
+++ b/geziyor.go
@@ -19,13 +19,13 @@ type Geziyor struct {
 	Client  *client.Client
 	Exports chan interface{}
 
-	metrics             *metrics.Metrics
-	requestMiddlewares  []middleware.RequestProcessor
-	responseMiddlewares []middleware.ResponseProcessor
-	wgRequests          sync.WaitGroup
-	wgExporters         sync.WaitGroup
-	semGlobal           chan struct{}
-	semHosts            struct {
+	metrics        *metrics.Metrics
+	reqMiddlewares []middleware.RequestProcessor
+	resMiddlewares []middleware.ResponseProcessor
+	wgRequests     sync.WaitGroup
+	wgExporters    sync.WaitGroup
+	semGlobal      chan struct{}
+	semHosts       struct {
 		sync.RWMutex
 		hostSems map[string]chan struct{}
 	}
@@ -37,23 +37,19 @@ func NewGeziyor(opt *Options) *Geziyor {
 	geziyor := &Geziyor{
 		Opt:     opt,
 		Exports: make(chan interface{}, 1),
-		requestMiddlewares: []middleware.RequestProcessor{
+		reqMiddlewares: []middleware.RequestProcessor{
 			&middleware.AllowedDomains{AllowedDomains: opt.AllowedDomains},
 			&middleware.DuplicateRequests{RevisitEnabled: opt.URLRevisitEnabled},
 			&middleware.Headers{UserAgent: opt.UserAgent},
 			middleware.NewDelay(opt.RequestDelayRandomize, opt.RequestDelay),
 		},
-		responseMiddlewares: []middleware.ResponseProcessor{
+		resMiddlewares: []middleware.ResponseProcessor{
 			&middleware.ParseHTML{ParseHTMLDisabled: opt.ParseHTMLDisabled},
 			&middleware.LogStats{LogDisabled: opt.LogDisabled},
 		},
 		metrics: metrics.NewMetrics(opt.MetricsType),
 	}
-	metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
-	geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, metricsMiddleware)
-	geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, metricsMiddleware)
-
 	// Default
 	if opt.UserAgent == "" {
 		opt.UserAgent = client.DefaultUserAgent
@@ -67,6 +63,7 @@ func NewGeziyor(opt *Options) *Geziyor {
 	if len(opt.RetryHTTPCodes) == 0 {
 		opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes
 	}
+
 	// Client
 	geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes)
 	if opt.Cache != nil {
@@ -82,6 +79,7 @@ func NewGeziyor(opt *Options) *Geziyor {
 	if opt.MaxRedirect != 0 {
 		geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect)
 	}
+
 	// Concurrency
 	if opt.ConcurrentRequests != 0 {
 		geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
@@ -92,9 +90,19 @@ func NewGeziyor(opt *Options) *Geziyor {
 			hostSems map[string]chan struct{}
 		}{hostSems: make(map[string]chan struct{})}
 	}
-	// Middlewares
-	geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
-	geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)
+
+	// Base Middlewares
+	metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
+	geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, metricsMiddleware)
+	geziyor.resMiddlewares = append(geziyor.resMiddlewares, metricsMiddleware)
+
+	robotsMiddleware := middleware.NewRobotsTxt(geziyor.Client, opt.RobotsTxtDisabled)
+	geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, robotsMiddleware)
+
+	// Custom Middlewares
+	geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, opt.RequestMiddlewares...)
+	geziyor.resMiddlewares = append(geziyor.resMiddlewares, opt.ResponseMiddlewares...)
+
 	// Logging
 	if opt.LogDisabled {
 		log.SetOutput(ioutil.Discard)
@@ -200,7 +208,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
 	}
 	defer g.recoverMe()
 
-	for _, middlewareFunc := range g.requestMiddlewares {
+	for _, middlewareFunc := range g.reqMiddlewares {
 		middlewareFunc.ProcessRequest(req)
 		if req.Cancelled {
 			return
@@ -213,7 +221,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
 		return
 	}
 
-	for _, middlewareFunc := range g.responseMiddlewares {
+	for _, middlewareFunc := range g.resMiddlewares {
 		middlewareFunc.ProcessResponse(res)
 	}
 
diff --git a/geziyor_test.go b/geziyor_test.go
index 41f8cd2..ff4bfbc 100644
--- a/geziyor_test.go
+++ b/geziyor_test.go
@@ -187,6 +187,16 @@ func TestConcurrentRequests(t *testing.T) {
 	}).Start()
 }
 
+func TestRobots(t *testing.T) {
+	defer leaktest.Check(t)()
+	geziyor.NewGeziyor(&geziyor.Options{
+		StartURLs: []string{"https://httpbin.org/deny"},
+		ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
+			t.Error("/deny should be blocked by robots.txt middleware")
+		},
+	}).Start()
+}
+
 // Make sure to increase open file descriptor limits before running
 func BenchmarkRequests(b *testing.B) {
diff --git a/go.mod b/go.mod
index 41b3e29..6681994 100644
--- a/go.mod
+++ b/go.mod
@@ -13,6 +13,7 @@ require (
 	github.com/pkg/errors v0.8.1
 	github.com/prometheus/client_golang v1.0.0
 	github.com/stretchr/testify v1.3.0
+	github.com/temoto/robotstxt v1.1.1
 	golang.org/x/net v0.0.0-20190522155817-f3200d17e092
 	golang.org/x/text v0.3.2
 )
diff --git a/go.sum b/go.sum
index 8ccae9f..065040a 100644
--- a/go.sum
+++ b/go.sum
@@ -69,6 +69,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
 golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
diff --git a/middleware/middleware.go b/middleware/middleware.go
index 04e27eb..948eaae 100644
--- a/middleware/middleware.go
+++ b/middleware/middleware.go
@@ -4,6 +4,7 @@ import (
 	"github.com/geziyor/geziyor/client"
 )
 
+// RequestResponseProcessor interface is for middlewares that need to process both requests and responses
 type RequestResponseProcessor interface {
 	RequestProcessor
 	ResponseProcessor
diff --git a/middleware/robotstxt.go b/middleware/robotstxt.go
new file mode 100644
index 0000000..0c12800
--- /dev/null
+++ b/middleware/robotstxt.go
@@ -0,0 +1,63 @@
+package middleware
+
+import (
+	"github.com/geziyor/geziyor/client"
+	"github.com/temoto/robotstxt"
+	"log"
+	"sync"
+)
+
+// RobotsTxt middleware filters out requests forbidden by the robots.txt exclusion standard.
+type RobotsTxt struct {
+	robotsDisabled bool
+	client         *client.Client
+	mut            sync.RWMutex
+	robotsMap      map[string]*robotstxt.RobotsData
+}
+
+func NewRobotsTxt(client *client.Client, robotsDisabled bool) RequestProcessor {
+	return &RobotsTxt{
+		robotsDisabled: robotsDisabled,
+		client:         client,
+		robotsMap:      make(map[string]*robotstxt.RobotsData),
+	}
+}
+
+func (m *RobotsTxt) ProcessRequest(r *client.Request) {
+	if m.robotsDisabled {
+		return
+	}
+
+	// TODO: Locking like this improves performance but causes duplicate requests to robots.txt.
+	m.mut.RLock()
+	robotsData, exists := m.robotsMap[r.Host]
+	m.mut.RUnlock()
+
+	if !exists {
+		// TODO: Disable retry
+		robotsReq, err := client.NewRequest("GET", r.URL.Scheme+"://"+r.Host+"/robots.txt", nil)
+		if err != nil {
+			return // Don't do anything
+		}
+
+		robotsResp, err := m.client.DoRequestClient(robotsReq)
+		if err != nil {
+			return // Don't do anything
+		}
+
+		robotsData, err = robotstxt.FromStatusAndBytes(robotsResp.StatusCode, robotsResp.Body)
+		if err != nil {
+			return // Don't do anything
+		}
+
+		m.mut.Lock()
+		m.robotsMap[r.Host] = robotsData
+		m.mut.Unlock()
+	}
+
+	if !robotsData.TestAgent(r.URL.Path, r.UserAgent()) {
+		// TODO: Forbidden requests metrics
+		log.Println("Forbidden by robots.txt:", r.URL.String())
+		r.Cancel()
+	}
+}
diff --git a/options.go b/options.go
index 8dd8fe6..83e2c41 100644
--- a/options.go
+++ b/options.go
@@ -15,48 +15,28 @@ type Options struct {
 	// If empty, any domain is allowed
 	AllowedDomains []string
 
-	// First requests will made to this url array. (Concurrently)
-	StartURLs []string
-
-	// StartRequestsFunc called on scraper start
-	StartRequestsFunc func(g *Geziyor)
-
-	// ParseFunc is callback of StartURLs response.
-	ParseFunc func(g *Geziyor, r *client.Response)
-
-	// Timeout is global request timeout
-	Timeout time.Duration
-
 	// Set this to enable caching responses.
 	// Memory Cache: httpcache.NewMemoryCache()
 	// Disk Cache: diskcache.New(".cache")
 	Cache httpcache.Cache
 
+	// Charset Detection disable
+	CharsetDetectDisabled bool
+
 	// Concurrent requests limit
 	ConcurrentRequests int
+
 	// Concurrent requests per domain limit
 	ConcurrentRequestsPerDomain int
 
-	// User Agent.
-	// Default: "Geziyor 1.0"
-	UserAgent string
-
-	// Request delays
-	RequestDelay time.Duration
-	// RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
-	RequestDelayRandomize bool
-
-	// Disable logging by setting this true
-	LogDisabled bool
+	// If set to true, cookies won't be sent.
+	CookiesDisabled bool
 
 	// For extracting data
 	Exporters []export.Exporter
 
-	// Called before requests made to manipulate requests
-	RequestMiddlewares []middleware.RequestProcessor
-
-	// Called after response received
-	ResponseMiddlewares []middleware.ResponseProcessor
+	// Disable logging by setting this true
+	LogDisabled bool
 
 	// Max body reading size in bytes. Default: 1GB
 	MaxBodySize int64
@@ -64,27 +44,53 @@ type Options struct {
 	// Maximum redirection time. Default: 10
 	MaxRedirect int
 
-	// Charset Detection disable
-	CharsetDetectDisabled bool
+	// Scraper metrics exporting type. See metrics.Type
+	MetricsType metrics.Type
 
-	// Maximum number of times to retry, in addition to the first download.
-	// Set -1 to disable retrying
-	// Default: 2
-	RetryTimes int
+	// ParseFunc is callback of StartURLs response.
+	ParseFunc func(g *Geziyor, r *client.Response)
+
+	// If true, HTML parsing is disabled to improve performance.
+	ParseHTMLDisabled bool
+
+	// Request delays
+	RequestDelay time.Duration
+
+	// RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
+	RequestDelayRandomize bool
+
+	// Called before requests are made to manipulate requests
+	RequestMiddlewares []middleware.RequestProcessor
+
+	// Called after response received
+	ResponseMiddlewares []middleware.ResponseProcessor
 
 	// Which HTTP response codes to retry.
 	// Other errors (DNS lookup issues, connections lost, etc) are always retried.
 	// Default: []int{500, 502, 503, 504, 522, 524, 408}
 	RetryHTTPCodes []int
 
-	// If true, HTML parsing is disabled to improve performance.
-	ParseHTMLDisabled bool
+	// Maximum number of times to retry, in addition to the first download.
+	// Set -1 to disable retrying
+	// Default: 2
+	RetryTimes int
+
+	// If true, disable robots.txt checks
+	RobotsTxtDisabled bool
+
+	// StartRequestsFunc called on scraper start
+	StartRequestsFunc func(g *Geziyor)
+
+	// First requests will be made to these URLs. (Concurrently)
+	StartURLs []string
+
+	// Timeout is global request timeout
+	Timeout time.Duration
 
 	// Revisiting same URLs is disabled by default
 	URLRevisitEnabled bool
 
-	// If set true, cookies won't send.
-	CookiesDisabled bool
-
-	MetricsType metrics.Type
+	// User Agent.
+	// Default: "Geziyor 1.0"
+	UserAgent string
 }
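The new RobotsTxt middleware can also be exercised directly, mirroring how NewGeziyor wires it into reqMiddlewares above. A minimal sketch, assuming illustrative constructor values and target URL; the identifiers come from this patch, but this standalone wiring is not an API the patch adds:

    // Sketch only: builds the client and middleware by hand, then checks whether
    // a request survives the robots.txt filter. httpbin.org/deny is the same URL
    // the new TestRobots test uses; constructor values mirror the documented defaults.
    package main

    import (
        "log"

        "github.com/geziyor/geziyor/client"
        "github.com/geziyor/geziyor/middleware"
    )

    func main() {
        // maxBodySize (1GB), charsetDetectDisabled, retryTimes, retryHTTPCodes
        c := client.NewClient(1024*1024*1024, false, 2, []int{500, 502, 503, 504, 522, 524, 408})

        // robotsDisabled=false: robots.txt rules are enforced
        robots := middleware.NewRobotsTxt(c, false)

        req, err := client.NewRequest("GET", "https://httpbin.org/deny", nil)
        if err != nil {
            log.Fatal(err)
        }

        robots.ProcessRequest(req)
        if req.Cancelled {
            log.Println("request cancelled: path is disallowed by robots.txt")
        }
    }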