Robots.txt support implemented

This commit is contained in:
Musab Gültekin 2019-07-06 16:18:03 +03:00
parent 2cab68d2ce
commit 42faa92ece
9 changed files with 154 additions and 64 deletions

View File

@ -9,12 +9,11 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
- 5.000+ Requests/Sec
- JS Rendering
- Caching (Memory/Disk)
- Automatic Data Extracting (CSS Selectors)
- Automatic Data Exporting (JSON, CSV, or custom)
- Metrics (Prometheus, Expvar, or custom)
- Limit Concurrency (Global/Per Domain)
- Request Delays (Constant/Randomized)
- Cookies and Middlewares
- Cookies, Middlewares, robots.txt
- Automatic response decoding to UTF-8
See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
@ -64,6 +63,8 @@ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for
### Installation
Go 1.12 required
go get github.com/geziyor/geziyor
**NOTE**: macOS limits the maximum number of open file descriptors.
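A minimal quick-start sketch built from the pieces above (the example URL and the printed output are illustrative, not part of the package):

```go
package main

import (
	"fmt"

	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/client"
)

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: []string{"http://quotes.toscrape.com/"},
		ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
			// A real ParseFunc would extract data here; this just reports the body size.
			fmt.Println("fetched", len(r.Body), "bytes")
		},
	}).Start()
}
```

With this commit, robots.txt checks are on by default: requests forbidden by a site's robots.txt are cancelled before ParseFunc runs, and `RobotsTxtDisabled: true` opts out.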
@ -161,7 +162,6 @@ ok github.com/geziyor/geziyor 22.861s
If you're interested in helping this project, please consider these features:
- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
- Deploying Scrapers to Cloud
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
- ~~Realtime metrics (Prometheus etc.)~~

View File

@ -74,11 +74,10 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re
// DoRequest selects appropriate request handler, client or Chrome
func (c *Client) DoRequest(req *Request) (*Response, error) {
if !req.Rendered {
return c.DoRequestClient(req)
} else {
if req.Rendered {
return c.DoRequestChrome(req)
}
return c.DoRequestClient(req)
}
// DoRequestClient is a simple wrapper to read response according to options.
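A rough sketch of this dispatch from the caller's side, assuming the NewClient signature above (the 1 GB body limit mirrors the documented default; the URL is illustrative):

```go
package main

import (
	"log"

	"github.com/geziyor/geziyor/client"
)

func main() {
	// 1 GB max body, charset detection enabled, 2 retries, default retry HTTP codes.
	c := client.NewClient(1024*1024*1024, false, 2, client.DefaultRetryHTTPCodes)

	req, err := client.NewRequest("GET", "https://example.com", nil)
	if err != nil {
		log.Fatal(err)
	}
	req.Rendered = true // DoRequest routes this through DoRequestChrome (needs Chrome available)

	res, err := c.DoRequest(req)
	if err != nil {
		log.Fatal(err)
	}
	log.Println(res.StatusCode)
}
```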

View File

@ -20,8 +20,8 @@ type Geziyor struct {
Exports chan interface{}
metrics *metrics.Metrics
requestMiddlewares []middleware.RequestProcessor
responseMiddlewares []middleware.ResponseProcessor
reqMiddlewares []middleware.RequestProcessor
resMiddlewares []middleware.ResponseProcessor
wgRequests sync.WaitGroup
wgExporters sync.WaitGroup
semGlobal chan struct{}
@ -37,23 +37,19 @@ func NewGeziyor(opt *Options) *Geziyor {
geziyor := &Geziyor{
Opt: opt,
Exports: make(chan interface{}, 1),
requestMiddlewares: []middleware.RequestProcessor{
reqMiddlewares: []middleware.RequestProcessor{
&middleware.AllowedDomains{AllowedDomains: opt.AllowedDomains},
&middleware.DuplicateRequests{RevisitEnabled: opt.URLRevisitEnabled},
&middleware.Headers{UserAgent: opt.UserAgent},
middleware.NewDelay(opt.RequestDelayRandomize, opt.RequestDelay),
},
responseMiddlewares: []middleware.ResponseProcessor{
resMiddlewares: []middleware.ResponseProcessor{
&middleware.ParseHTML{ParseHTMLDisabled: opt.ParseHTMLDisabled},
&middleware.LogStats{LogDisabled: opt.LogDisabled},
},
metrics: metrics.NewMetrics(opt.MetricsType),
}
metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, metricsMiddleware)
geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, metricsMiddleware)
// Default
if opt.UserAgent == "" {
opt.UserAgent = client.DefaultUserAgent
@ -67,6 +63,7 @@ func NewGeziyor(opt *Options) *Geziyor {
if len(opt.RetryHTTPCodes) == 0 {
opt.RetryHTTPCodes = client.DefaultRetryHTTPCodes
}
// Client
geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes)
if opt.Cache != nil {
@ -82,6 +79,7 @@ func NewGeziyor(opt *Options) *Geziyor {
if opt.MaxRedirect != 0 {
geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect)
}
// Concurrency
if opt.ConcurrentRequests != 0 {
geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
@ -92,9 +90,19 @@ func NewGeziyor(opt *Options) *Geziyor {
hostSems map[string]chan struct{}
}{hostSems: make(map[string]chan struct{})}
}
// Middlewares
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)
// Base Middlewares
metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, metricsMiddleware)
geziyor.resMiddlewares = append(geziyor.resMiddlewares, metricsMiddleware)
robotsMiddleware := middleware.NewRobotsTxt(geziyor.Client, opt.RobotsTxtDisabled)
geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, robotsMiddleware)
// Custom Middlewares
geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, opt.RequestMiddlewares...)
geziyor.resMiddlewares = append(geziyor.resMiddlewares, opt.ResponseMiddlewares...)
// Logging
if opt.LogDisabled {
log.SetOutput(ioutil.Discard)
@ -200,7 +208,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
}
defer g.recoverMe()
for _, middlewareFunc := range g.requestMiddlewares {
for _, middlewareFunc := range g.reqMiddlewares {
middlewareFunc.ProcessRequest(req)
if req.Cancelled {
return
@ -213,7 +221,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
return
}
for _, middlewareFunc := range g.responseMiddlewares {
for _, middlewareFunc := range g.resMiddlewares {
middlewareFunc.ProcessResponse(res)
}
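Custom middlewares registered through Options.RequestMiddlewares run after the base ones in this loop. A minimal sketch (the headerSetter type and header value are hypothetical):

```go
package main

import (
	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/client"
	"github.com/geziyor/geziyor/middleware"
)

// headerSetter is a hypothetical RequestProcessor that stamps every outgoing request.
type headerSetter struct{}

func (h *headerSetter) ProcessRequest(r *client.Request) {
	r.Header.Set("X-Example", "1")
}

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs:          []string{"http://quotes.toscrape.com/"},
		RequestMiddlewares: []middleware.RequestProcessor{&headerSetter{}},
		ParseFunc:          func(g *geziyor.Geziyor, r *client.Response) {},
	}).Start()
}
```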

View File

@ -187,6 +187,16 @@ func TestConcurrentRequests(t *testing.T) {
}).Start()
}
func TestRobots(t *testing.T) {
defer leaktest.Check(t)()
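// httpbin.org's robots.txt disallows /deny, so the new middleware should cancel this request before ParseFunc runs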
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"https://httpbin.org/deny"},
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
t.Error("/deny should be blocked by robots.txt middleware")
},
}).Start()
}
// Make sure to increase open file descriptor limits before running
func BenchmarkRequests(b *testing.B) {

1
go.mod
View File

@ -13,6 +13,7 @@ require (
github.com/pkg/errors v0.8.1
github.com/prometheus/client_golang v1.0.0
github.com/stretchr/testify v1.3.0
github.com/temoto/robotstxt v1.1.1
golang.org/x/net v0.0.0-20190522155817-f3200d17e092
golang.org/x/text v0.3.2
)

2
go.sum
View File

@ -69,6 +69,8 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=

View File

@ -4,6 +4,7 @@ import (
"github.com/geziyor/geziyor/client"
)
// RequestResponseProcessor interface is for middlewares that need to process both requests and responses
type RequestResponseProcessor interface {
RequestProcessor
ResponseProcessor

63
middleware/robotstxt.go Normal file
View File

@ -0,0 +1,63 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"github.com/temoto/robotstxt"
"log"
"sync"
)
// RobotsTxt middleware filters out requests forbidden by the robots.txt exclusion standard.
type RobotsTxt struct {
robotsDisabled bool
client *client.Client
mut sync.RWMutex
robotsMap map[string]*robotstxt.RobotsData
}
func NewRobotsTxt(client *client.Client, robotsDisabled bool) RequestProcessor {
return &RobotsTxt{
robotsDisabled: robotsDisabled,
client: client,
robotsMap: make(map[string]*robotstxt.RobotsData),
}
}
func (m *RobotsTxt) ProcessRequest(r *client.Request) {
if m.robotsDisabled {
return
}
// TODO: Locking like this improves performance but can cause duplicate robots.txt requests for the same host.
m.mut.RLock()
robotsData, exists := m.robotsMap[r.Host]
m.mut.RUnlock()
if !exists {
// TODO: Disable retry
robotsReq, err := client.NewRequest("GET", r.URL.Scheme+"://"+r.Host+"/robots.txt", nil)
if err != nil {
return // Don't do anything
}
robotsResp, err := m.client.DoRequestClient(robotsReq)
if err != nil {
return // Don't do anything
}
robotsData, err = robotstxt.FromStatusAndBytes(robotsResp.StatusCode, robotsResp.Body)
if err != nil {
return // Don't do anything
}
m.mut.Lock()
m.robotsMap[r.Host] = robotsData
m.mut.Unlock()
}
if !robotsData.TestAgent(r.URL.Path, r.UserAgent()) {
// TODO: Forbidden requests metrics
log.Println("Forbidden by robots.txt:", r.URL.String())
r.Cancel()
}
}
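The middleware can also be exercised on its own; a minimal sketch, assuming httpbin.org's robots.txt still disallows /deny:

```go
package main

import (
	"fmt"

	"github.com/geziyor/geziyor/client"
	"github.com/geziyor/geziyor/middleware"
)

func main() {
	c := client.NewClient(1024*1024*1024, false, 2, client.DefaultRetryHTTPCodes)
	robots := middleware.NewRobotsTxt(c, false) // false: robots.txt checks enabled

	req, err := client.NewRequest("GET", "https://httpbin.org/deny", nil)
	if err != nil {
		panic(err)
	}
	robots.ProcessRequest(req)
	fmt.Println("cancelled by robots.txt:", req.Cancelled)
}
```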

View File

@ -15,48 +15,28 @@ type Options struct {
// If empty, any domain is allowed
AllowedDomains []string
// First requests will be made to this URL array. (Concurrently)
StartURLs []string
// StartRequestsFunc is called on scraper start
StartRequestsFunc func(g *Geziyor)
// ParseFunc is the callback for StartURLs responses.
ParseFunc func(g *Geziyor, r *client.Response)
// Timeout is the global request timeout
Timeout time.Duration
// Set this to enable caching responses.
// Memory Cache: httpcache.NewMemoryCache()
// Disk Cache: diskcache.New(".cache")
Cache httpcache.Cache
// Disables charset detection
CharsetDetectDisabled bool
// Concurrent requests limit
ConcurrentRequests int
// Concurrent requests per domain limit
ConcurrentRequestsPerDomain int
// User Agent.
// Default: "Geziyor 1.0"
UserAgent string
// Request delays
RequestDelay time.Duration
// RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
RequestDelayRandomize bool
// Disable logging by setting this to true
LogDisabled bool
// If set to true, cookies won't be sent.
CookiesDisabled bool
// For exporting extracted data
Exporters []export.Exporter
// Called before requests are made, to manipulate them
RequestMiddlewares []middleware.RequestProcessor
// Called after a response is received
ResponseMiddlewares []middleware.ResponseProcessor
// Disable logging by setting this to true
LogDisabled bool
// Max body reading size in bytes. Default: 1GB
MaxBodySize int64
@ -64,27 +44,53 @@ type Options struct {
// Maximum number of redirections. Default: 10
MaxRedirect int
// Disables charset detection
CharsetDetectDisabled bool
// Scraper metrics exporting type. See metrics.Type
MetricsType metrics.Type
// Maximum number of times to retry, in addition to the first download.
// Set -1 to disable retrying
// Default: 2
RetryTimes int
// ParseFunc is the callback for StartURLs responses.
ParseFunc func(g *Geziyor, r *client.Response)
// If true, HTML parsing is disabled to improve performance.
ParseHTMLDisabled bool
// Request delays
RequestDelay time.Duration
// RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
RequestDelayRandomize bool
// Called before requests are made, to manipulate them
RequestMiddlewares []middleware.RequestProcessor
// Called after a response is received
ResponseMiddlewares []middleware.ResponseProcessor
// Which HTTP response codes to retry.
// Other errors (DNS lookup issues, connections lost, etc) are always retried.
// Default: []int{500, 502, 503, 504, 522, 524, 408}
RetryHTTPCodes []int
// If true, HTML parsing is disabled to improve performance.
ParseHTMLDisabled bool
// Maximum number of times to retry, in addition to the first download.
// Set -1 to disable retrying
// Default: 2
RetryTimes int
// If true, disables robots.txt checks
RobotsTxtDisabled bool
// StartRequestsFunc is called on scraper start
StartRequestsFunc func(g *Geziyor)
// First requests will be made to this URL array. (Concurrently)
StartURLs []string
// Timeout is the global request timeout
Timeout time.Duration
// Revisiting the same URLs is disabled by default
URLRevisitEnabled bool
// If set to true, cookies won't be sent.
CookiesDisabled bool
MetricsType metrics.Type
// User Agent.
// Default: "Geziyor 1.0"
UserAgent string
}
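Putting several of these options together; a sketch only (the httpcache import path and the empty callback are assumptions, not part of this diff):

```go
package main

import (
	"time"

	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/client"
	"github.com/gregjones/httpcache"
)

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs:                   []string{"http://quotes.toscrape.com/"},
		ParseFunc:                   func(g *geziyor.Geziyor, r *client.Response) {},
		Cache:                       httpcache.NewMemoryCache(),
		ConcurrentRequests:          32,
		ConcurrentRequestsPerDomain: 8,
		RequestDelay:                time.Second,
		RequestDelayRandomize:       true,
		RobotsTxtDisabled:           false, // robots.txt checks stay on (the default)
	}).Start()
}
```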