From 7b23596a2ddf245c2d5fb999a5332707266d3b9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Sat, 15 Jun 2019 17:55:40 +0300 Subject: [PATCH] Middleware support added. HTML Parsing disable option added. Goroutine leaks will be tested using leaktest lib. --- .travis.yml | 3 +++ README.md | 2 +- exporter/csv.go | 11 +++++------ geziyor.go | 26 +++++++++++++++++--------- geziyor_test.go | 5 ++++- go.mod | 1 + go.sum | 2 ++ options.go | 6 ++++++ request.go | 14 ++++++++++++++ 9 files changed, 53 insertions(+), 17 deletions(-) diff --git a/.travis.yml b/.travis.yml index c63c570..e8b3279 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,9 @@ go: - 1.11.x - tip +env: + - GO111MODULE=on + before_install: - go get -t -v ./... diff --git a/README.md b/README.md index 9a60798..a3bcd1a 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings. ## Status -Since the project is in **development phase**, **API may change in time**. Also, we highly recommend you to use Geziyor with go modules. +Since the project is in **development phase**, **API may change in time**. Thus, we highly recommend you to use Geziyor with go modules. ## Usage Simple usage diff --git a/exporter/csv.go b/exporter/csv.go index 4bf0a4a..2712b52 100644 --- a/exporter/csv.go +++ b/exporter/csv.go @@ -49,12 +49,11 @@ func (e *CSVExporter) Export(response *geziyor.Response) { values = append(values, fmt.Sprint(val.Index(i))) } - // TODO: Map type support is incomplete. Ordering is wrong. Needs to be sorted by map keys (CSV headers). - case reflect.Map: - iter := val.MapRange() - for iter.Next() { - values = append(values, fmt.Sprint(iter.Value())) - } + //case reflect.Map: + // iter := val.MapRange() + // for iter.Next() { + // values = append(values, fmt.Sprint(iter.Value())) + // } } // Write to file diff --git a/geziyor.go b/geziyor.go index 05da58c..c55bfaa 100644 --- a/geziyor.go +++ b/geziyor.go @@ -23,9 +23,13 @@ import ( // Exporter interface is for extracting data to external resources type Exporter interface { - Export(response *Response) + Export(r *Response) } +// RequestMiddleware called before requests made. +// Set request.Cancelled = true to cancel request +type RequestMiddleware func(g *Geziyor, r *Request) + // Geziyor is our main scraper type type Geziyor struct { Opt Options @@ -41,6 +45,7 @@ type Geziyor struct { sync.RWMutex visitedURLS []string } + requestMiddlewaresBase []RequestMiddleware } func init() { @@ -68,7 +73,8 @@ func NewGeziyor(opt Options) *Geziyor { }, Timeout: time.Second * 180, // Google's timeout }, - Opt: opt, + Opt: opt, + requestMiddlewaresBase: []RequestMiddleware{defaultHeadersMiddleware}, } if opt.Cache != nil { @@ -168,6 +174,14 @@ func (g *Geziyor) do(req *Request, callback func(resp *Response)) { return } + // Request Middlewares + for _, middlewareFunc := range g.requestMiddlewaresBase { + middlewareFunc(g, req) + } + for _, middlewareFunc := range g.Opt.RequestMiddlewares { + middlewareFunc(g, req) + } + // Do request normal or Chrome and read response var response *Response var err error @@ -180,7 +194,7 @@ func (g *Geziyor) do(req *Request, callback func(resp *Response)) { return } - if response.isHTML() { + if !g.Opt.ParseHTMLDisabled && response.isHTML() { response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body)) } @@ -216,12 +230,6 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) { g.delay() - // Modify Request - req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") - req.Header.Set("Accept-Charset", "utf-8") - req.Header.Set("Accept-Language", "en") - req.Header.Set("User-Agent", g.Opt.UserAgent) - log.Println("Fetching: ", req.URL.String()) // Do request diff --git a/geziyor_test.go b/geziyor_test.go index 0e20d2b..56470fe 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -3,6 +3,7 @@ package geziyor_test import ( "fmt" "github.com/PuerkitoBio/goquery" + "github.com/fortytw2/leaktest" "github.com/fpfeng/httpcache" "github.com/geziyor/geziyor" "github.com/geziyor/geziyor/exporter" @@ -21,6 +22,7 @@ func TestSimple(t *testing.T) { } func TestSimpleCache(t *testing.T) { + defer leaktest.Check(t)() gez := geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://api.ipify.org"}, Cache: httpcache.NewMemoryCache(), @@ -34,6 +36,7 @@ func TestSimpleCache(t *testing.T) { } func TestQuotes(t *testing.T) { + defer leaktest.Check(t)() geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://quotes.toscrape.com/"}, ParseFunc: quotesParse, @@ -56,7 +59,7 @@ func quotesParse(r *geziyor.Response) { // Next Page if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { - go r.Geziyor.Get(r.JoinURL(href), quotesParse) + r.Geziyor.Get(r.JoinURL(href), quotesParse) } } diff --git a/go.mod b/go.mod index cf03234..6f13bd8 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/PuerkitoBio/goquery v1.5.0 github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 github.com/chromedp/chromedp v0.3.0 + github.com/fortytw2/leaktest v1.3.0 github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 golang.org/x/net v0.0.0-20190522155817-f3200d17e092 golang.org/x/text v0.3.2 // indirect diff --git a/go.sum b/go.sum index 5a38595..1ea7791 100644 --- a/go.sum +++ b/go.sum @@ -6,6 +6,8 @@ github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 h1:ARnDd2vEk91rLN github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9/go.mod h1:xquOK9dIGFlLaIGI4c6IyfLI/Gz0LiYYuJtzhsUODgI= github.com/chromedp/chromedp v0.3.0 h1:7/pwrXFRq6/ym3sxCykm90DMoyw6VKXY48DgGRgUURA= github.com/chromedp/chromedp v0.3.0/go.mod h1:EktsZcC2iycVrRhC9fDmshBpCK9lNnZYi6x2q9uE7zI= +github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= +github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8= github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0= diff --git a/options.go b/options.go index 7aa5523..16a1e5e 100644 --- a/options.go +++ b/options.go @@ -47,12 +47,18 @@ type Options struct { // For extracting data Exporters []Exporter + // Called before requests made to manipulate requests + RequestMiddlewares []RequestMiddleware + // Max body reading size in bytes MaxBodySize int64 // Charset Detection disable CharsetDetectDisabled bool + // If true, HTML parsing is disabled to improve performance. + ParseHTMLDisabled bool + // Revisiting same URLs is disabled by default URLRevisitEnabled bool } diff --git a/request.go b/request.go index 43903fe..c472f12 100644 --- a/request.go +++ b/request.go @@ -10,3 +10,17 @@ type Request struct { Meta map[string]interface{} Rendered bool } + +func defaultHeadersMiddleware(g *Geziyor, r *Request) { + r.Header = headerSetDefault(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + r.Header = headerSetDefault(r.Header, "Accept-Charset", "utf-8") + r.Header = headerSetDefault(r.Header, "Accept-Language", "en") + r.Header = headerSetDefault(r.Header, "User-Agent", g.Opt.UserAgent) +} + +func headerSetDefault(header http.Header, key string, value string) http.Header { + if header.Get(key) == "" { + header.Set(key, value) + } + return header +}