From c0dd0393e660043c585e74970d2f6e0edd005d86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Mon, 1 Jul 2019 15:44:28 +0300 Subject: [PATCH] Maximum redirection option added. Performance improvement on exports. Duplicate requests only checked on GET requests. --- README.md | 21 ++++++++++----------- client/client.go | 10 ++++++++++ geziyor.go | 5 ++++- geziyor_test.go | 43 ++++++++++++++++++++++++++++++++++++++++++- middleware.go | 5 ++--- options.go | 3 +++ 6 files changed, 71 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 3c4fc3b..2f6c1b9 100644 --- a/README.md +++ b/README.md @@ -111,13 +111,13 @@ You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor) geziyor.NewGeziyor(&geziyor.Options{ StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, Extractors: []geziyor.Extractor{ - &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"}, - &extract.Text{Name: "title", Selector: ".c-page-title"}, - &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"}, - &extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"}, - &extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"}, - &extract.Text{Name: "summary", Selector: ".c-entry-summary"}, - &extract.Text{Name: "content", Selector: ".c-entry-content"}, + &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"}, + &extract.Text{Name: "title", Selector: ".c-page-title"}, + &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"}, + &extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"}, + &extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"}, + &extract.Text{Name: "summary", Selector: ".c-entry-summary"}, + &extract.Text{Name: "content", Selector: ".c-entry-content"}, }, Exporters: 
[]geziyor.Exporter{&export.JSON{}}, }).Start() @@ -168,14 +168,13 @@ geziyor.NewGeziyor(&geziyor.Options{ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for this benchmark function: ```bash ->> go test -run none -bench . -benchtime 10s +>> go test -run none -bench Requests -benchtime 10s goos: darwin goarch: amd64 pkg: github.com/geziyor/geziyor -BenchmarkGeziyor_Do-8 200000 112493 ns/op - +BenchmarkRequests-8 200000 108710 ns/op PASS -ok github.com/geziyor/geziyor 23.662s +ok github.com/geziyor/geziyor 22.861s ``` ## Roadmap diff --git a/client/client.go b/client/client.go index c36c07a..50dc712 100644 --- a/client/client.go +++ b/client/client.go @@ -207,3 +207,13 @@ func ConvertMapToHeader(m map[string]interface{}) http.Header { } return header } + +// NewRedirectionHandler returns maximum allowed redirection function with provided maxRedirect +func NewRedirectionHandler(maxRedirect int) func(req *http.Request, via []*http.Request) error { + return func(req *http.Request, via []*http.Request) error { + if len(via) >= maxRedirect { + return errors.Errorf("stopped after %d redirects", maxRedirect) + } + return nil + } +} diff --git a/geziyor.go b/geziyor.go index 46efe5f..4967c4f 100644 --- a/geziyor.go +++ b/geziyor.go @@ -48,7 +48,7 @@ func NewGeziyor(opt *Options) *Geziyor { geziyor := &Geziyor{ Client: client.NewClient(), Opt: opt, - Exports: make(chan interface{}), + Exports: make(chan interface{}, 1), requestMiddlewares: []RequestMiddleware{ allowedDomainsMiddleware, duplicateRequestsMiddleware, @@ -81,6 +81,9 @@ func NewGeziyor(opt *Options) *Geziyor { if !opt.CookiesDisabled { geziyor.Client.Jar, _ = cookiejar.New(nil) } + if opt.MaxRedirect != 0 { + geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect) + } if opt.ConcurrentRequests != 0 { geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests) } diff --git a/geziyor_test.go b/geziyor_test.go index ebc01a5..f36a8ad 100644 --- a/geziyor_test.go +++ 
b/geziyor_test.go @@ -202,8 +202,29 @@ func TestCharsetDetection(t *testing.T) { }).Start() } +func TestRedirect(t *testing.T) { + defer leaktest.Check(t)() + geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{"https://httpbin.org/absolute-redirect/1"}, + ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { + //t.Fail() + }, + MaxRedirect: -1, + }).Start() + + geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{"https://httpbin.org/absolute-redirect/1"}, + ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { + if r.StatusCode == 302 { + t.Fail() + } + }, + MaxRedirect: 0, + }).Start() +} + // Make sure to increase open file descriptor limits before running -func BenchmarkGeziyor_Do(b *testing.B) { +func BenchmarkRequests(b *testing.B) { // Create Server ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -230,3 +251,23 @@ func BenchmarkGeziyor_Do(b *testing.B) { LogDisabled: true, }).Start() } + +func BenchmarkWhole(b *testing.B) { + for i := 0; i < b.N; i++ { + geziyor.NewGeziyor(&geziyor.Options{ + AllowedDomains: []string{"quotes.toscrape.com"}, + StartURLs: []string{"http://quotes.toscrape.com/"}, + ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { + g.Exports <- []string{r.Request.URL.String()} + r.HTMLDoc.Find("a").Each(func(i int, s *goquery.Selection) { + if href, ok := s.Attr("href"); ok { + g.Get(r.JoinURL(href), g.Opt.ParseFunc) + } + }) + }, + Exporters: []geziyor.Exporter{&export.CSV{}}, + //MetricsType: metrics.Prometheus, + LogDisabled: true, + }).Start() + } +} diff --git a/middleware.go b/middleware.go index 58e7a60..981b834 100644 --- a/middleware.go +++ b/middleware.go @@ -46,9 +46,8 @@ func allowedDomainsMiddleware(g *Geziyor, r *client.Request) { // duplicateRequestsMiddleware checks for already visited URLs func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) { - if !g.Opt.URLRevisitEnabled { - key := r.Request.URL.String() + r.Request.Method - if _, visited := 
g.visitedURLs.LoadOrStore(key, struct{}{}); visited { + if !g.Opt.URLRevisitEnabled && r.Request.Method == "GET" { + if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited { //log.Printf("URL already visited %s\n", rawURL) r.Cancel() } diff --git a/options.go b/options.go index 432bc6f..285c64c 100644 --- a/options.go +++ b/options.go @@ -61,6 +61,9 @@ type Options struct { // Max body reading size in bytes. Default: 1GB MaxBodySize int64 + // Maximum number of redirects to follow. Default: 10 + MaxRedirect int + // Charset Detection disable CharsetDetectDisabled bool