From 276b248ebba5dbe983dad4a2b8ac6225047a418d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Fri, 28 Jun 2019 17:28:16 +0300 Subject: [PATCH] Synchronized requests support added. Benchmarks added. --- README.md | 26 +++++++++++++++++---- geziyor.go | 18 ++++++++++----- geziyor_test.go | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ middleware.go | 6 ++--- request.go | 13 ++++++++--- 5 files changed, 107 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index edbe621..fc2d2a1 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,10 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use [![report card](https://goreportcard.com/badge/github.com/geziyor/geziyor)](http://goreportcard.com/report/geziyor/geziyor) ## Features -- 1.000+ Requests/Sec +- 5.000+ Requests/Sec - JS Rendering - Caching (Memory/Disk) +- Automatic Data Extracting (CSS Selectors) - Automatic Data Exporting (JSON, CSV, or custom) - Metrics (Prometheus, Expvar, or custom) - Limit Concurrency (Global/Per Domain) @@ -103,8 +104,8 @@ geziyor.NewGeziyor(&geziyor.Options{ ### Extracting Data #### Extractors -You can add [Extractor]() to []Extractors option to extract structured data. -Exporters need to be defined in order to extractors work. +You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor) to ```[]Extractors``` option to extract structured data. +```Exporters``` need to be defined in order for extractors to work. ```go geziyor.NewGeziyor(&geziyor.Options{ @@ -157,12 +158,29 @@ geziyor.NewGeziyor(&geziyor.Options{ }).Start() ``` +## Benchmark + +**8452 requests per second** on *MacBook Pro 15" 2016* + +See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for this benchmark function: + +```bash +>> go test -run none -bench .
-benchtime 10s +goos: darwin +goarch: amd64 +pkg: github.com/geziyor/geziyor +BenchmarkGeziyor_Do-8 200000 112493 ns/op + +PASS +ok github.com/geziyor/geziyor 23.662s +``` + ## Roadmap If you're interested in helping this project, please consider these features: - Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html)) -- Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape)) +- ~~Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))~~ - Deploying Scrapers to Cloud - ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~ - Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html)) diff --git a/geziyor.go b/geziyor.go index 762b579..7e86916 100644 --- a/geziyor.go +++ b/geziyor.go @@ -113,8 +113,9 @@ func (g *Geziyor) Start() { log.Println("Scraping Started") // Metrics - metricsServer := &stdhttp.Server{Addr: ":2112"} if g.Opt.MetricsType == metrics.Prometheus { + metricsServer := &stdhttp.Server{Addr: ":2112"} + defer metricsServer.Close() go func() { stdhttp.Handle("/metrics", promhttp.Handler()) metricsServer.ListenAndServe() @@ -144,7 +145,6 @@ func (g *Geziyor) Start() { g.wg.Wait() close(g.Exports) - metricsServer.Close() log.Println("Scraping Finished") } @@ -182,20 +182,26 @@ func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *Response)) { // Do sends an HTTP request func (g *Geziyor) Do(req *Request, callback func(g *Geziyor, r *Response)) { - g.wg.Add(1) - go g.do(req, callback) + if req.Synchronized { + g.do(req, callback) + } else { + g.wg.Add(1) + go g.do(req, callback) + } } // Do sends an HTTP request func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) { g.acquireSem(req) defer g.releaseSem(req) - defer g.wg.Done() + if !req.Synchronized { + defer g.wg.Done() + } defer recoverMiddleware() 
for _, middlewareFunc := range g.requestMiddlewares { middlewareFunc(g, req) - if req.Cancelled { + if req.cancelled { return } } diff --git a/geziyor_test.go b/geziyor_test.go index 6f240d2..f1ec588 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -8,8 +8,12 @@ import ( "github.com/geziyor/geziyor" "github.com/geziyor/geziyor/exporter" "github.com/geziyor/geziyor/extractor" + http2 "github.com/geziyor/geziyor/http" "github.com/geziyor/geziyor/metrics" + "net/http" + "net/http/httptest" "testing" + "unicode/utf8" ) func TestSimple(t *testing.T) { @@ -164,3 +168,59 @@ func TestExtractor(t *testing.T) { Exporters: []geziyor.Exporter{&exporter.JSON{}}, }).Start() } + +func TestCharsetDetection(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprint(w, "\xf0ültekin") + })) + defer ts.Close() + + geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{ts.URL}, + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { + if !utf8.Valid(r.Body) { + t.Fatal() + } + }, + CharsetDetectDisabled: false, + }).Start() + + geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{ts.URL}, + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { + if utf8.Valid(r.Body) { + t.Fatal() + } + }, + CharsetDetectDisabled: true, + }).Start() +} + +// Make sure to increase open file descriptor limits before running +func BenchmarkGeziyor_Do(b *testing.B) { + + // Create Server + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprint(w, "Hello, client") + })) + ts.Client().Transport = http2.NewClient().Transport + defer ts.Close() + + // As we don't benchmark creating a server, reset timer. + b.ResetTimer() + + geziyor.NewGeziyor(&geziyor.Options{ + StartRequestsFunc: func(g *geziyor.Geziyor) { + // Create Synchronized request to benchmark requests accurately. + req, _ := geziyor.NewRequest("GET", ts.URL, nil) + req.Synchronized = true + + // We only bench here ! 
+ for i := 0; i < b.N; i++ { + g.Do(req, nil) + } + }, + URLRevisitEnabled: true, + LogDisabled: true, + }).Start() +} diff --git a/middleware.go b/middleware.go index 487c294..9d7053b 100644 --- a/middleware.go +++ b/middleware.go @@ -38,7 +38,7 @@ func recoverMiddleware() { func allowedDomainsMiddleware(g *Geziyor, r *Request) { if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) { //log.Printf("Domain not allowed: %s\n", req.Host) - r.Cancelled = true + r.Cancel() return } } @@ -49,7 +49,7 @@ func duplicateRequestsMiddleware(g *Geziyor, r *Request) { key := r.Request.URL.String() + r.Request.Method if _, visited := g.visitedURLs.LoadOrStore(key, struct{}{}); visited { //log.Printf("URL already visited %s\n", rawURL) - r.Cancelled = true + r.Cancel() } } } @@ -75,7 +75,7 @@ func delayMiddleware(g *Geziyor, r *Request) { // logMiddleware logs requests func logMiddleware(g *Geziyor, r *Request) { - log.Println("Fetching: ", r.URL.String()) + //log.Println("Fetching: ", r.URL.String()) } // metricsRequestMiddleware sets stats diff --git a/request.go b/request.go index b6d7331..b3e873b 100644 --- a/request.go +++ b/request.go @@ -8,9 +8,16 @@ import ( // Request is a small wrapper around *http.Request that contains Metadata and Rendering option type Request struct { *http.Request - Meta map[string]interface{} - Rendered bool - Cancelled bool + Meta map[string]interface{} + Synchronized bool + Rendered bool + + cancelled bool +} + +// Cancel request +func (r *Request) Cancel() { + r.cancelled = true } // NewRequest returns a new Request given a method, URL, and optional body.