Synchronized requests support added. Benchmarks added.

This commit is contained in:
Musab Gültekin 2019-06-28 17:28:16 +03:00
parent b000581c3d
commit 276b248ebb
5 changed files with 107 additions and 16 deletions

View File

@ -5,9 +5,10 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
[![report card](https://goreportcard.com/badge/github.com/geziyor/geziyor)](http://goreportcard.com/report/geziyor/geziyor) [![report card](https://goreportcard.com/badge/github.com/geziyor/geziyor)](http://goreportcard.com/report/geziyor/geziyor)
## Features ## Features
- 1.000+ Requests/Sec - 5.000+ Requests/Sec
- JS Rendering - JS Rendering
- Caching (Memory/Disk) - Caching (Memory/Disk)
- Automatic Data Extracting (CSS Selectors)
- Automatic Data Exporting (JSON, CSV, or custom) - Automatic Data Exporting (JSON, CSV, or custom)
- Metrics (Prometheus, Expvar, or custom) - Metrics (Prometheus, Expvar, or custom)
- Limit Concurrency (Global/Per Domain) - Limit Concurrency (Global/Per Domain)
@ -103,8 +104,8 @@ geziyor.NewGeziyor(&geziyor.Options{
### Extracting Data ### Extracting Data
#### Extractors #### Extractors
You can add [Extractor]() to []Extractors option to extract structured data. You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor) to ```[]Extractors``` option to extract structured data.
Exporters need to be defined in order to extractors work. ```Exporters``` need to be defined in order for extractors to work.
```go ```go
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
@ -157,12 +158,29 @@ geziyor.NewGeziyor(&geziyor.Options{
}).Start() }).Start()
``` ```
## Benchmark
**8452 requests per second** on *MacBook Pro 15" 2016*
See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for this benchmark function:
```bash
>> go test -run none -bench . -benchtime 10s
goos: darwin
goarch: amd64
pkg: github.com/geziyor/geziyor
BenchmarkGeziyor_Do-8 200000 112493 ns/op
PASS
ok github.com/geziyor/geziyor 23.662s
```
## Roadmap ## Roadmap
If you're interested in helping this project, please consider these features: If you're interested in helping this project, please consider these features:
- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html)) - Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
- Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape)) - ~~Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))~~
- Deploying Scrapers to Cloud - Deploying Scrapers to Cloud
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~ - ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html)) - Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))

View File

@ -113,8 +113,9 @@ func (g *Geziyor) Start() {
log.Println("Scraping Started") log.Println("Scraping Started")
// Metrics // Metrics
metricsServer := &stdhttp.Server{Addr: ":2112"}
if g.Opt.MetricsType == metrics.Prometheus { if g.Opt.MetricsType == metrics.Prometheus {
metricsServer := &stdhttp.Server{Addr: ":2112"}
defer metricsServer.Close()
go func() { go func() {
stdhttp.Handle("/metrics", promhttp.Handler()) stdhttp.Handle("/metrics", promhttp.Handler())
metricsServer.ListenAndServe() metricsServer.ListenAndServe()
@ -144,7 +145,6 @@ func (g *Geziyor) Start() {
g.wg.Wait() g.wg.Wait()
close(g.Exports) close(g.Exports)
metricsServer.Close()
log.Println("Scraping Finished") log.Println("Scraping Finished")
} }
@ -182,20 +182,26 @@ func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *Response)) {
// Do sends an HTTP request // Do sends an HTTP request
func (g *Geziyor) Do(req *Request, callback func(g *Geziyor, r *Response)) { func (g *Geziyor) Do(req *Request, callback func(g *Geziyor, r *Response)) {
if req.Synchronized {
g.do(req, callback)
} else {
g.wg.Add(1) g.wg.Add(1)
go g.do(req, callback) go g.do(req, callback)
} }
}
// Do sends an HTTP request // Do sends an HTTP request
func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) { func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
g.acquireSem(req) g.acquireSem(req)
defer g.releaseSem(req) defer g.releaseSem(req)
if !req.Synchronized {
defer g.wg.Done() defer g.wg.Done()
}
defer recoverMiddleware() defer recoverMiddleware()
for _, middlewareFunc := range g.requestMiddlewares { for _, middlewareFunc := range g.requestMiddlewares {
middlewareFunc(g, req) middlewareFunc(g, req)
if req.Cancelled { if req.cancelled {
return return
} }
} }

View File

@ -8,8 +8,12 @@ import (
"github.com/geziyor/geziyor" "github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/exporter" "github.com/geziyor/geziyor/exporter"
"github.com/geziyor/geziyor/extractor" "github.com/geziyor/geziyor/extractor"
http2 "github.com/geziyor/geziyor/http"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
"net/http"
"net/http/httptest"
"testing" "testing"
"unicode/utf8"
) )
func TestSimple(t *testing.T) { func TestSimple(t *testing.T) {
@ -164,3 +168,59 @@ func TestExtractor(t *testing.T) {
Exporters: []geziyor.Exporter{&exporter.JSON{}}, Exporters: []geziyor.Exporter{&exporter.JSON{}},
}).Start() }).Start()
} }
// TestCharsetDetection checks the CharsetDetectDisabled option against a
// server whose response body is not valid UTF-8.
//
// With detection enabled the body handed to ParseFunc is expected to be valid
// UTF-8; with detection disabled the body is expected to stay invalid.
func TestCharsetDetection(t *testing.T) {
	// The body starts with a lone 0xF0 byte that is not followed by UTF-8
	// continuation bytes, so the raw payload is not valid UTF-8.
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, "\xf0ültekin")
	}))
	defer ts.Close()

	// ParseFunc is invoked by the crawler, not directly by this function, so
	// failures are reported with t.Error: t.Fatal/FailNow may only be called
	// from the goroutine running the test.
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: []string{ts.URL},
		ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
			if !utf8.Valid(r.Body) {
				t.Error("charset detection enabled: response body is not valid UTF-8")
			}
		},
		CharsetDetectDisabled: false,
	}).Start()

	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: []string{ts.URL},
		ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
			if utf8.Valid(r.Body) {
				t.Error("charset detection disabled: response body was unexpectedly valid UTF-8")
			}
		},
		CharsetDetectDisabled: true,
	}).Start()
}
// BenchmarkGeziyor_Do measures the cost of issuing b.N synchronized GET
// requests against a local httptest server.
//
// Make sure to increase open file descriptor limits before running.
func BenchmarkGeziyor_Do(b *testing.B) {
	// Create Server
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, "Hello, client")
	}))
	ts.Client().Transport = http2.NewClient().Transport
	defer ts.Close()

	// As we don't benchmark creating a server, reset timer.
	b.ResetTimer()

	geziyor.NewGeziyor(&geziyor.Options{
		StartRequestsFunc: func(g *geziyor.Geziyor) {
			// Create Synchronized request to benchmark requests accurately:
			// Do runs a synchronized request inline instead of on a new goroutine.
			req, err := geziyor.NewRequest("GET", ts.URL, nil)
			if err != nil {
				// Without a request there is nothing to measure; report the
				// failure instead of dereferencing an unusable request below.
				b.Errorf("creating benchmark request: %v", err)
				return
			}
			req.Synchronized = true

			// We only bench here !
			for i := 0; i < b.N; i++ {
				g.Do(req, nil)
			}
		},
		// Presumably required so the repeated request to the same URL is not
		// dropped as a duplicate — confirm against duplicateRequestsMiddleware.
		URLRevisitEnabled: true,
		LogDisabled:       true,
	}).Start()
}

View File

@ -38,7 +38,7 @@ func recoverMiddleware() {
func allowedDomainsMiddleware(g *Geziyor, r *Request) { func allowedDomainsMiddleware(g *Geziyor, r *Request) {
if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) { if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
//log.Printf("Domain not allowed: %s\n", req.Host) //log.Printf("Domain not allowed: %s\n", req.Host)
r.Cancelled = true r.Cancel()
return return
} }
} }
@ -49,7 +49,7 @@ func duplicateRequestsMiddleware(g *Geziyor, r *Request) {
key := r.Request.URL.String() + r.Request.Method key := r.Request.URL.String() + r.Request.Method
if _, visited := g.visitedURLs.LoadOrStore(key, struct{}{}); visited { if _, visited := g.visitedURLs.LoadOrStore(key, struct{}{}); visited {
//log.Printf("URL already visited %s\n", rawURL) //log.Printf("URL already visited %s\n", rawURL)
r.Cancelled = true r.Cancel()
} }
} }
} }
@ -75,7 +75,7 @@ func delayMiddleware(g *Geziyor, r *Request) {
// logMiddleware logs requests // logMiddleware logs requests
func logMiddleware(g *Geziyor, r *Request) { func logMiddleware(g *Geziyor, r *Request) {
log.Println("Fetching: ", r.URL.String()) //log.Println("Fetching: ", r.URL.String())
} }
// metricsRequestMiddleware sets stats // metricsRequestMiddleware sets stats

View File

@ -9,8 +9,15 @@ import (
type Request struct { type Request struct {
*http.Request *http.Request
Meta map[string]interface{} Meta map[string]interface{}
Synchronized bool
Rendered bool Rendered bool
Cancelled bool
cancelled bool
}
// Cancel request
func (r *Request) Cancel() {
r.cancelled = true
} }
// NewRequest returns a new Request given a method, URL, and optional body. // NewRequest returns a new Request given a method, URL, and optional body.