Support for synchronized requests added. Benchmarks added.

Musab Gültekin 2019-06-28 17:28:16 +03:00
parent b000581c3d
commit 276b248ebb
5 changed files with 107 additions and 16 deletions

View File

@@ -5,9 +5,10 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
[![report card](https://goreportcard.com/badge/github.com/geziyor/geziyor)](http://goreportcard.com/report/geziyor/geziyor)
## Features
- 1.000+ Requests/Sec
- 5.000+ Requests/Sec
- JS Rendering
- Caching (Memory/Disk)
- Automatic Data Extracting (CSS Selectors)
- Automatic Data Exporting (JSON, CSV, or custom)
- Metrics (Prometheus, Expvar, or custom)
- Limit Concurrency (Global/Per Domain)
@@ -103,8 +104,8 @@ geziyor.NewGeziyor(&geziyor.Options{
### Extracting Data
#### Extractors
You can add [Extractor]() to []Extractors option to extract structured data.
Exporters need to be defined in order to extractors work.
You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor) to ```[]Extractors``` option to extract structured data.
```Exporters``` need to be defined in order for extractors to work.
```go
geziyor.NewGeziyor(&geziyor.Options{
@@ -157,12 +158,29 @@ geziyor.NewGeziyor(&geziyor.Options{
}).Start()
```
## Benchmark
**8452 requests per second** on a *MacBook Pro 15" 2016*
See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for this benchmark function:
```bash
>> go test -run none -bench . -benchtime 10s
goos: darwin
goarch: amd64
pkg: github.com/geziyor/geziyor
BenchmarkGeziyor_Do-8 200000 112493 ns/op
PASS
ok github.com/geziyor/geziyor 23.662s
```
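The benchmark relies on the new synchronized requests (see `BenchmarkGeziyor_Do` below): `Do` blocks until each request completes, so `ns/op` measures one full request/response cycle rather than goroutine dispatch. As a sanity check, 112493 ns/op works out to roughly 8,900 requests per second, in the same ballpark as the headline figure.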
## Roadmap
If you're interested in helping this project, please consider these features:
- Command line tool for pausing and resuming scrapers etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
- Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))
- ~~Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))~~
- Deploying Scrapers to Cloud
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))

View File

@@ -113,8 +113,9 @@ func (g *Geziyor) Start() {
log.Println("Scraping Started")
// Metrics
metricsServer := &stdhttp.Server{Addr: ":2112"}
if g.Opt.MetricsType == metrics.Prometheus {
metricsServer := &stdhttp.Server{Addr: ":2112"}
defer metricsServer.Close()
go func() {
stdhttp.Handle("/metrics", promhttp.Handler())
metricsServer.ListenAndServe()
@@ -144,7 +145,6 @@ func (g *Geziyor) Start() {
g.wg.Wait()
close(g.Exports)
metricsServer.Close()
log.Println("Scraping Finished")
}
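For context, a minimal snippet sketching how these metrics are enabled, using the `MetricsType` option checked above (the start URL is a placeholder); with this change, the `:2112` server is created and closed only while Prometheus metrics are enabled:

```go
geziyor.NewGeziyor(&geziyor.Options{
	StartURLs:   []string{"https://example.com"}, // placeholder URL
	MetricsType: metrics.Prometheus,              // serves /metrics on :2112
	ParseFunc:   func(g *geziyor.Geziyor, r *geziyor.Response) {},
}).Start()
```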
@@ -182,20 +182,26 @@ func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *Response)) {
// Do sends an HTTP request
func (g *Geziyor) Do(req *Request, callback func(g *Geziyor, r *Response)) {
g.wg.Add(1)
go g.do(req, callback)
if req.Synchronized {
g.do(req, callback)
} else {
g.wg.Add(1)
go g.do(req, callback)
}
}
// do sends an HTTP request
func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
g.acquireSem(req)
defer g.releaseSem(req)
defer g.wg.Done()
if !req.Synchronized {
defer g.wg.Done()
}
defer recoverMiddleware()
for _, middlewareFunc := range g.requestMiddlewares {
middlewareFunc(g, req)
if req.Cancelled {
if req.cancelled {
return
}
}

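To illustrate the new flag from the caller's side, a minimal sketch mirroring the benchmark added in this commit (the URL is a placeholder):

```go
geziyor.NewGeziyor(&geziyor.Options{
	StartRequestsFunc: func(g *geziyor.Geziyor) {
		req, _ := geziyor.NewRequest("GET", "https://example.com", nil) // placeholder URL
		req.Synchronized = true
		// Blocks until the request and its callback finish,
		// instead of being dispatched to a new goroutine.
		g.Do(req, nil)
	},
}).Start()
```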
View File

@@ -8,8 +8,12 @@ import (
"github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/exporter"
"github.com/geziyor/geziyor/extractor"
http2 "github.com/geziyor/geziyor/http"
"github.com/geziyor/geziyor/metrics"
"net/http"
"net/http/httptest"
"testing"
"unicode/utf8"
)
func TestSimple(t *testing.T) {
@@ -164,3 +168,59 @@ func TestExtractor(t *testing.T) {
Exporters: []geziyor.Exporter{&exporter.JSON{}},
}).Start()
}
func TestCharsetDetection(t *testing.T) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, "\xf0ültekin")
}))
defer ts.Close()
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{ts.URL},
ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
if !utf8.Valid(r.Body) {
t.Fatal()
}
},
CharsetDetectDisabled: false,
}).Start()
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{ts.URL},
ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
if utf8.Valid(r.Body) {
t.Fatal()
}
},
CharsetDetectDisabled: true,
}).Start()
}
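The first run asserts that, with detection enabled, the non-UTF-8 byte in the response is re-encoded into valid UTF-8; the second asserts that with `CharsetDetectDisabled: true` the raw (invalid) bytes pass through untouched.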
// Make sure to increase open file descriptor limits before running
func BenchmarkGeziyor_Do(b *testing.B) {
// Create Server
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, "Hello, client")
}))
ts.Client().Transport = http2.NewClient().Transport
defer ts.Close()
// As we don't benchmark server creation, reset the timer.
b.ResetTimer()
geziyor.NewGeziyor(&geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) {
// Create a synchronized request to benchmark requests accurately.
req, _ := geziyor.NewRequest("GET", ts.URL, nil)
req.Synchronized = true
// We only benchmark here!
for i := 0; i < b.N; i++ {
g.Do(req, nil)
}
},
URLRevisitEnabled: true,
LogDisabled: true,
}).Start()
}

View File

@@ -38,7 +38,7 @@ func recoverMiddleware() {
func allowedDomainsMiddleware(g *Geziyor, r *Request) {
if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
//log.Printf("Domain not allowed: %s\n", req.Host)
r.Cancelled = true
r.Cancel()
return
}
}
@@ -49,7 +49,7 @@ func duplicateRequestsMiddleware(g *Geziyor, r *Request) {
key := r.Request.URL.String() + r.Request.Method
if _, visited := g.visitedURLs.LoadOrStore(key, struct{}{}); visited {
//log.Printf("URL already visited %s\n", rawURL)
r.Cancelled = true
r.Cancel()
}
}
}
@@ -75,7 +75,7 @@ func delayMiddleware(g *Geziyor, r *Request) {
// logMiddleware logs requests
func logMiddleware(g *Geziyor, r *Request) {
log.Println("Fetching: ", r.URL.String())
//log.Println("Fetching: ", r.URL.String())
}
// metricsRequestMiddleware sets stats

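To make the new API concrete, a sketch of a middleware in the same shape as the ones above; `skipFragmentMiddleware` is a hypothetical illustration, not part of this commit:

```go
// skipFragmentMiddleware is a hypothetical example: cancel any request
// whose URL carries a fragment, using the new Cancel method.
func skipFragmentMiddleware(g *Geziyor, r *Request) {
	if r.URL.Fragment != "" {
		r.Cancel() // replaces the removed `r.Cancelled = true` pattern
	}
}
```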
View File

@@ -8,9 +8,16 @@ import (
// Request is a small wrapper around *http.Request that contains metadata and rendering options.
type Request struct {
*http.Request
Meta map[string]interface{}
Rendered bool
Cancelled bool
Meta map[string]interface{}
Synchronized bool
Rendered bool
cancelled bool
}
// Cancel cancels the request; remaining request middlewares and the fetch are skipped.
func (r *Request) Cancel() {
r.cancelled = true
}
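Note the visibility change: `Cancelled` was an exported field, while `cancelled` is unexported, so code outside the package can no longer set it directly and must go through `Cancel()`, as the middlewares in this commit now do.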
// NewRequest returns a new Request given a method, URL, and optional body.