From d967555b626869736109948cf00ef6158ad1ca6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Musab=20G=C3=BCltekin?=
Date: Sun, 9 Jun 2019 11:53:40 +0300
Subject: [PATCH] Global and Domain Concurrency limit implemented. Updated README

---
 README.md       | 19 +++++++++++++--
 geziyor.go      | 62 ++++++++++++++++++++++++++++++++++++++++++++-----
 geziyor_test.go |  2 +-
 options.go      |  6 +++++
 4 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 0c2d6c6..07793ca 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Geziyor
-Geziyor is a fast web crawling and web scraping framework, used to crawl websites and extract structured data from their pages. It can be used for a wide range of purposes, from data mining to monitoring and automated testing.
+Geziyor is a blazing fast web crawling and web scraping framework, used to crawl websites and extract structured data from their pages. It can be used for a wide range of purposes, from data mining to monitoring and automated testing.
 
 [![GoDoc](https://godoc.org/github.com/geziyor/geziyor?status.svg)](https://godoc.org/github.com/geziyor/geziyor)
 [![report card](https://goreportcard.com/badge/github.com/geziyor/geziyor)](http://goreportcard.com/report/geziyor/geziyor)
@@ -8,9 +8,24 @@ Geziyor is a fast web crawling and web scraping framework, used to crawl website
 - 1.000+ Requests/Sec
 - Caching
 - Automatic Data Exporting
+- Limit Concurrency Global/Per Domain
+- Automatic response decoding to UTF-8
 
-## Example
+## Usage
+Simplest usage
+
+```go
+geziyor.NewGeziyor(geziyor.Options{
+	StartURLs: []string{"http://api.ipify.org"},
+	ParseFunc: func(r *geziyor.Response) {
+		fmt.Println(r.Doc.Text())
+	},
+}).Start()
+```
+
+Export all quotes and authors to out.json file.
+
 ```go
 geziyor := NewGeziyor(Opt{
 	StartURLs: []string{"http://quotes.toscrape.com/"},
diff --git a/geziyor.go b/geziyor.go
index 423ef17..dec97d2 100644
--- a/geziyor.go
+++ b/geziyor.go
@@ -21,6 +21,11 @@ type Geziyor struct {
 	opt Options
 
 	visitedURLS []string
+	semGlobal   chan struct{}
+	semHosts    struct {
+		sync.RWMutex
+		hostSems map[string]chan struct{}
+	}
 }
 
 func init() {
@@ -43,6 +48,15 @@ func NewGeziyor(opt Options) *Geziyor {
 	if opt.Timeout != 0 {
 		geziyor.client.Timeout = opt.Timeout
 	}
+	if opt.ConcurrentRequests != 0 {
+		geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
+	}
+	if opt.ConcurrentRequestsPerDomain != 0 {
+		geziyor.semHosts = struct {
+			sync.RWMutex
+			hostSems map[string]chan struct{}
+		}{hostSems: make(map[string]chan struct{})}
+	}
 
 	return geziyor
 }
@@ -86,12 +100,15 @@ func (g *Geziyor) Do(req *http.Request) {
 		return
 	}
 
-	// Log
-	log.Println("Fetching: ", req.URL.String())
-
 	// Modify Request
 	req.Header.Set("Accept-Charset", "utf-8")
 
+	// Acquire Semaphore
+	g.acquire(req)
+
+	// Log
+	log.Println("Fetching: ", req.URL.String())
+
 	// Do request
 	resp, err := g.client.Do(req)
 	if resp != nil {
@@ -99,6 +116,7 @@
 	}
 	if err != nil {
 		log.Printf("Response error: %v\n", err)
+		g.release(req)
 		return
 	}
 
@@ -106,6 +124,7 @@
 	reader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
 	if err != nil {
 		log.Printf("Determine encoding error: %v\n", err)
+		g.release(req)
 		return
 	}
 
@@ -113,9 +132,13 @@
 	body, err := ioutil.ReadAll(reader)
 	if err != nil {
 		log.Printf("Reading Body error: %v\n", err)
+		g.release(req)
 		return
 	}
 
+	// Release Semaphore
+	g.release(req)
+
 	// Create Document
 	doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
 
@@ -136,18 +159,45 @@ func (g *Geziyor) Do(req *http.Request) {
 	time.Sleep(time.Millisecond)
 }
 
+func (g *Geziyor) acquire(req *http.Request) {
+	if g.opt.ConcurrentRequests != 0 {
+		g.semGlobal <- struct{}{}
+	}
+
+	if g.opt.ConcurrentRequestsPerDomain != 0 {
+		g.semHosts.RLock()
+		hostSem, exists := g.semHosts.hostSems[req.Host]
+		g.semHosts.RUnlock()
+		if !exists {
+			hostSem = make(chan struct{}, g.opt.ConcurrentRequestsPerDomain)
+			g.semHosts.Lock()
+			g.semHosts.hostSems[req.Host] = hostSem
+			g.semHosts.Unlock()
+		}
+		hostSem <- struct{}{}
+	}
+}
+
+func (g *Geziyor) release(req *http.Request) {
+	if g.opt.ConcurrentRequests != 0 {
+		<-g.semGlobal
+	}
+	if g.opt.ConcurrentRequestsPerDomain != 0 {
+		<-g.semHosts.hostSems[req.Host]
+	}
+}
+
 func checkURL(parsedURL *url.URL, g *Geziyor) bool {
 	rawURL := parsedURL.String()
-
 	// Check for allowed domains
 	if len(g.opt.AllowedDomains) != 0 && !contains(g.opt.AllowedDomains, parsedURL.Host) {
-		log.Printf("Domain not allowed: %s\n", parsedURL.Host)
+		//log.Printf("Domain not allowed: %s\n", parsedURL.Host)
 		return false
 	}
 
 	// Check for duplicate requests
 	if contains(g.visitedURLS, rawURL) {
-		log.Printf("URL already visited %s\n", rawURL)
+		//log.Printf("URL already visited %s\n", rawURL)
 		return false
 	}
 	g.visitedURLS = append(g.visitedURLS, rawURL)
diff --git a/geziyor_test.go b/geziyor_test.go
index 3b76548..92e07d8 100644
--- a/geziyor_test.go
+++ b/geziyor_test.go
@@ -12,7 +12,7 @@ func TestGeziyor_Simple(t *testing.T) {
 	geziyor.NewGeziyor(geziyor.Options{
 		StartURLs: []string{"http://api.ipify.org"},
 		ParseFunc: func(r *geziyor.Response) {
-			fmt.Println(string(r.Body))
+			fmt.Println(r.Doc.Text())
 		},
 	}).Start()
 }
diff --git a/options.go b/options.go
index 59f1443..8d4ea78 100644
--- a/options.go
+++ b/options.go
@@ -21,4 +21,10 @@ type Options struct {
 	// Memory Cache: httpcache.NewMemoryCache()
 	// Disk Cache: diskcache.New(".cache")
 	Cache httpcache.Cache
+
+	// Concurrent requests limit
+	ConcurrentRequests int
+
+	// Concurrent requests per domain limit
+	ConcurrentRequestsPerDomain int
 }
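
For reviewers, a minimal usage sketch of the two new limits working together; it is not part of the patch. NewGeziyor, Options, StartURLs, ParseFunc, r.Doc, ConcurrentRequests, and ConcurrentRequestsPerDomain all come from the diff above, while the start URL and the limit values 8 and 2 are illustrative only.

```go
package main

import (
	"fmt"

	"github.com/geziyor/geziyor"
)

func main() {
	geziyor.NewGeziyor(geziyor.Options{
		// StartURLs and ParseFunc mirror the README examples in this patch.
		StartURLs: []string{"http://quotes.toscrape.com/"},
		// New fields added in options.go: cap in-flight requests globally
		// (backed by semGlobal) and per host (backed by semHosts).
		// The values 8 and 2 are illustrative, not recommendations.
		ConcurrentRequests:          8,
		ConcurrentRequestsPerDomain: 2,
		ParseFunc: func(r *geziyor.Response) {
			fmt.Println(r.Doc.Text())
		},
	}).Start()
}
```

Note that acquire keys the per-domain semaphore on req.Host, so ConcurrentRequestsPerDomain effectively limits concurrency per host name, and leaving either field at its zero value disables that limit, since both acquire and release check the option against 0 before touching the channels.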