From c525e0d7d02b7bec76dc95b28174e5d4dec91034 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Sat, 8 Jun 2019 17:04:00 +0300 Subject: [PATCH] Don't visit already visited URLs. Update README --- README.md | 38 +++++++++++++++++++++++++++++++++++++- gezer.go | 15 ++++++++++++--- gezer_test.go | 3 +++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 03ccdd6..a12cee1 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,38 @@ # Gezer -Scraper and crawler framework for Golang +Scraper and crawler framework for Golang. Gezer uses Go *channels* over *callbacks*. + +## Features +- 1,000+ Requests/Sec +- Caching +- Automatic Data Exporting + + +## Example +```go +gezer := NewGezer(Opt{ + StartURLs: []string{"http://quotes.toscrape.com/"}, + ParseFunc: func(r *Response) { + r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) { + // Export Data + r.Exports <- map[string]interface{}{ + "text": s.Find("span.text").Text(), + "author": s.Find("small.author").Text(), + "tags": s.Find("div.tags > a.tag").Map(func(_ int, s *goquery.Selection) string { + return s.Text() + }), + } + }) + + // Next Page + if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok { + go r.Gezer.Get(r.JoinURL(href)) + } + }, +}) +gezer.Start() +``` + + +## Installation + + go get github.com/gogezer/gezer \ No newline at end of file diff --git a/gezer.go b/gezer.go index afe5d8b..38e9206 100644 --- a/gezer.go +++ b/gezer.go @@ -18,6 +18,8 @@ type Gezer struct { client *http.Client wg sync.WaitGroup opt Opt + + visitedURLS []string } type Opt struct { @@ -59,7 +61,7 @@ func (g *Gezer) Get(rawURL string) { g.wg.Add(1) defer g.wg.Done() - if !checkURL(rawURL, g.opt.AllowedDomains) { + if !checkURL(rawURL, g) { return } @@ -102,7 +104,7 @@ func (g *Gezer) Get(rawURL string) { time.Sleep(time.Millisecond) } -func checkURL(rawURL string, allowedDomains []string) bool { +func checkURL(rawURL string, g *Gezer) bool { // Parse URL parsedURL, 
err := url.Parse(rawURL) @@ -112,11 +114,18 @@ func checkURL(rawURL string, allowedDomains []string) bool { } // Check for allowed domains - if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) { + if len(g.opt.AllowedDomains) != 0 && !Contains(g.opt.AllowedDomains, parsedURL.Host) { log.Printf("Domain not allowed: %s\n", parsedURL.Host) return false } + // Check for duplicate requests + if Contains(g.visitedURLS, rawURL) { + log.Printf("URL already visited %s\n", rawURL) + return false + } + g.visitedURLS = append(g.visitedURLS, rawURL) + return true } diff --git a/gezer_test.go b/gezer_test.go index b3960dc..c860560 100644 --- a/gezer_test.go +++ b/gezer_test.go @@ -24,6 +24,7 @@ func TestGezer_StartURLs_HTML(t *testing.T) { StartURLs: []string{"http://quotes.toscrape.com/"}, ParseFunc: func(r *Response) { r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) { + // Export Data r.Exports <- map[string]interface{}{ "text": s.Find("span.text").Text(), "author": s.Find("small.author").Text(), @@ -32,6 +33,8 @@ func TestGezer_StartURLs_HTML(t *testing.T) { }), } }) + + // Next Page if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok { go r.Gezer.Get(r.JoinURL(href)) }