Don't visit already visited URLs. Update README
This commit is contained in:
parent
edbddf74d8
commit
c525e0d7d0
38
README.md
38
README.md
@ -1,2 +1,38 @@
|
||||
# Gezer
|
||||
Scraper and crawler framework for Golang
|
||||
Scraper and crawler framework for Golang. Gezer uses Go *channels* over *callbacks*
|
||||
|
||||
## Features
|
||||
- 1,000+ Requests/Sec
|
||||
- Caching
|
||||
- Automatic Data Exporting
|
||||
|
||||
|
||||
## Example
|
||||
```go
|
||||
gezer := NewGezer(Opt{
|
||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||
ParseFunc: func(r *Response) {
|
||||
r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
|
||||
// Export Data
|
||||
r.Exports <- map[string]interface{}{
|
||||
"text": s.Find("span.text").Text(),
|
||||
"author": s.Find("small.author").Text(),
|
||||
"tags": s.Find("div.tags > a.tag").Map(func(_ int, s *goquery.Selection) string {
|
||||
return s.Text()
|
||||
}),
|
||||
}
|
||||
})
|
||||
|
||||
// Next Page
|
||||
if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
|
||||
go r.Gezer.Get(r.JoinURL(href))
|
||||
}
|
||||
},
|
||||
})
|
||||
gezer.Start()
|
||||
```
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
go get github.com/gogezer/gezer
|
15
gezer.go
15
gezer.go
@ -18,6 +18,8 @@ type Gezer struct {
|
||||
client *http.Client
|
||||
wg sync.WaitGroup
|
||||
opt Opt
|
||||
|
||||
visitedURLS []string
|
||||
}
|
||||
|
||||
type Opt struct {
|
||||
@ -59,7 +61,7 @@ func (g *Gezer) Get(rawURL string) {
|
||||
g.wg.Add(1)
|
||||
defer g.wg.Done()
|
||||
|
||||
if !checkURL(rawURL, g.opt.AllowedDomains) {
|
||||
if !checkURL(rawURL, g) {
|
||||
return
|
||||
}
|
||||
|
||||
@ -102,7 +104,7 @@ func (g *Gezer) Get(rawURL string) {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
|
||||
func checkURL(rawURL string, allowedDomains []string) bool {
|
||||
func checkURL(rawURL string, g *Gezer) bool {
|
||||
|
||||
// Parse URL
|
||||
parsedURL, err := url.Parse(rawURL)
|
||||
@ -112,11 +114,18 @@ func checkURL(rawURL string, allowedDomains []string) bool {
|
||||
}
|
||||
|
||||
// Check for allowed domains
|
||||
if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) {
|
||||
if len(g.opt.AllowedDomains) != 0 && !Contains(g.opt.AllowedDomains, parsedURL.Host) {
|
||||
log.Printf("Domain not allowed: %s\n", parsedURL.Host)
|
||||
return false
|
||||
}
|
||||
|
||||
// Check for duplicate requests
|
||||
if Contains(g.visitedURLS, rawURL) {
|
||||
log.Printf("URL already visited %s\n", rawURL)
|
||||
return false
|
||||
}
|
||||
g.visitedURLS = append(g.visitedURLS, rawURL)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
|
@ -24,6 +24,7 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
|
||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||
ParseFunc: func(r *Response) {
|
||||
r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
|
||||
// Export Data
|
||||
r.Exports <- map[string]interface{}{
|
||||
"text": s.Find("span.text").Text(),
|
||||
"author": s.Find("small.author").Text(),
|
||||
@ -32,6 +33,8 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
|
||||
}),
|
||||
}
|
||||
})
|
||||
|
||||
// Next Page
|
||||
if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
|
||||
go r.Gezer.Get(r.JoinURL(href))
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user