Don't visit already visited URLs. Update README

Musab Gültekin 2019-06-08 17:04:00 +03:00
parent edbddf74d8
commit c525e0d7d0
3 changed files with 52 additions and 4 deletions

README.md

@@ -1,2 +1,38 @@
 # Gezer
-Scraper and crawler framework for Golang
+Scraper and crawler framework for Golang. Gezer uses Go *channels* over *callbacks*.
+
+## Features
+- 1,000+ Requests/Sec
+- Caching
+- Automatic Data Exporting
+
+## Example
+```go
+gezer := NewGezer(Opt{
+    StartURLs: []string{"http://quotes.toscrape.com/"},
+    ParseFunc: func(r *Response) {
+        r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
+            // Export Data
+            r.Exports <- map[string]interface{}{
+                "text":   s.Find("span.text").Text(),
+                "author": s.Find("small.author").Text(),
+                "tags": s.Find("div.tags > a.tag").Map(func(_ int, s *goquery.Selection) string {
+                    return s.Text()
+                }),
+            }
+        })
+
+        // Next Page
+        if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
+            go r.Gezer.Get(r.JoinURL(href))
+        }
+    },
+})
+gezer.Start()
+```
+
+## Installation
+    go get github.com/gogezer/gezer
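The README example above is a fragment. A complete, runnable version might look like the sketch below; the `gezer` package name and import path are assumptions taken from the `go get` line, the goquery import follows from the selectors used, and the variable is renamed to `g` so it does not shadow the assumed package name:

```go
package main

import (
    "github.com/PuerkitoBio/goquery"

    "github.com/gogezer/gezer" // assumed import path, from the README's go get line
)

func main() {
    g := gezer.NewGezer(gezer.Opt{
        StartURLs: []string{"http://quotes.toscrape.com/"},
        ParseFunc: func(r *gezer.Response) {
            // Export each quote through the Exports channel.
            r.Doc.Find("div.quote").Each(func(_ int, s *goquery.Selection) {
                r.Exports <- map[string]interface{}{
                    "text":   s.Find("span.text").Text(),
                    "author": s.Find("small.author").Text(),
                }
            })
            // Queue the next page from a new goroutine, as in the README example.
            if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
                go r.Gezer.Get(r.JoinURL(href))
            }
        },
    })
    g.Start() // presumably blocks on the internal WaitGroup until the crawl drains
}
```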

gezer.go

@@ -18,6 +18,8 @@ type Gezer struct {
     client *http.Client
     wg     sync.WaitGroup
     opt    Opt
+
+    visitedURLS []string
 }
 
 type Opt struct {
@@ -59,7 +61,7 @@ func (g *Gezer) Get(rawURL string) {
     g.wg.Add(1)
     defer g.wg.Done()
 
-    if !checkURL(rawURL, g.opt.AllowedDomains) {
+    if !checkURL(rawURL, g) {
         return
     }
@@ -102,7 +104,7 @@ func (g *Gezer) Get(rawURL string) {
         time.Sleep(time.Millisecond)
     }
 }
 
-func checkURL(rawURL string, allowedDomains []string) bool {
+func checkURL(rawURL string, g *Gezer) bool {
     // Parse URL
     parsedURL, err := url.Parse(rawURL)
@@ -112,11 +114,18 @@ func checkURL(rawURL string, allowedDomains []string) bool {
     }
 
     // Check for allowed domains
-    if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) {
+    if len(g.opt.AllowedDomains) != 0 && !Contains(g.opt.AllowedDomains, parsedURL.Host) {
         log.Printf("Domain not allowed: %s\n", parsedURL.Host)
         return false
     }
 
+    // Check for duplicate requests
+    if Contains(g.visitedURLS, rawURL) {
+        log.Printf("URL already visited %s\n", rawURL)
+        return false
+    }
+
+    g.visitedURLS = append(g.visitedURLS, rawURL)
     return true
 }
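One caveat with the change above: `Get` is launched with `go r.Gezer.Get(...)`, so several goroutines can read and append to `visitedURLS` concurrently, and each `Contains` check is a linear scan over the slice. A mutex-guarded map is the usual shape for this; the sketch below is illustrative only, not part of this commit, and assumes it would live in the same package:

```go
import "sync"

// visitSet is an illustrative, goroutine-safe visited-URL set.
// This commit itself uses a plain []string plus a linear Contains scan.
type visitSet struct {
    mu   sync.Mutex
    seen map[string]bool
}

// visit marks rawURL as seen and reports whether it had been seen before.
// The lookup and the insert happen under one lock, so two concurrent
// callers can never both observe the same URL as new.
func (v *visitSet) visit(rawURL string) bool {
    v.mu.Lock()
    defer v.mu.Unlock()
    if v.seen[rawURL] { // reading a nil map is safe and yields false
        return true
    }
    if v.seen == nil {
        v.seen = make(map[string]bool)
    }
    v.seen[rawURL] = true
    return false
}
```

With a field like this on `Gezer`, `checkURL` could replace the Contains/append pair with a single `g.visited.visit(rawURL)` call; both of those names are hypothetical.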

gezer_test.go

@@ -24,6 +24,7 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
         StartURLs: []string{"http://quotes.toscrape.com/"},
         ParseFunc: func(r *Response) {
             r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
+                // Export Data
                 r.Exports <- map[string]interface{}{
                     "text":   s.Find("span.text").Text(),
                     "author": s.Find("small.author").Text(),
@@ -32,6 +33,8 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
                     }),
                 }
             })
+
+            // Next Page
             if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
                 go r.Gezer.Get(r.JoinURL(href))
             }
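The updated test still exercises only the crawl itself. A hypothetical test for the new duplicate check, written against `checkURL` exactly as it appears in the diff above (it would sit in the same `_test.go` file, and the test name is an assumption):

```go
func TestCheckURL_SkipsVisited(t *testing.T) {
    g := &Gezer{} // zero value: no AllowedDomains filter, empty visitedURLS

    if !checkURL("http://quotes.toscrape.com/", g) {
        t.Fatal("first request to a URL should be allowed")
    }
    if checkURL("http://quotes.toscrape.com/", g) {
        t.Fatal("second request to the same URL should be skipped")
    }
}
```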