Don't visit already visited URLs. Update README
This commit is contained in:
parent
edbddf74d8
commit
c525e0d7d0
38
README.md
38
README.md
@ -1,2 +1,38 @@
|
||||
# Gezer
|
||||
Scraper and crawler framework for Golang
|
||||
Scraper and crawler framework for Golang. Gezer uses Go *channels* over *callbacks*
|
||||
|
||||
## Features
|
||||
- 1,000+ Requests/Sec
|
||||
- Caching
|
||||
- Automatic Data Exporting
|
||||
|
||||
|
||||
## Example
|
||||
```go
|
||||
gezer := NewGezer(Opt{
|
||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||
ParseFunc: func(r *Response) {
|
||||
r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
|
||||
// Export Data
|
||||
r.Exports <- map[string]interface{}{
|
||||
"text": s.Find("span.text").Text(),
|
||||
"author": s.Find("small.author").Text(),
|
||||
"tags": s.Find("div.tags > a.tag").Map(func(_ int, s *goquery.Selection) string {
|
||||
return s.Text()
|
||||
}),
|
||||
}
|
||||
})
|
||||
|
||||
// Next Page
|
||||
if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
|
||||
go r.Gezer.Get(r.JoinURL(href))
|
||||
}
|
||||
},
|
||||
})
|
||||
gezer.Start()
|
||||
```
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
go get github.com/gogezer/gezer
|
15
gezer.go
15
gezer.go
@ -18,6 +18,8 @@ type Gezer struct {
|
||||
client *http.Client
|
||||
wg sync.WaitGroup
|
||||
opt Opt
|
||||
|
||||
visitedURLS []string
|
||||
}
|
||||
|
||||
type Opt struct {
|
||||
@ -59,7 +61,7 @@ func (g *Gezer) Get(rawURL string) {
|
||||
g.wg.Add(1)
|
||||
defer g.wg.Done()
|
||||
|
||||
if !checkURL(rawURL, g.opt.AllowedDomains) {
|
||||
if !checkURL(rawURL, g) {
|
||||
return
|
||||
}
|
||||
|
||||
@ -102,7 +104,7 @@ func (g *Gezer) Get(rawURL string) {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
|
||||
func checkURL(rawURL string, allowedDomains []string) bool {
|
||||
func checkURL(rawURL string, g *Gezer) bool {
|
||||
|
||||
// Parse URL
|
||||
parsedURL, err := url.Parse(rawURL)
|
||||
@ -112,11 +114,18 @@ func checkURL(rawURL string, allowedDomains []string) bool {
|
||||
}
|
||||
|
||||
// Check for allowed domains
|
||||
if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) {
|
||||
if len(g.opt.AllowedDomains) != 0 && !Contains(g.opt.AllowedDomains, parsedURL.Host) {
|
||||
log.Printf("Domain not allowed: %s\n", parsedURL.Host)
|
||||
return false
|
||||
}
|
||||
|
||||
// Check for duplicate requests
|
||||
if Contains(g.visitedURLS, rawURL) {
|
||||
log.Printf("URL already visited %s\n", rawURL)
|
||||
return false
|
||||
}
|
||||
g.visitedURLS = append(g.visitedURLS, rawURL)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
|
@ -24,6 +24,7 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
|
||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||
ParseFunc: func(r *Response) {
|
||||
r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
|
||||
// Export Data
|
||||
r.Exports <- map[string]interface{}{
|
||||
"text": s.Find("span.text").Text(),
|
||||
"author": s.Find("small.author").Text(),
|
||||
@ -32,6 +33,8 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
|
||||
}),
|
||||
}
|
||||
})
|
||||
|
||||
// Next Page
|
||||
if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
|
||||
go r.Gezer.Get(r.JoinURL(href))
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user