Don't visit already visited URLs. Update README
This commit is contained in:
parent
edbddf74d8
commit
c525e0d7d0
38
README.md
38
README.md
@ -1,2 +1,38 @@
|
|||||||
# Gezer
|
# Gezer
|
||||||
Scraper and crawler framework for Golang
|
Scraper and crawler framework for Golang. Gezer uses Go *channels* over *callbacks*.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
- 1,000+ Requests/Sec
|
||||||
|
- Caching
|
||||||
|
- Automatic Data Exporting
|
||||||
|
|
||||||
|
|
||||||
|
## Example
|
||||||
|
```go
|
||||||
|
gezer := NewGezer(Opt{
|
||||||
|
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||||
|
ParseFunc: func(r *Response) {
|
||||||
|
r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
|
||||||
|
// Export Data
|
||||||
|
r.Exports <- map[string]interface{}{
|
||||||
|
"text": s.Find("span.text").Text(),
|
||||||
|
"author": s.Find("small.author").Text(),
|
||||||
|
"tags": s.Find("div.tags > a.tag").Map(func(_ int, s *goquery.Selection) string {
|
||||||
|
return s.Text()
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
// Next Page
|
||||||
|
if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
|
||||||
|
go r.Gezer.Get(r.JoinURL(href))
|
||||||
|
}
|
||||||
|
},
|
||||||
|
})
|
||||||
|
gezer.Start()
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
go get github.com/gogezer/gezer
|
15
gezer.go
15
gezer.go
@ -18,6 +18,8 @@ type Gezer struct {
|
|||||||
client *http.Client
|
client *http.Client
|
||||||
wg sync.WaitGroup
|
wg sync.WaitGroup
|
||||||
opt Opt
|
opt Opt
|
||||||
|
|
||||||
|
visitedURLS []string
|
||||||
}
|
}
|
||||||
|
|
||||||
type Opt struct {
|
type Opt struct {
|
||||||
@ -59,7 +61,7 @@ func (g *Gezer) Get(rawURL string) {
|
|||||||
g.wg.Add(1)
|
g.wg.Add(1)
|
||||||
defer g.wg.Done()
|
defer g.wg.Done()
|
||||||
|
|
||||||
if !checkURL(rawURL, g.opt.AllowedDomains) {
|
if !checkURL(rawURL, g) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -102,7 +104,7 @@ func (g *Gezer) Get(rawURL string) {
|
|||||||
time.Sleep(time.Millisecond)
|
time.Sleep(time.Millisecond)
|
||||||
}
|
}
|
||||||
|
|
||||||
func checkURL(rawURL string, allowedDomains []string) bool {
|
func checkURL(rawURL string, g *Gezer) bool {
|
||||||
|
|
||||||
// Parse URL
|
// Parse URL
|
||||||
parsedURL, err := url.Parse(rawURL)
|
parsedURL, err := url.Parse(rawURL)
|
||||||
@ -112,11 +114,18 @@ func checkURL(rawURL string, allowedDomains []string) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check for allowed domains
|
// Check for allowed domains
|
||||||
if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) {
|
if len(g.opt.AllowedDomains) != 0 && !Contains(g.opt.AllowedDomains, parsedURL.Host) {
|
||||||
log.Printf("Domain not allowed: %s\n", parsedURL.Host)
|
log.Printf("Domain not allowed: %s\n", parsedURL.Host)
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check for duplicate requests
|
||||||
|
if Contains(g.visitedURLS, rawURL) {
|
||||||
|
log.Printf("URL already visited %s\n", rawURL)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
g.visitedURLS = append(g.visitedURLS, rawURL)
|
||||||
|
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -24,6 +24,7 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
|
|||||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||||
ParseFunc: func(r *Response) {
|
ParseFunc: func(r *Response) {
|
||||||
r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
|
r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
|
||||||
|
// Export Data
|
||||||
r.Exports <- map[string]interface{}{
|
r.Exports <- map[string]interface{}{
|
||||||
"text": s.Find("span.text").Text(),
|
"text": s.Find("span.text").Text(),
|
||||||
"author": s.Find("small.author").Text(),
|
"author": s.Find("small.author").Text(),
|
||||||
@ -32,6 +33,8 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
|
|||||||
}),
|
}),
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Next Page
|
||||||
if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
|
if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
|
||||||
go r.Gezer.Get(r.JoinURL(href))
|
go r.Gezer.Get(r.JoinURL(href))
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user