Start requests function implemented.

This commit is contained in:
Musab Gültekin
2019-06-12 12:40:38 +03:00
parent 2f6cb06982
commit bd8d58576f
3 changed files with 37 additions and 7 deletions

View File

@@ -81,12 +81,22 @@ func NewGeziyor(opt Options) *Geziyor {
// Start starts scraping
func (g *Geziyor) Start() {
// NOTE(review): this diff hunk renders the removed (pre-change) seed loop
// and the newly added StartRequestsFunc dispatch back to back without
// +/- markers, so the merged text below is not compilable as-is.
for _, startURL := range g.opt.StartURLs {
go g.Get(startURL, g.opt.ParseFunc)
log.Println("Scraping Started")
// New behavior: a user-supplied StartRequestsFunc, when set, replaces
// StartURLs as the source of the initial requests.
if g.opt.StartRequestsFunc == nil {
for _, startURL := range g.opt.StartURLs {
// Each seed request is issued concurrently.
go g.Get(startURL, g.opt.ParseFunc)
}
} else {
for _, req := range g.opt.StartRequestsFunc() {
go g.Do(req, g.opt.ParseFunc)
}
}
// Brief sleep before Wait — presumably gives the spawned goroutines time
// to register with g.wg before Wait is called; looks like a race
// workaround rather than a guarantee. TODO(review): confirm.
time.Sleep(time.Millisecond)
g.wg.Wait()
log.Println("Scraping Finished")
}
// Get issues a GET to the specified URL.
@@ -124,7 +134,9 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
}
// Modify Request
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Charset", "utf-8")
req.Header.Set("Accept-Language", "en")
req.Header.Set("User-Agent", g.opt.UserAgent)
// Acquire Semaphore

View File

@@ -7,6 +7,7 @@ import (
"github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/exporter"
"math/rand"
"net/http"
"testing"
"time"
)
@@ -63,20 +64,20 @@ func quotesParse(r *geziyor.Response) {
func TestLinks(t *testing.T) {
// NOTE(review): this diff hunk renders both the removed and the added
// option lines without +/- markers — the first AllowedDomains/StartURLs
// pair (quotes.toscrape.com) appears to be the pre-change text and the
// second (books.toscrape.com) the post-change text. The merged struct
// literal below, with duplicate keys, is not valid Go on its own.
geziyor.NewGeziyor(geziyor.Options{
AllowedDomains: []string{"quotes.toscrape.com"},
StartURLs: []string{"http://quotes.toscrape.com/"},
AllowedDomains: []string{"books.toscrape.com"},
StartURLs: []string{"http://books.toscrape.com/"},
ParseFunc: linksParse,
Exporters: []geziyor.Exporter{exporter.CSVExporter{}},
}).Start()
}
// linksParse exports the URL of the visited page and then recursively
// follows every anchor found in the document, scheduling each follow-up
// request on its own goroutine with this same function as the callback.
func linksParse(r *geziyor.Response) {
	// Record the current page's URL as a single-column export row.
	r.Exports <- []string{r.Request.URL.String()}

	// Walk every <a> element and follow its href, resolved against the
	// current page via JoinURL.
	r.DocHTML.Find("a").Each(func(_ int, sel *goquery.Selection) {
		href, ok := sel.Attr("href")
		if !ok {
			return
		}
		go r.Geziyor.Get(r.JoinURL(href), linksParse)
	})
}
func TestRandomDelay(t *testing.T) {
@@ -87,3 +88,16 @@ func TestRandomDelay(t *testing.T) {
randomDelay := rand.Intn(int(max-min)) + int(min)
fmt.Println(time.Duration(randomDelay))
}
// TestStartRequestsFunc verifies that a scraper seeded via StartRequestsFunc
// (instead of StartURLs) issues the returned requests and routes the
// responses through ParseFunc to the CSV exporter.
func TestStartRequestsFunc(t *testing.T) {
	geziyor.NewGeziyor(geziyor.Options{
		StartRequestsFunc: func() []*http.Request {
			// Fail fast instead of silently discarding the error: a nil
			// request handed to the scraper would fail far from the cause.
			req, err := http.NewRequest("GET", "http://quotes.toscrape.com/", nil)
			if err != nil {
				t.Fatalf("building start request: %v", err)
			}
			return []*http.Request{req}
		},
		ParseFunc: func(r *geziyor.Response) {
			// Export the HTTP status line so the exporter records the hit.
			r.Exports <- []string{r.Status}
		},
		Exporters: []geziyor.Exporter{exporter.CSVExporter{}},
	}).Start()
}

View File

@@ -2,6 +2,7 @@ package geziyor
import (
"github.com/fpfeng/httpcache"
"net/http"
"time"
)
@@ -14,8 +15,11 @@ type Options struct {
// First requests will made to this url array. (Concurrently)
StartURLs []string
// StartRequestsFunc called on scraper start
StartRequestsFunc func() []*http.Request
// ParseFunc is callback of StartURLs response.
ParseFunc func(response *Response)
ParseFunc func(r *Response)
// Timeout is global request timeout
Timeout time.Duration