diff --git a/geziyor.go b/geziyor.go index 9bbb4cb..5086d74 100644 --- a/geziyor.go +++ b/geziyor.go @@ -81,12 +81,22 @@ func NewGeziyor(opt Options) *Geziyor { // Start starts scraping func (g *Geziyor) Start() { - for _, startURL := range g.opt.StartURLs { - go g.Get(startURL, g.opt.ParseFunc) + log.Println("Scraping Started") + + if g.opt.StartRequestsFunc == nil { + for _, startURL := range g.opt.StartURLs { + go g.Get(startURL, g.opt.ParseFunc) + } + } else { + for _, req := range g.opt.StartRequestsFunc() { + go g.Do(req, g.opt.ParseFunc) + } } time.Sleep(time.Millisecond) g.wg.Wait() + + log.Println("Scraping Finished") } // Get issues a GET to the specified URL. @@ -124,7 +134,9 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { } // Modify Request + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") req.Header.Set("Accept-Charset", "utf-8") + req.Header.Set("Accept-Language", "en") req.Header.Set("User-Agent", g.opt.UserAgent) // Acquire Semaphore diff --git a/geziyor_test.go b/geziyor_test.go index 1d34112..1bed710 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -7,6 +7,7 @@ import ( "github.com/geziyor/geziyor" "github.com/geziyor/geziyor/exporter" "math/rand" + "net/http" "testing" "time" ) @@ -63,20 +64,20 @@ func quotesParse(r *geziyor.Response) { func TestLinks(t *testing.T) { geziyor.NewGeziyor(geziyor.Options{ - AllowedDomains: []string{"quotes.toscrape.com"}, - StartURLs: []string{"http://quotes.toscrape.com/"}, + AllowedDomains: []string{"books.toscrape.com"}, + StartURLs: []string{"http://books.toscrape.com/"}, ParseFunc: linksParse, + Exporters: []geziyor.Exporter{exporter.CSVExporter{}}, }).Start() } func linksParse(r *geziyor.Response) { - //r.Exports <- map[string]interface{}{"href": r.Request.URL.String()} + r.Exports <- []string{r.Request.URL.String()} r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) { if href, ok := s.Attr("href"); ok { go 
r.Geziyor.Get(r.JoinURL(href), linksParse) } }) - } func TestRandomDelay(t *testing.T) { @@ -87,3 +88,16 @@ func TestRandomDelay(t *testing.T) { randomDelay := rand.Intn(int(max-min)) + int(min) fmt.Println(time.Duration(randomDelay)) } + +func TestStartRequestsFunc(t *testing.T) { + geziyor.NewGeziyor(geziyor.Options{ + StartRequestsFunc: func() []*http.Request { + req, _ := http.NewRequest("GET", "http://quotes.toscrape.com/", nil) + return []*http.Request{req} + }, + ParseFunc: func(r *geziyor.Response) { + r.Exports <- []string{r.Status} + }, + Exporters: []geziyor.Exporter{exporter.CSVExporter{}}, + }).Start() +} diff --git a/options.go b/options.go index 6629c5f..9f1f96f 100644 --- a/options.go +++ b/options.go @@ -2,6 +2,7 @@ package geziyor import ( "github.com/fpfeng/httpcache" + "net/http" "time" ) @@ -14,8 +15,11 @@ type Options struct { // First requests will made to this url array. (Concurrently) StartURLs []string + // StartRequestsFunc is called on scraper start + StartRequestsFunc func() []*http.Request + // ParseFunc is callback of StartURLs response. - ParseFunc func(response *Response) + ParseFunc func(r *Response) // Timeout is global request timeout Timeout time.Duration