Start requests function implemented.

This commit is contained in:
Musab Gültekin
2019-06-12 12:40:38 +03:00
parent 2f6cb06982
commit bd8d58576f
3 changed files with 37 additions and 7 deletions

View File

@ -81,12 +81,22 @@ func NewGeziyor(opt Options) *Geziyor {
// Start starts scraping // Start starts scraping
func (g *Geziyor) Start() { func (g *Geziyor) Start() {
log.Println("Scraping Started")
if g.opt.StartRequestsFunc == nil {
for _, startURL := range g.opt.StartURLs { for _, startURL := range g.opt.StartURLs {
go g.Get(startURL, g.opt.ParseFunc) go g.Get(startURL, g.opt.ParseFunc)
} }
} else {
for _, req := range g.opt.StartRequestsFunc() {
go g.Do(req, g.opt.ParseFunc)
}
}
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
g.wg.Wait() g.wg.Wait()
log.Println("Scraping Finished")
} }
// Get issues a GET to the specified URL. // Get issues a GET to the specified URL.
@ -124,7 +134,9 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
} }
// Modify Request // Modify Request
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Charset", "utf-8") req.Header.Set("Accept-Charset", "utf-8")
req.Header.Set("Accept-Language", "en")
req.Header.Set("User-Agent", g.opt.UserAgent) req.Header.Set("User-Agent", g.opt.UserAgent)
// Acquire Semaphore // Acquire Semaphore

View File

@ -7,6 +7,7 @@ import (
"github.com/geziyor/geziyor" "github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/exporter" "github.com/geziyor/geziyor/exporter"
"math/rand" "math/rand"
"net/http"
"testing" "testing"
"time" "time"
) )
@ -63,20 +64,20 @@ func quotesParse(r *geziyor.Response) {
func TestLinks(t *testing.T) { func TestLinks(t *testing.T) {
geziyor.NewGeziyor(geziyor.Options{ geziyor.NewGeziyor(geziyor.Options{
AllowedDomains: []string{"quotes.toscrape.com"}, AllowedDomains: []string{"books.toscrape.com"},
StartURLs: []string{"http://quotes.toscrape.com/"}, StartURLs: []string{"http://books.toscrape.com/"},
ParseFunc: linksParse, ParseFunc: linksParse,
Exporters: []geziyor.Exporter{exporter.CSVExporter{}},
}).Start() }).Start()
} }
func linksParse(r *geziyor.Response) { func linksParse(r *geziyor.Response) {
//r.Exports <- map[string]interface{}{"href": r.Request.URL.String()} r.Exports <- []string{r.Request.URL.String()}
r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) { r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) {
if href, ok := s.Attr("href"); ok { if href, ok := s.Attr("href"); ok {
go r.Geziyor.Get(r.JoinURL(href), linksParse) go r.Geziyor.Get(r.JoinURL(href), linksParse)
} }
}) })
} }
func TestRandomDelay(t *testing.T) { func TestRandomDelay(t *testing.T) {
@ -87,3 +88,16 @@ func TestRandomDelay(t *testing.T) {
randomDelay := rand.Intn(int(max-min)) + int(min) randomDelay := rand.Intn(int(max-min)) + int(min)
fmt.Println(time.Duration(randomDelay)) fmt.Println(time.Duration(randomDelay))
} }
// TestStartRequestsFunc runs a scraper whose initial requests are supplied by
// Options.StartRequestsFunc instead of Options.StartURLs. The parse callback
// sends each response's status line to the Exports channel, and a CSVExporter
// is registered as the exporter.
//
// The test issues a real GET to quotes.toscrape.com.
func TestStartRequestsFunc(t *testing.T) {
	// Build the request up front so a construction failure fails the test
	// instead of handing a nil *http.Request to the scraper.
	req, err := http.NewRequest(http.MethodGet, "http://quotes.toscrape.com/", nil)
	if err != nil {
		t.Fatalf("creating start request: %v", err)
	}

	geziyor.NewGeziyor(geziyor.Options{
		StartRequestsFunc: func() []*http.Request {
			return []*http.Request{req}
		},
		ParseFunc: func(r *geziyor.Response) {
			r.Exports <- []string{r.Status}
		},
		Exporters: []geziyor.Exporter{exporter.CSVExporter{}},
	}).Start()
}

View File

@ -2,6 +2,7 @@ package geziyor
import ( import (
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"net/http"
"time" "time"
) )
@ -14,8 +15,11 @@ type Options struct {
// First requests will be made to this URL array. (Concurrently) // First requests will be made to this URL array. (Concurrently)
StartURLs []string StartURLs []string
// StartRequestsFunc is called on scraper start; the requests it returns are used as the initial requests.
StartRequestsFunc func() []*http.Request
// ParseFunc is callback of StartURLs response. // ParseFunc is callback of StartURLs response.
ParseFunc func(response *Response) ParseFunc func(r *Response)
// Timeout is global request timeout // Timeout is global request timeout
Timeout time.Duration Timeout time.Duration