From f7f4e401e25a889074d1c8b6d04c72d343be10a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Wed, 12 Jun 2019 21:30:45 +0300 Subject: [PATCH] Metadata adding on requests support added. StartRequests function implemented. --- geziyor.go | 40 +++++++++++++++++++--------------------- geziyor_test.go | 44 ++++++++++++++++++++++++++++++++++++++++++-- options.go | 3 +-- request.go | 11 +++++++++++ response.go | 1 + 5 files changed, 74 insertions(+), 25 deletions(-) create mode 100644 request.go diff --git a/geziyor.go b/geziyor.go index 5086d74..b942b45 100644 --- a/geziyor.go +++ b/geziyor.go @@ -46,7 +46,7 @@ func init() { func NewGeziyor(opt Options) *Geziyor { geziyor := &Geziyor{ client: &http.Client{ - Timeout: time.Second * 60, + Timeout: time.Second * 180, // Google's timeout }, opt: opt, } @@ -106,7 +106,7 @@ func (g *Geziyor) Get(url string, callback func(resp *Response)) { log.Printf("Request creating error %v\n", err) return } - g.Do(req, callback) + g.Do(&Request{Request: req}, callback) } // Head issues a HEAD to the specified URL @@ -116,11 +116,11 @@ func (g *Geziyor) Head(url string, callback func(resp *Response)) { log.Printf("Request creating error %v\n", err) return } - g.Do(req, callback) + g.Do(&Request{Request: req}, callback) } // Do sends an HTTP request -func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { +func (g *Geziyor) Do(req *Request, callback func(resp *Response)) { g.wg.Add(1) defer g.wg.Done() defer func() { @@ -139,23 +139,14 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { req.Header.Set("Accept-Language", "en") req.Header.Set("User-Agent", g.opt.UserAgent) - // Acquire Semaphore g.acquireSem(req) - // Request Delay - if g.opt.RequestDelayRandomize { - min := float64(g.opt.RequestDelay) * 0.5 - max := float64(g.opt.RequestDelay) * 1.5 - time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min))) - } else { - time.Sleep(g.opt.RequestDelay) - } + g.delay() - // Log log.Println("Fetching: ", req.URL.String()) // Do request - resp, err := g.client.Do(req) + resp, err := g.client.Do(req.Request) if resp != nil { defer resp.Body.Close() } @@ -178,7 +169,6 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { } } - // Continue reading body body, err := ioutil.ReadAll(bodyReader) if err != nil { log.Printf("Reading Body error: %v\n", err) @@ -186,18 +176,16 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { return } - // Release Semaphore g.releaseSem(req) - // Create response response := Response{ Response: resp, Body: body, + Meta: req.Meta, Geziyor: g, Exports: make(chan interface{}), } - // Create HTML Document if response.isHTML() { response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(body)) } @@ -226,7 +214,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { time.Sleep(time.Millisecond) } -func (g *Geziyor) acquireSem(req *http.Request) { +func (g *Geziyor) acquireSem(req *Request) { if g.opt.ConcurrentRequests != 0 { g.semGlobal <- struct{}{} } @@ -245,7 +233,7 @@ func (g *Geziyor) acquireSem(req *http.Request) { } } -func (g *Geziyor) releaseSem(req *http.Request) { +func (g *Geziyor) releaseSem(req *Request) { if g.opt.ConcurrentRequests != 0 { <-g.semGlobal } @@ -272,6 +260,16 @@ func (g *Geziyor) checkURL(parsedURL *url.URL) bool { return true } +func (g *Geziyor) delay() { + if g.opt.RequestDelayRandomize { + min := float64(g.opt.RequestDelay) * 0.5 + max := float64(g.opt.RequestDelay) * 1.5 + time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min))) + } else { + time.Sleep(g.opt.RequestDelay) + } +} + // contains checks whether []string contains string func contains(s []string, e string) bool { for _, a := range s { diff --git a/geziyor_test.go b/geziyor_test.go index 1bed710..6485bc3 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -1,6 +1,7 @@ package geziyor_test import ( + "encoding/json" "fmt" "github.com/PuerkitoBio/goquery" "github.com/fpfeng/httpcache" @@ -91,9 +92,9 @@ func TestRandomDelay(t *testing.T) { func TestStartRequestsFunc(t *testing.T) { geziyor.NewGeziyor(geziyor.Options{ - StartRequestsFunc: func() []*http.Request { + StartRequestsFunc: func() []*geziyor.Request { req, _ := http.NewRequest("GET", "http://quotes.toscrape.com/", nil) - return []*http.Request{req} + return []*geziyor.Request{{Request: req}} }, ParseFunc: func(r *geziyor.Response) { r.Exports <- []string{r.Status} @@ -101,3 +102,42 @@ func TestStartRequestsFunc(t *testing.T) { Exporters: []geziyor.Exporter{exporter.CSVExporter{}}, }).Start() } + +func TestAlmaany(t *testing.T) { + alphabet := "ab" + + geziyor.NewGeziyor(geziyor.Options{ + AllowedDomains: []string{"www.almaany.com"}, + StartRequestsFunc: func() []*geziyor.Request { + base := "http://www.almaany.com/suggest.php?term=%c%c&lang=turkish&t=d" + var requests []*geziyor.Request + for _, c1 := range alphabet { + for _, c2 := range alphabet { + req, _ := http.NewRequest("GET", fmt.Sprintf(base, c1, c2), nil) + requests = append(requests, &geziyor.Request{Request: req, Meta: map[string]interface{}{"word": string(c1) + string(c2)}}) + } + } + return requests + }, + ConcurrentRequests: 10, + ParseFunc: parseAlmaany, + Exporters: []geziyor.Exporter{exporter.CSVExporter{}}, + }).Start() + +} + +func parseAlmaany(r *geziyor.Response) { + var words []string + _ = json.Unmarshal(r.Body, &words) + r.Exports <- words + + if len(words) == 20 { + alphabet := "abcde" + base := "http://www.almaany.com/suggest.php?term=%s%c&lang=turkish&t=d" + + for _, c := range alphabet { + req, _ := http.NewRequest("GET", fmt.Sprintf(base, r.Meta["word"], c), nil) + go r.Geziyor.Do(&geziyor.Request{Request: req, Meta: map[string]interface{}{"word": r.Meta["word"].(string) + string(c)}}, parseAlmaany) + } + } +} diff --git a/options.go b/options.go index 9f1f96f..537aefd 100644 --- a/options.go +++ b/options.go @@ -2,7 +2,6 @@ package geziyor import ( "github.com/fpfeng/httpcache" - "net/http" "time" ) @@ -16,7 +15,7 @@ type Options struct { StartURLs []string // StartRequestsFunc called on scraper start - StartRequestsFunc func() []*http.Request + StartRequestsFunc func() []*Request // ParseFunc is callback of StartURLs response. ParseFunc func(r *Response) diff --git a/request.go b/request.go new file mode 100644 index 0000000..192326b --- /dev/null +++ b/request.go @@ -0,0 +1,11 @@ +package geziyor + +import ( + "net/http" +) + +// Request is a small wrapper around *http.Request that contains Metadata +type Request struct { + *http.Request + Meta map[string]interface{} +} diff --git a/response.go b/response.go index bb017d2..a7d4c11 100644 --- a/response.go +++ b/response.go @@ -13,6 +13,7 @@ type Response struct { *http.Response Body []byte DocHTML *goquery.Document + Meta map[string]interface{} Geziyor *Geziyor Exports chan interface{}