From e4e87234263efce93accf865d371ba6448e71c88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Tue, 11 Jun 2019 14:24:48 +0300 Subject: [PATCH] Callbacks are now mandatory, as almost all scrapers use them. --- README.md | 29 ++------------------ geziyor.go | 22 +++++++-------- geziyor_test.go | 73 +++++++++++++++++++++++++------------------------ go.mod | 2 -- go.sum | 4 --- 5 files changed, 52 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index ae81630..b330752 100644 --- a/README.md +++ b/README.md @@ -21,38 +21,15 @@ Simplest usage geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://api.ipify.org"}, ParseFunc: func(r *geziyor.Response) { - fmt.Println(r.Doc.Text()) + fmt.Println(string(r.Body)) }, }).Start() ``` -Export all quotes and authors to out.json file. - -```go -geziyor := NewGeziyor(Opt{ - StartURLs: []string{"http://quotes.toscrape.com/"}, - ParseFunc: func(r *Response) { - r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) { - // Export Data - r.Exports <- map[string]interface{}{ - "text": s.Find("span.text").Text(), - "author": s.Find("small.author").Text(), - } - }) - - // Next Page - if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok { - go r.Geziyor.Get(r.JoinURL(href)) - } - }, -}) -geziyor.Start() -``` +## Status +We highly recommend you to use go modules. As this project is in **development stage** right now and **API is not stable**. ## Installation go get github.com/geziyor/geziyor - -## Status -We highly recommend you to use go modules. As this project is in **development stage** right now and **API is not stable**. 
diff --git a/geziyor.go b/geziyor.go index 684eed2..e4530bd 100644 --- a/geziyor.go +++ b/geziyor.go @@ -73,7 +73,7 @@ func NewGeziyor(opt Options) *Geziyor { // Start starts scraping func (g *Geziyor) Start() { for _, startURL := range g.opt.StartURLs { - go g.Get(startURL) + go g.Get(startURL, g.opt.ParseFunc) } time.Sleep(time.Millisecond) @@ -81,27 +81,27 @@ func (g *Geziyor) Start() { } // Get issues a GET to the specified URL. -func (g *Geziyor) Get(url string, callbacks ...func(resp *Response)) { +func (g *Geziyor) Get(url string, callback func(resp *Response)) { req, err := http.NewRequest("GET", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) return } - g.Do(req, callbacks...) + g.Do(req, callback) } // Head issues a HEAD to the specified URL -func (g *Geziyor) Head(url string, callbacks ...func(resp *Response)) { +func (g *Geziyor) Head(url string, callback func(resp *Response)) { req, err := http.NewRequest("HEAD", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) return } - g.Do(req, callbacks...) 
+ g.Do(req, callback) } // Do sends an HTTP request -func (g *Geziyor) Do(req *http.Request, callbacks ...func(resp *Response)) { +func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { g.wg.Add(1) defer g.wg.Done() defer func() { @@ -179,12 +179,12 @@ func (g *Geziyor) Do(req *http.Request, callbacks ...func(resp *Response)) { // Export Function go Export(&response) - // ParseFunc response - if len(callbacks) == 0 && g.opt.ParseFunc != nil { - g.opt.ParseFunc(&response) + // Callbacks + if callback != nil { + callback(&response) } else { - for _, callback := range callbacks { - callback(&response) + if g.opt.ParseFunc != nil { + g.opt.ParseFunc(&response) } } time.Sleep(time.Millisecond) diff --git a/geziyor_test.go b/geziyor_test.go index 580fb0b..d9dc628 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -10,7 +10,7 @@ import ( "time" ) -func TestGeziyor_Simple(t *testing.T) { +func TestSimple(t *testing.T) { geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://api.ipify.org"}, ParseFunc: func(r *geziyor.Response) { @@ -19,57 +19,60 @@ func TestGeziyor_Simple(t *testing.T) { }).Start() } -func TestGeziyor_IP(t *testing.T) { +func TestSimpleCache(t *testing.T) { gez := geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://api.ipify.org"}, Cache: httpcache.NewMemoryCache(), ParseFunc: func(r *geziyor.Response) { fmt.Println(string(r.Body)) r.Exports <- string(r.Body) - r.Geziyor.Get("http://api.ipify.org") + r.Geziyor.Get("http://api.ipify.org", nil) }, }) gez.Start() } -func TestGeziyor_HTML(t *testing.T) { - gez := geziyor.NewGeziyor(geziyor.Options{ +func TestQuotes(t *testing.T) { + geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://quotes.toscrape.com/"}, - ParseFunc: func(r *geziyor.Response) { - r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) { - // Export Data - r.Exports <- map[string]interface{}{ - "text": s.Find("span.text").Text(), - "author": s.Find("small.author").Text(), - 
"tags": s.Find("div.tags > a.tag").Map(func(_ int, s *goquery.Selection) string { - return s.Text() - }), - } - }) - - // Next Page - if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { - go r.Geziyor.Get(r.JoinURL(href)) - } - }, - }) - gez.Start() + ParseFunc: quotesParse, + }).Start() } -func TestGeziyor_Concurrent_Requests(t *testing.T) { - gez := geziyor.NewGeziyor(geziyor.Options{ +func quotesParse(r *geziyor.Response) { + r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) { + // Export Data + r.Exports <- map[string]interface{}{ + "text": s.Find("span.text").Text(), + "author": s.Find("small.author").Text(), + "tags": s.Find("div.tags > a.tag").Map(func(_ int, s *goquery.Selection) string { + return s.Text() + }), + } + }) + + // Next Page + if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { + go r.Geziyor.Get(r.JoinURL(href), quotesParse) + } +} + +func TestLinks(t *testing.T) { + geziyor.NewGeziyor(geziyor.Options{ AllowedDomains: []string{"quotes.toscrape.com"}, StartURLs: []string{"http://quotes.toscrape.com/"}, - ParseFunc: func(r *geziyor.Response) { - //r.Exports <- map[string]interface{}{"href": r.Request.URL.String()} - r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) { - if href, ok := s.Attr("href"); ok { - go r.Geziyor.Get(r.JoinURL(href)) - } - }) - }, + ParseFunc: linksParse, + }).Start() +} + +func linksParse(r *geziyor.Response) { + //r.Exports <- map[string]interface{}{"href": r.Request.URL.String()} + r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) { + if href, ok := s.Attr("href"); ok { + go r.Geziyor.Get(r.JoinURL(href), linksParse) + } }) - gez.Start() + } func TestRandomDelay(t *testing.T) { diff --git a/go.mod b/go.mod index 5ce3958..7f5b3ba 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,6 @@ go 1.12 require ( github.com/PuerkitoBio/goquery v1.5.0 - github.com/antchfx/htmlquery v1.0.0 // indirect - github.com/antchfx/xpath v1.0.0 // indirect github.com/fpfeng/httpcache 
v0.0.0-20181220155740-6b8f16a92be3 golang.org/x/net v0.0.0-20181114220301-adae6a3d119a golang.org/x/text v0.3.2 // indirect diff --git a/go.sum b/go.sum index da6dcfb..6c455f9 100644 --- a/go.sum +++ b/go.sum @@ -2,10 +2,6 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= -github.com/antchfx/htmlquery v1.0.0 h1:O5IXz8fZF3B3MW+B33MZWbTHBlYmcfw0BAxgErHuaMA= -github.com/antchfx/htmlquery v1.0.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8= -github.com/antchfx/xpath v1.0.0 h1:Q5gFgh2O40VTSwMOVbFE7nFNRBu3tS21Tn0KAWeEjtk= -github.com/antchfx/xpath v1.0.0/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=