From ca2414c5c8cb0af1e079521c2f9d7ff746a6dc78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Sun, 9 Jun 2019 21:13:30 +0300 Subject: [PATCH] Request callbacks added. Recover from all panics and continue scraping. Only parse HTML if response is HTML. --- geziyor.go | 33 +++++++++++++++++++++++---------- geziyor_test.go | 12 ++++++------ go.mod | 2 ++ go.sum | 4 ++++ response.go | 15 +++++++++++++-- 5 files changed, 48 insertions(+), 18 deletions(-) diff --git a/geziyor.go b/geziyor.go index bbfca45..684eed2 100644 --- a/geziyor.go +++ b/geziyor.go @@ -11,6 +11,7 @@ import ( "net/http" "net/url" "os" + "runtime/debug" "sync" "time" ) @@ -80,29 +81,34 @@ func (g *Geziyor) Start() { } // Get issues a GET to the specified URL. -func (g *Geziyor) Get(url string) { +func (g *Geziyor) Get(url string, callbacks ...func(resp *Response)) { req, err := http.NewRequest("GET", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) return } - g.Do(req) + g.Do(req, callbacks...) } // Head issues a HEAD to the specified URL -func (g *Geziyor) Head(url string) { +func (g *Geziyor) Head(url string, callbacks ...func(resp *Response)) { req, err := http.NewRequest("HEAD", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) return } - g.Do(req) + g.Do(req, callbacks...) } // Do sends an HTTP request -func (g *Geziyor) Do(req *http.Request) { +func (g *Geziyor) Do(req *http.Request, callbacks ...func(resp *Response)) { g.wg.Add(1) defer g.wg.Done() + defer func() { + if r := recover(); r != nil { + log.Println(string(debug.Stack())) + } + }() if !g.checkURL(req.URL) { return @@ -157,23 +163,30 @@ func (g *Geziyor) Do(req *http.Request) { // Release Semaphore g.releaseSem(req) - // Create Document - doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(body)) - // Create response response := Response{ Response: resp, Body: body, - Doc: doc, Geziyor: g, Exports: make(chan interface{}, 1), } + // Create HTML Document + if response.isHTML() { + response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(body)) + } + // Export Function go Export(&response) // ParseFunc response - g.opt.ParseFunc(&response) + if len(callbacks) == 0 && g.opt.ParseFunc != nil { + g.opt.ParseFunc(&response) + } else { + for _, callback := range callbacks { + callback(&response) + } + } time.Sleep(time.Millisecond) } diff --git a/geziyor_test.go b/geziyor_test.go index 3fe1f93..580fb0b 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -14,7 +14,7 @@ func TestGeziyor_Simple(t *testing.T) { geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://api.ipify.org"}, ParseFunc: func(r *geziyor.Response) { - fmt.Println(r.Doc.Text()) + fmt.Println(string(r.Body)) }, }).Start() } @@ -24,8 +24,8 @@ func TestGeziyor_IP(t *testing.T) { StartURLs: []string{"http://api.ipify.org"}, Cache: httpcache.NewMemoryCache(), ParseFunc: func(r *geziyor.Response) { - fmt.Println(r.Doc.Text()) - r.Exports <- r.Doc.Text() + fmt.Println(string(r.Body)) + r.Exports <- string(r.Body) r.Geziyor.Get("http://api.ipify.org") }, }) @@ -36,7 +36,7 @@ func TestGeziyor_HTML(t *testing.T) { gez := geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://quotes.toscrape.com/"}, ParseFunc: func(r *geziyor.Response) { - r.Doc.Find("div.quote").Each(func(i int, s *goquery.Selection) { + r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) { // Export Data r.Exports <- map[string]interface{}{ "text": s.Find("span.text").Text(), @@ -48,7 +48,7 @@ func TestGeziyor_HTML(t *testing.T) { }) // Next Page - if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok { + if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { go r.Geziyor.Get(r.JoinURL(href)) } }, @@ -62,7 +62,7 @@ func TestGeziyor_Concurrent_Requests(t *testing.T) { StartURLs: []string{"http://quotes.toscrape.com/"}, ParseFunc: func(r *geziyor.Response) { //r.Exports <- map[string]interface{}{"href": r.Request.URL.String()} - r.Doc.Find("a").Each(func(i int, s *goquery.Selection) { + r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) { if href, ok := s.Attr("href"); ok { go r.Geziyor.Get(r.JoinURL(href)) } diff --git a/go.mod b/go.mod index 7f5b3ba..5ce3958 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,8 @@ go 1.12 require ( github.com/PuerkitoBio/goquery v1.5.0 + github.com/antchfx/htmlquery v1.0.0 // indirect + github.com/antchfx/xpath v1.0.0 // indirect github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 golang.org/x/net v0.0.0-20181114220301-adae6a3d119a golang.org/x/text v0.3.2 // indirect diff --git a/go.sum b/go.sum index 6c455f9..da6dcfb 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,10 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/antchfx/htmlquery v1.0.0 h1:O5IXz8fZF3B3MW+B33MZWbTHBlYmcfw0BAxgErHuaMA= +github.com/antchfx/htmlquery v1.0.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8= +github.com/antchfx/xpath v1.0.0 h1:Q5gFgh2O40VTSwMOVbFE7nFNRBu3tS21Tn0KAWeEjtk= +github.com/antchfx/xpath v1.0.0/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= diff --git a/response.go b/response.go index d64dc2a..bb017d2 100644 --- a/response.go +++ b/response.go @@ -4,14 +4,15 @@ import ( "github.com/PuerkitoBio/goquery" "net/http" "net/url" + "strings" ) // Response type wraps http.Response // Contains parsed response data and Geziyor functions. type Response struct { *http.Response - Body []byte - Doc *goquery.Document + Body []byte + DocHTML *goquery.Document Geziyor *Geziyor Exports chan interface{} @@ -27,3 +28,13 @@ func (r *Response) JoinURL(relativeURL string) string { joinedURL := r.Response.Request.URL.ResolveReference(parsedRelativeURL) return joinedURL.String() } + +func (r *Response) isHTML() bool { + contentType := r.Header.Get("Content-Type") + for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} { + if strings.Contains(contentType, htmlContentType) { + return true + } + } + return false +}