From 6caf1effd62d503765c920060be49803e201bd4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Fri, 14 Jun 2019 15:23:56 +0300 Subject: [PATCH 1/2] Rendered field exported to support rendered requests on Do function. Data races fixed. --- README.md | 2 +- exporter/csv.go | 2 +- exporter/json.go | 2 +- geziyor.go | 44 +++++++++++++++++++++++++++--------------- geziyor_test.go | 50 +++++------------------------------------------- go.mod | 2 +- go.sum | 6 ++++++ request.go | 7 +++---- 8 files changed, 47 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index 3e8b777..95df13a 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ func quotesParse(r *geziyor.Response) { } }) if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { - go r.Geziyor.Get(r.JoinURL(href), quotesParse) + r.Geziyor.Get(r.JoinURL(href), quotesParse) } } ``` diff --git a/exporter/csv.go b/exporter/csv.go index b4b3a47..38103d4 100644 --- a/exporter/csv.go +++ b/exporter/csv.go @@ -19,7 +19,7 @@ type CSVExporter struct { writer *csv.Writer } -func (e CSVExporter) Export(response *geziyor.Response) { +func (e *CSVExporter) Export(response *geziyor.Response) { // Default filename if e.FileName == "" { diff --git a/exporter/json.go b/exporter/json.go index d9b2dc6..f3bea61 100644 --- a/exporter/json.go +++ b/exporter/json.go @@ -19,7 +19,7 @@ type JSONExporter struct { } // Export exports response data as JSON streaming file -func (e JSONExporter) Export(response *geziyor.Response) { +func (e *JSONExporter) Export(response *geziyor.Response) { // Default filename if e.FileName == "" { diff --git a/geziyor.go b/geziyor.go index 76d0e6f..f389951 100644 --- a/geziyor.go +++ b/geziyor.go @@ -29,14 +29,17 @@ type Exporter interface { type Geziyor struct { Opt Options - client *http.Client - wg sync.WaitGroup - visitedURLS []string - semGlobal chan struct{} - semHosts struct { + client *http.Client + wg sync.WaitGroup + semGlobal chan struct{} + semHosts 
struct { sync.RWMutex hostSems map[string]chan struct{} } + visitedURLS struct { + sync.RWMutex + visitedURLS []string + } } func init() { @@ -88,7 +91,7 @@ func (g *Geziyor) Start() { if g.Opt.StartRequestsFunc == nil { for _, startURL := range g.Opt.StartURLs { - go g.Get(startURL, g.Opt.ParseFunc) + g.Get(startURL, g.Opt.ParseFunc) } } else { g.Opt.StartRequestsFunc(g) @@ -112,13 +115,14 @@ func (g *Geziyor) Get(url string, callback func(resp *Response)) { // GetRendered issues GET request using headless browser // Opens up a new Chrome instance, makes request, waits for 1 second to render HTML DOM and closed. +// Rendered requests are only supported for GET requests. func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) { req, err := http.NewRequest("GET", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) return } - g.Do(&Request{Request: req, rendered: true}, callback) + g.Do(&Request{Request: req, Rendered: true}, callback) } // Head issues a HEAD to the specified URL @@ -134,6 +138,11 @@ func (g *Geziyor) Head(url string, callback func(resp *Response)) { // Do sends an HTTP request func (g *Geziyor) Do(req *Request, callback func(resp *Response)) { g.wg.Add(1) + go g.do(req, callback) +} + +// do sends an HTTP request. It runs in the goroutine started by Do. +func (g *Geziyor) do(req *Request, callback func(resp *Response)) { defer g.wg.Done() defer func() { if r := recover(); r != nil { @@ -145,11 +154,11 @@ func (g *Geziyor) Do(req *Request, callback func(resp *Response)) { return } - // Do request normal or chrome and read response + // Do request normal or Chrome and read response var response *Response var err error - if !req.rendered { - response, err = g.doRequest(req) + if !req.Rendered { + response, err = g.doRequestClient(req) } else { response, err = g.doRequestChrome(req) } @@ -185,7 +194,7 @@ func (g *Geziyor) Do(req *Request, callback func(resp *Response)) { time.Sleep(time.Millisecond) } -func (g *Geziyor) doRequest(req *Request) (*Response, 
error) { +func (g *Geziyor) doRequestClient(req *Request) (*Response, error) { g.acquireSem(req) defer g.releaseSem(req) @@ -267,8 +276,8 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) { response := &Response{ //Response: resp, - Body: []byte(res), - //Meta: request.Meta, + Body: []byte(res), + Meta: req.Meta, Geziyor: g, Exports: make(chan interface{}), } @@ -314,11 +323,16 @@ func (g *Geziyor) checkURL(parsedURL *url.URL) bool { // Check for duplicate requests if !g.Opt.URLRevisitEnabled { - if contains(g.visitedURLS, rawURL) { + g.visitedURLS.RLock() + if contains(g.visitedURLS.visitedURLS, rawURL) { + g.visitedURLS.RUnlock() //log.Printf("URL already visited %s\n", rawURL) return false } - g.visitedURLS = append(g.visitedURLS, rawURL) + g.visitedURLS.RUnlock() + g.visitedURLS.Lock() + g.visitedURLS.visitedURLS = append(g.visitedURLS.visitedURLS, rawURL) + g.visitedURLS.Unlock() } return true diff --git a/geziyor_test.go b/geziyor_test.go index 44f17fd..0e20d2b 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -1,14 +1,12 @@ package geziyor_test import ( - "encoding/json" "fmt" "github.com/PuerkitoBio/goquery" "github.com/fpfeng/httpcache" "github.com/geziyor/geziyor" "github.com/geziyor/geziyor/exporter" "math/rand" - "net/http" "testing" "time" ) @@ -39,7 +37,7 @@ func TestQuotes(t *testing.T) { geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://quotes.toscrape.com/"}, ParseFunc: quotesParse, - Exporters: []geziyor.Exporter{exporter.JSONExporter{}}, + Exporters: []geziyor.Exporter{&exporter.JSONExporter{}}, }).Start() } @@ -54,8 +52,6 @@ func quotesParse(r *geziyor.Response) { return s.Text() }), } - // Or, for CSV - //r.Exports <- []string{s.Find("span.text").Text(), s.Find("small.author").Text()} }) // Next Page @@ -72,11 +68,11 @@ func TestLinks(t *testing.T) { r.Exports <- []string{r.Request.URL.String()} r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) { if href, ok := s.Attr("href"); ok { - go 
r.Geziyor.Get(r.JoinURL(href), r.Geziyor.Opt.ParseFunc) + r.Geziyor.Get(r.JoinURL(href), r.Geziyor.Opt.ParseFunc) } }) }, - Exporters: []geziyor.Exporter{exporter.CSVExporter{}}, + Exporters: []geziyor.Exporter{&exporter.CSVExporter{}}, }).Start() } @@ -92,53 +88,17 @@ func TestRandomDelay(t *testing.T) { func TestStartRequestsFunc(t *testing.T) { geziyor.NewGeziyor(geziyor.Options{ StartRequestsFunc: func(g *geziyor.Geziyor) { - go g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc) + g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc) }, ParseFunc: func(r *geziyor.Response) { r.DocHTML.Find("a").Each(func(_ int, s *goquery.Selection) { r.Exports <- s.AttrOr("href", "") }) }, - Exporters: []geziyor.Exporter{exporter.JSONExporter{}}, + Exporters: []geziyor.Exporter{&exporter.JSONExporter{}}, }).Start() } -func TestAlmaany(t *testing.T) { - alphabet := "ab" - - geziyor.NewGeziyor(geziyor.Options{ - AllowedDomains: []string{"www.almaany.com"}, - StartRequestsFunc: func(g *geziyor.Geziyor) { - base := "http://www.almaany.com/suggest.php?term=%c%c&lang=turkish&t=d" - for _, c1 := range alphabet { - for _, c2 := range alphabet { - req, _ := http.NewRequest("GET", fmt.Sprintf(base, c1, c2), nil) - go g.Do(&geziyor.Request{Request: req, Meta: map[string]interface{}{"word": string(c1) + string(c2)}}, parseAlmaany) - } - } - }, - ConcurrentRequests: 10, - Exporters: []geziyor.Exporter{exporter.CSVExporter{}}, - }).Start() - -} - -func parseAlmaany(r *geziyor.Response) { - var words []string - _ = json.Unmarshal(r.Body, &words) - r.Exports <- words - - if len(words) == 20 { - alphabet := "ab" - base := "http://www.almaany.com/suggest.php?term=%s%c&lang=turkish&t=d" - - for _, c := range alphabet { - req, _ := http.NewRequest("GET", fmt.Sprintf(base, r.Meta["word"], c), nil) - go r.Geziyor.Do(&geziyor.Request{Request: req, Meta: map[string]interface{}{"word": r.Meta["word"].(string) + string(c)}}, parseAlmaany) - } - } -} - func TestGetRendered(t *testing.T) { 
geziyor.NewGeziyor(geziyor.Options{ StartRequestsFunc: func(g *geziyor.Geziyor) { diff --git a/go.mod b/go.mod index fcde546..cf03234 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,6 @@ require ( github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 github.com/chromedp/chromedp v0.3.0 github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 - golang.org/x/net v0.0.0-20181114220301-adae6a3d119a + golang.org/x/net v0.0.0-20190522155817-f3200d17e092 golang.org/x/text v0.3.2 // indirect ) diff --git a/go.sum b/go.sum index 10868b5..5a38595 100644 --- a/go.sum +++ b/go.sum @@ -19,10 +19,16 @@ github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983 h1:wL11wNW7dhKIcRCHSm4sHKPWz0tt4mwBsVodG7+Xyqg= github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190522155817-f3200d17e092 h1:4QSRKanuywn15aTZvI/mIDEgPQpswuFndXpOj3rKEco= +golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862 h1:rM0ROo5vb9AdYJi1110yjWGMej9ITfKddS89P3Fkhug= golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/request.go b/request.go index b9c72dc..43903fe 100644 --- a/request.go +++ b/request.go @@ -4,10 +4,9 @@ import ( "net/http" ) -// Request is a small wrapper around *http.Request that contains Metadata +// Request is a small wrapper around *http.Request that contains Metadata and Rendering option type Request struct { *http.Request - Meta map[string]interface{} - - rendered bool + Meta map[string]interface{} + Rendered bool } From c6ea82447aa8d3373d0bbe338f8ce0b33824b63f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Fri, 14 Jun 2019 15:24:15 +0300 Subject: [PATCH 2/2] Travis-CI configuration added. Now, we'll test coverage on every commit. --- .travis.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..1ae3698 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,14 @@ +language: go + +go: + - 1.12.x + - tip + +before_install: + - go get -t -v ./... + +script: + - go test -race -coverprofile=coverage.txt -covermode=atomic + +after_success: + - bash <(curl -s https://codecov.io/bash) \ No newline at end of file