From a64a262554bbb1fc76bbfeba758d1ffba882dcf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Sat, 22 Jun 2019 13:12:05 +0300 Subject: [PATCH] HTTP Client can be changed now. Docs updated. --- README.md | 21 +++++++++++++++++++-- geziyor.go | 24 ++++++++++++------------ geziyor_test.go | 11 ++++++----- {internal => http}/http.go | 2 +- middleware.go | 11 ++++++----- response.go | 2 +- 6 files changed, 45 insertions(+), 26 deletions(-) rename {internal => http}/http.go (99%) diff --git a/README.md b/README.md index e70b3ec..c9535c5 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,6 @@ func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) { See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for more usage examples. - ## Documentation ### Installation @@ -101,7 +100,25 @@ geziyor.NewGeziyor(&geziyor.Options{ }).Start() ``` +### Exporting Data +You can export data automatically using exporters. Just send data to the ```Geziyor.Exports``` channel. +[Available exporters](https://godoc.org/github.com/geziyor/geziyor/exporter) + +```go +geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{"http://quotes.toscrape.com/"}, + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { + r.HTMLDoc.Find("div.quote").Each(func(_ int, s *goquery.Selection) { + g.Exports <- map[string]interface{}{ + "text": s.Find("span.text").Text(), + "author": s.Find("small.author").Text(), + } + }) + }, + Exporters: []geziyor.Exporter{&exporter.JSONExporter{}}, +}).Start() +``` ## Roadmap @@ -112,6 +129,6 @@ If you're interested in helping this project, please consider these features: - Deploying Scrapers to Cloud - ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~ - Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html)) -- Realtime metrics (Prometheus etc.) 
+- ~~Realtime metrics (Prometheus etc.)~~ \ No newline at end of file diff --git a/geziyor.go b/geziyor.go index bd348cd..6130fc6 100644 --- a/geziyor.go +++ b/geziyor.go @@ -6,7 +6,7 @@ import ( "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" "github.com/fpfeng/httpcache" - "github.com/geziyor/geziyor/internal" + "github.com/geziyor/geziyor/http" "github.com/geziyor/geziyor/metrics" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -14,7 +14,7 @@ import ( "io" "io/ioutil" "log" - "net/http" + stdhttp "net/http" "net/http/cookiejar" "net/url" "sync" @@ -28,7 +28,7 @@ type Exporter interface { // Geziyor is our main scraper type type Geziyor struct { Opt *Options - Client *internal.Client + Client *http.Client Exports chan interface{} metrics *metrics.Metrics @@ -47,7 +47,7 @@ type Geziyor struct { // If options provided, options func NewGeziyor(opt *Options) *Geziyor { geziyor := &Geziyor{ - Client: internal.NewClient(), + Client: http.NewClient(), Opt: opt, Exports: make(chan interface{}), requestMiddlewares: []RequestMiddleware{ @@ -104,10 +104,10 @@ func (g *Geziyor) Start() { log.Println("Scraping Started") // Metrics - metricsServer := &http.Server{Addr: ":2112"} + metricsServer := &stdhttp.Server{Addr: ":2112"} if g.Opt.MetricsType == metrics.Prometheus { go func() { - http.Handle("/metrics", promhttp.Handler()) + stdhttp.Handle("/metrics", promhttp.Handler()) metricsServer.ListenAndServe() }() } @@ -141,7 +141,7 @@ func (g *Geziyor) Start() { // Get issues a GET to the specified URL. 
func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *Response)) { - req, err := http.NewRequest("GET", url, nil) + req, err := stdhttp.NewRequest("GET", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) return @@ -153,7 +153,7 @@ func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *Response)) { // Opens up a new Chrome instance, makes request, waits for 1 second to render HTML DOM and closed. // Rendered requests only supported for GET requests. func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *Response)) { - req, err := http.NewRequest("GET", url, nil) + req, err := stdhttp.NewRequest("GET", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) return @@ -163,7 +163,7 @@ func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *Response) // Head issues a HEAD to the specified URL func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *Response)) { - req, err := http.NewRequest("HEAD", url, nil) + req, err := stdhttp.NewRequest("HEAD", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) return @@ -265,7 +265,7 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) { if err := chromedp.Run(ctx, network.Enable(), - network.SetExtraHTTPHeaders(network.Headers(internal.ConvertHeaderToMap(req.Header))), + network.SetExtraHTTPHeaders(network.Headers(http.ConvertHeaderToMap(req.Header))), chromedp.ActionFunc(func(ctx context.Context) error { chromedp.ListenTarget(ctx, func(ev interface{}) { switch ev.(type) { @@ -299,10 +299,10 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) { req.URL, _ = url.Parse(res.URL) response := Response{ - Response: &http.Response{ + Response: &stdhttp.Response{ Request: req.Request, StatusCode: int(res.Status), - Header: internal.ConvertMapToHeader(res.Headers), + Header: http.ConvertMapToHeader(res.Headers), }, Body: []byte(body), Meta: req.Meta, diff --git a/geziyor_test.go 
b/geziyor_test.go index 1722c16..058926d 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -46,7 +46,7 @@ func TestQuotes(t *testing.T) { } func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) { - r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) { + r.HTMLDoc.Find("div.quote").Each(func(i int, s *goquery.Selection) { // Export Data g.Exports <- map[string]interface{}{ "number": i, @@ -59,7 +59,7 @@ func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) { }) // Next Page - if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { + if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok { g.Get(r.JoinURL(href), quotesParse) } } @@ -72,13 +72,14 @@ func TestAllLinks(t *testing.T) { StartURLs: []string{"http://books.toscrape.com/"}, ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { g.Exports <- []string{r.Request.URL.String()} - r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) { + r.HTMLDoc.Find("a").Each(func(i int, s *goquery.Selection) { if href, ok := s.Attr("href"); ok { g.Get(r.JoinURL(href), g.Opt.ParseFunc) } }) }, - Exporters: []geziyor.Exporter{&exporter.CSVExporter{}}, + Exporters: []geziyor.Exporter{&exporter.CSVExporter{}}, + MetricsType: metrics.Prometheus, }).Start() } @@ -97,7 +98,7 @@ func TestStartRequestsFunc(t *testing.T) { g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc) }, ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { - r.DocHTML.Find("a").Each(func(_ int, s *goquery.Selection) { + r.HTMLDoc.Find("a").Each(func(_ int, s *goquery.Selection) { g.Exports <- s.AttrOr("href", "") }) }, diff --git a/internal/http.go b/http/http.go similarity index 99% rename from internal/http.go rename to http/http.go index 5fe50cd..76ec8f1 100644 --- a/internal/http.go +++ b/http/http.go @@ -1,4 +1,4 @@ -package internal +package http import ( "errors" diff --git a/middleware.go b/middleware.go index 1d617a7..04b9ca2 100644 --- a/middleware.go +++ b/middleware.go @@ -3,6 +3,7 @@ package 
geziyor import ( "bytes" "github.com/PuerkitoBio/goquery" + "github.com/geziyor/geziyor/http" "github.com/geziyor/geziyor/internal" "log" "math/rand" @@ -53,10 +54,10 @@ func duplicateRequestsMiddleware(g *Geziyor, r *Request) { // defaultHeadersMiddleware sets default request headers func defaultHeadersMiddleware(g *Geziyor, r *Request) { - r.Header = internal.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") - r.Header = internal.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8") - r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en") - r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent) + r.Header = http.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + r.Header = http.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8") + r.Header = http.SetDefaultHeader(r.Header, "Accept-Language", "en") + r.Header = http.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent) } // delayMiddleware delays requests @@ -83,7 +84,7 @@ func metricsRequestMiddleware(g *Geziyor, r *Request) { // parseHTMLMiddleware parses response if response is HTML func parseHTMLMiddleware(g *Geziyor, r *Response) { if !g.Opt.ParseHTMLDisabled && r.isHTML() { - r.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) + r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) } } diff --git a/response.go b/response.go index 861c934..a0b21ba 100644 --- a/response.go +++ b/response.go @@ -12,7 +12,7 @@ import ( type Response struct { *http.Response Body []byte - DocHTML *goquery.Document + HTMLDoc *goquery.Document Meta map[string]interface{} Request *Request }