From da03567fae1a5bbd73554e56c75fcd5664defc1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Thu, 4 Jul 2019 02:13:29 +0300 Subject: [PATCH] Extractors refactored to support pass by value. Documentation added for request and response. --- client/client.go | 2 -- client/request.go | 21 +++++++++++++++++---- client/response.go | 8 ++++++-- extract/attr.go | 4 ++-- extract/extract.go | 2 +- extract/html.go | 8 ++++---- extract/text.go | 18 +++++++++++++----- geziyor_test.go | 14 +++++++------- middleware.go | 2 +- 9 files changed, 51 insertions(+), 28 deletions(-) diff --git a/client/client.go b/client/client.go index 13c54f0..2be7c28 100644 --- a/client/client.go +++ b/client/client.go @@ -98,7 +98,6 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD response := Response{ Response: resp, Body: body, - Meta: req.Meta, Request: req, } @@ -161,7 +160,6 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) { Header: ConvertMapToHeader(res.Headers), }, Body: []byte(body), - Meta: req.Meta, Request: req, } diff --git a/client/request.go b/client/request.go index 4dc6892..7298d49 100644 --- a/client/request.go +++ b/client/request.go @@ -8,11 +8,24 @@ import ( // Request is a small wrapper around *http.Request that contains Metadata and Rendering option type Request struct { *http.Request - Meta map[string]interface{} + + // Meta contains arbitrary data. + // Use this Meta map to store contextual data between your requests + Meta map[string]interface{} + + // If true, requests will be synchronized Synchronized bool - Rendered bool - Cancelled bool - Encoding string + + // If true request will be opened in Chrome and + // fully rendered HTML DOM response will be returned as response + Rendered bool + + // Optional response body encoding. Leave empty for automatic detection. + // If you're having issues with auto detection, set this. + Encoding string + + // Set this true to cancel requests. 
Should be used on middlewares. + Cancelled bool } // Cancel request diff --git a/client/response.go b/client/response.go index c9646cd..a15a3d3 100644 --- a/client/response.go +++ b/client/response.go @@ -11,9 +11,13 @@ import ( // Contains parsed response data and Geziyor functions. type Response struct { *http.Response - Body []byte + + // Response body + Body []byte + + // Goquery Document object. If response IsHTML, it's non-nil. HTMLDoc *goquery.Document - Meta map[string]interface{} + Request *Request } diff --git a/extract/attr.go b/extract/attr.go index c9405ac..90c3a9e 100644 --- a/extract/attr.go +++ b/extract/attr.go @@ -15,8 +15,8 @@ type Attr struct { } // Extract returns HTML attribute value of provided selector -func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) { - attr, exists := doc.Find(e.Selector).Attr(e.Attr) +func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) { + attr, exists := sel.Find(e.Selector).Attr(e.Attr) if !exists { return nil, ErrAttrNotExists } diff --git a/extract/extract.go b/extract/extract.go index b17de05..0342193 100644 --- a/extract/extract.go +++ b/extract/extract.go @@ -4,5 +4,5 @@ import "github.com/PuerkitoBio/goquery" // Extractor interface is for extracting data from HTML document type Extractor interface { - Extract(doc *goquery.Document) (interface{}, error) + Extract(sel *goquery.Selection) (interface{}, error) } diff --git a/extract/html.go b/extract/html.go index e612d9f..56b2d37 100644 --- a/extract/html.go +++ b/extract/html.go @@ -13,11 +13,11 @@ type HTML struct { } // Extract extracts and returns the HTML from inside each element of the given selection. 
-func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) { +func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) { var ret, h string var err error - doc.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool { + sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool { h, err = s.Html() if err != nil { return false @@ -40,9 +40,9 @@ type OuterHTML struct { } // Extract extracts and returns the HTML of each element of the given selection. -func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) { +func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) { output := bytes.NewBufferString("") - for _, node := range doc.Find(e.Selector).Nodes { + for _, node := range sel.Find(e.Selector).Nodes { if err := html.Render(output, node); err != nil { return nil, err } diff --git a/extract/text.go b/extract/text.go index e02803b..9800ef5 100644 --- a/extract/text.go +++ b/extract/text.go @@ -1,14 +1,22 @@ package extract -import "github.com/PuerkitoBio/goquery" +import ( + "github.com/PuerkitoBio/goquery" + "strings" +) // Text returns the combined text contents of provided selector. type Text struct { - Name string - Selector string + Name string + Selector string + TrimSpace bool } // Extract returns the combined text contents of provided selector. 
-func (e *Text) Extract(doc *goquery.Document) (interface{}, error) { - return map[string]string{e.Name: doc.Find(e.Selector).Text()}, nil +func (e Text) Extract(sel *goquery.Selection) (interface{}, error) { + text := sel.Find(e.Selector).Text() + if e.TrimSpace { + text = strings.TrimSpace(text) + } + return map[string]string{e.Name: text}, nil } diff --git a/geziyor_test.go b/geziyor_test.go index 21a9d3c..3418ad3 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -162,13 +162,13 @@ func TestExtractor(t *testing.T) { geziyor.NewGeziyor(&geziyor.Options{ StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, Extractors: []extract.Extractor{ - &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"}, - &extract.Text{Name: "title", Selector: ".c-page-title"}, - &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"}, - &extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"}, - &extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"}, - &extract.Text{Name: "summary", Selector: ".c-entry-summary"}, - &extract.Text{Name: "content", Selector: ".c-entry-content"}, + extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"}, + extract.Text{Name: "title", Selector: ".c-page-title"}, + extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"}, + extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"}, + extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"}, + extract.Text{Name: "summary", Selector: ".c-entry-summary"}, + extract.Text{Name: "content", Selector: ".c-entry-content"}, }, Exporters: []export.Exporter{&export.JSON{}}, }).Start() diff --git a/middleware.go b/middleware.go index 981b834..b8ae850 100644 --- a/middleware.go +++ b/middleware.go @@ -106,7 +106,7 @@ func extractorsMiddleware(g *Geziyor, r 
*client.Response) { exports := map[string]interface{}{} for _, extractor := range g.Opt.Extractors { - extracted, err := extractor.Extract(r.HTMLDoc) + extracted, err := extractor.Extract(r.HTMLDoc.Selection) if err != nil { log.Println("extraction error: ", err) continue