Extractors refactored to support pass by value. Documentation added for request and response.

2019-07-04 02:13:29 +03:00
parent 71683ec6de
commit da03567fae
9 changed files with 51 additions and 28 deletions
@@ -98,7 +98,6 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
 	response := Response{
 		Response: resp,
 		Body:     body,
-		Meta:     req.Meta,
 		Request:  req,
 	}

@@ -161,7 +160,6 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
 			Header:     ConvertMapToHeader(res.Headers),
 		},
 		Body:    []byte(body),
-		Meta:    req.Meta,
 		Request: req,
 	}

@@ -8,11 +8,24 @@ import (
 // Request is a small wrapper around *http.Request that contains Metadata and Rendering option
 type Request struct {
 	*http.Request
-	Meta         map[string]interface{}
+
+	// Meta contains arbitrary data.
+	// Use this Meta map to store contextual data between your requests
+	Meta map[string]interface{}
+
+	// If true, requests will be synchronized
 	Synchronized bool
-	Rendered     bool
-	Cancelled    bool
-	Encoding     string
+
+	// If true, the request will be opened in Chrome and the
+	// fully rendered HTML DOM will be returned as the response.
+	Rendered bool
+
+	// Optional response body encoding. Leave empty for automatic detection.
+	// If you're having issues with auto detection, set this.
+	Encoding string
+
+	// Set this to true to cancel the request. Should be used in middlewares.
+	Cancelled bool
 }

 // Cancel request
@@ -11,9 +11,13 @@ import (
 // Contains parsed response data and Geziyor functions.
 type Response struct {
 	*http.Response
-	Body    []byte
+
+	// Response body
+	Body []byte
+
+	// Goquery Document object. If response IsHTML, it is non-nil.
 	HTMLDoc *goquery.Document
-	Meta    map[string]interface{}
+
 	Request *Request
 }

@@ -15,8 +15,8 @@ type Attr struct {
 }

 // Extract returns HTML attribute value of provided selector
-func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) {
-	attr, exists := doc.Find(e.Selector).Attr(e.Attr)
+func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
+	attr, exists := sel.Find(e.Selector).Attr(e.Attr)
 	if !exists {
 		return nil, ErrAttrNotExists
 	}
@@ -4,5 +4,5 @@ import "github.com/PuerkitoBio/goquery"

 // Extractor interface is for extracting data from HTML document
 type Extractor interface {
-	Extract(doc *goquery.Document) (interface{}, error)
+	Extract(sel *goquery.Selection) (interface{}, error)
 }
@@ -13,11 +13,11 @@ type HTML struct {
 }

 // Extract extracts and returns the HTML from inside each element of the given selection.
-func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) {
+func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) {
 	var ret, h string
 	var err error

-	doc.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
+	sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
 		h, err = s.Html()
 		if err != nil {
 			return false
@@ -40,9 +40,9 @@ type OuterHTML struct {
 }

 // Extract extracts and returns the HTML of each element of the given selection.
-func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) {
+func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) {
 	output := bytes.NewBufferString("")
-	for _, node := range doc.Find(e.Selector).Nodes {
+	for _, node := range sel.Find(e.Selector).Nodes {
 		if err := html.Render(output, node); err != nil {
 			return nil, err
 		}
@@ -1,14 +1,22 @@
 package extract

-import "github.com/PuerkitoBio/goquery"
+import (
+	"github.com/PuerkitoBio/goquery"
+	"strings"
+)

 // Text returns the combined text contents of provided selector.
 type Text struct {
-	Name     string
-	Selector string
+	Name      string
+	Selector  string
+	TrimSpace bool
 }

 // Extract returns the combined text contents of provided selector.
-func (e *Text) Extract(doc *goquery.Document) (interface{}, error) {
-	return map[string]string{e.Name: doc.Find(e.Selector).Text()}, nil
+func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
+	text := sel.Find(e.Selector).Text()
+	if e.TrimSpace {
+		text = strings.TrimSpace(text)
+	}
+	return map[string]string{e.Name: text}, nil
 }
@@ -162,13 +162,13 @@ func TestExtractor(t *testing.T) {
 	geziyor.NewGeziyor(&geziyor.Options{
 		StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
 		Extractors: []extract.Extractor{
-			&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
-			&extract.Text{Name: "title", Selector: ".c-page-title"},
-			&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
-			&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
-			&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
-			&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
-			&extract.Text{Name: "content", Selector: ".c-entry-content"},
+			extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
+			extract.Text{Name: "title", Selector: ".c-page-title"},
+			extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
+			extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
+			extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
+			extract.Text{Name: "summary", Selector: ".c-entry-summary"},
+			extract.Text{Name: "content", Selector: ".c-entry-content"},
 		},
 		Exporters: []export.Exporter{&export.JSON{}},
 	}).Start()
@@ -106,7 +106,7 @@ func extractorsMiddleware(g *Geziyor, r *client.Response) {
 		exports := map[string]interface{}{}

 		for _, extractor := range g.Opt.Extractors {
-			extracted, err := extractor.Extract(r.HTMLDoc)
+			extracted, err := extractor.Extract(r.HTMLDoc.Selection)
 			if err != nil {
 				log.Println("extraction error: ", err)
 				continue