Extractors refactored to support pass by value. Documentation added for request and response.

This commit is contained in:
Musab Gültekin 2019-07-04 02:13:29 +03:00
parent 71683ec6de
commit da03567fae
9 changed files with 51 additions and 28 deletions

View File

@ -98,7 +98,6 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
response := Response{
Response: resp,
Body: body,
Meta: req.Meta,
Request: req,
}
@ -161,7 +160,6 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
Header: ConvertMapToHeader(res.Headers),
},
Body: []byte(body),
Meta: req.Meta,
Request: req,
}

View File

@ -8,11 +8,24 @@ import (
// Request is a small wrapper around *http.Request that adds metadata and rendering options.
type Request struct {
*http.Request
Meta map[string]interface{}
// Meta contains arbitrary data.
// Use this Meta map to store contextual data between your requests
Meta map[string]interface{}
// If true, requests will be synchronized
Synchronized bool
Rendered bool
Cancelled bool
Encoding string
// If true, the request will be opened in Chrome and
// the fully rendered HTML DOM will be returned as the response.
Rendered bool
// Optional response body encoding. Leave empty for automatic detection.
// If you're having issues with auto detection, set this.
Encoding string
// Set this to true to cancel the request. Should be used in middlewares.
Cancelled bool
}
// Cancel request

View File

@ -11,9 +11,13 @@ import (
// Contains parsed response data and Geziyor functions.
type Response struct {
*http.Response
Body []byte
// Response body
Body []byte
// Goquery Document object. If response IsHTML, it's non-nil.
HTMLDoc *goquery.Document
Meta map[string]interface{}
Request *Request
}

View File

@ -15,8 +15,8 @@ type Attr struct {
}
// Extract returns HTML attribute value of provided selector
func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) {
attr, exists := doc.Find(e.Selector).Attr(e.Attr)
func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
attr, exists := sel.Find(e.Selector).Attr(e.Attr)
if !exists {
return nil, ErrAttrNotExists
}

View File

@ -4,5 +4,5 @@ import "github.com/PuerkitoBio/goquery"
// Extractor interface is for extracting data from an HTML document.
type Extractor interface {
Extract(doc *goquery.Document) (interface{}, error)
Extract(sel *goquery.Selection) (interface{}, error)
}

View File

@ -13,11 +13,11 @@ type HTML struct {
}
// Extract extracts and returns the HTML from inside each element of the given selection.
func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) {
func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) {
var ret, h string
var err error
doc.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
h, err = s.Html()
if err != nil {
return false
@ -40,9 +40,9 @@ type OuterHTML struct {
}
// Extract extracts and returns the HTML of each element of the given selection.
func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) {
func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) {
output := bytes.NewBufferString("")
for _, node := range doc.Find(e.Selector).Nodes {
for _, node := range sel.Find(e.Selector).Nodes {
if err := html.Render(output, node); err != nil {
return nil, err
}

View File

@ -1,14 +1,22 @@
package extract
import "github.com/PuerkitoBio/goquery"
import (
"github.com/PuerkitoBio/goquery"
"strings"
)
// Text returns the combined text contents of provided selector.
type Text struct {
Name string
Selector string
Name string
Selector string
TrimSpace bool
}
// Extract returns the combined text contents of provided selector.
func (e *Text) Extract(doc *goquery.Document) (interface{}, error) {
return map[string]string{e.Name: doc.Find(e.Selector).Text()}, nil
func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
text := sel.Find(e.Selector).Text()
if e.TrimSpace {
text = strings.TrimSpace(text)
}
return map[string]string{e.Name: text}, nil
}

View File

@ -162,13 +162,13 @@ func TestExtractor(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
Extractors: []extract.Extractor{
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
&extract.Text{Name: "title", Selector: ".c-page-title"},
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
&extract.Text{Name: "content", Selector: ".c-entry-content"},
extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
extract.Text{Name: "title", Selector: ".c-page-title"},
extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
extract.Text{Name: "summary", Selector: ".c-entry-summary"},
extract.Text{Name: "content", Selector: ".c-entry-content"},
},
Exporters: []export.Exporter{&export.JSON{}},
}).Start()

View File

@ -106,7 +106,7 @@ func extractorsMiddleware(g *Geziyor, r *client.Response) {
exports := map[string]interface{}{}
for _, extractor := range g.Opt.Extractors {
extracted, err := extractor.Extract(r.HTMLDoc)
extracted, err := extractor.Extract(r.HTMLDoc.Selection)
if err != nil {
log.Println("extraction error: ", err)
continue