Extractors refactored to support pass by value. Documentation added for request and response.
This commit is contained in:
parent
71683ec6de
commit
da03567fae
@ -98,7 +98,6 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
|
||||
response := Response{
|
||||
Response: resp,
|
||||
Body: body,
|
||||
Meta: req.Meta,
|
||||
Request: req,
|
||||
}
|
||||
|
||||
@ -161,7 +160,6 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
|
||||
Header: ConvertMapToHeader(res.Headers),
|
||||
},
|
||||
Body: []byte(body),
|
||||
Meta: req.Meta,
|
||||
Request: req,
|
||||
}
|
||||
|
||||
|
@ -8,11 +8,24 @@ import (
|
||||
// Request is a small wrapper around *http.Request that contains metadata and rendering options.
|
||||
type Request struct {
|
||||
*http.Request
|
||||
|
||||
// Meta contains arbitrary data.
|
||||
// Use this Meta map to store contextual data between your requests
|
||||
Meta map[string]interface{}
|
||||
|
||||
// If true, requests will be synchronized
|
||||
Synchronized bool
|
||||
|
||||
// If true, the request will be opened in Chrome and
|
||||
// the fully rendered HTML DOM will be returned as the response.
|
||||
Rendered bool
|
||||
Cancelled bool
|
||||
|
||||
// Optional response body encoding. Leave empty for automatic detection.
|
||||
// If you're having issues with auto detection, set this.
|
||||
Encoding string
|
||||
|
||||
// Set this to true to cancel the request. Should be used in middlewares.
|
||||
Cancelled bool
|
||||
}
|
||||
|
||||
// Cancel request
|
||||
|
@ -11,9 +11,13 @@ import (
|
||||
// Response contains parsed response data and Geziyor functions.
|
||||
type Response struct {
|
||||
*http.Response
|
||||
|
||||
// Response body
|
||||
Body []byte
|
||||
|
||||
// Goquery Document object. If response IsHTML, it's non-nil.
|
||||
HTMLDoc *goquery.Document
|
||||
Meta map[string]interface{}
|
||||
|
||||
Request *Request
|
||||
}
|
||||
|
||||
|
@ -15,8 +15,8 @@ type Attr struct {
|
||||
}
|
||||
|
||||
// Extract returns the HTML attribute value of the provided selector.
|
||||
func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
attr, exists := doc.Find(e.Selector).Attr(e.Attr)
|
||||
func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
|
||||
attr, exists := sel.Find(e.Selector).Attr(e.Attr)
|
||||
if !exists {
|
||||
return nil, ErrAttrNotExists
|
||||
}
|
||||
|
@ -4,5 +4,5 @@ import "github.com/PuerkitoBio/goquery"
|
||||
|
||||
// Extractor is the interface for extracting data from an HTML document.
|
||||
type Extractor interface {
|
||||
Extract(doc *goquery.Document) (interface{}, error)
|
||||
Extract(sel *goquery.Selection) (interface{}, error)
|
||||
}
|
||||
|
@ -13,11 +13,11 @@ type HTML struct {
|
||||
}
|
||||
|
||||
// Extract extracts and returns the HTML from inside each element of the given selection.
|
||||
func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) {
|
||||
var ret, h string
|
||||
var err error
|
||||
|
||||
doc.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
|
||||
sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
|
||||
h, err = s.Html()
|
||||
if err != nil {
|
||||
return false
|
||||
@ -40,9 +40,9 @@ type OuterHTML struct {
|
||||
}
|
||||
|
||||
// Extract extracts and returns the HTML of each element of the given selection.
|
||||
func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) {
|
||||
output := bytes.NewBufferString("")
|
||||
for _, node := range doc.Find(e.Selector).Nodes {
|
||||
for _, node := range sel.Find(e.Selector).Nodes {
|
||||
if err := html.Render(output, node); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -1,14 +1,22 @@
|
||||
package extract
|
||||
|
||||
import "github.com/PuerkitoBio/goquery"
|
||||
import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Text returns the combined text contents of the provided selector.
|
||||
type Text struct {
|
||||
Name string
|
||||
Selector string
|
||||
TrimSpace bool
|
||||
}
|
||||
|
||||
// Extract returns the combined text contents of the provided selector.
|
||||
func (e *Text) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
return map[string]string{e.Name: doc.Find(e.Selector).Text()}, nil
|
||||
func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
|
||||
text := sel.Find(e.Selector).Text()
|
||||
if e.TrimSpace {
|
||||
text = strings.TrimSpace(text)
|
||||
}
|
||||
return map[string]string{e.Name: text}, nil
|
||||
}
|
||||
|
@ -162,13 +162,13 @@ func TestExtractor(t *testing.T) {
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||
Extractors: []extract.Extractor{
|
||||
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
||||
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||
extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||
extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||
extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||
extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||
extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
||||
extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||
extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||
},
|
||||
Exporters: []export.Exporter{&export.JSON{}},
|
||||
}).Start()
|
||||
|
@ -106,7 +106,7 @@ func extractorsMiddleware(g *Geziyor, r *client.Response) {
|
||||
exports := map[string]interface{}{}
|
||||
|
||||
for _, extractor := range g.Opt.Extractors {
|
||||
extracted, err := extractor.Extract(r.HTMLDoc)
|
||||
extracted, err := extractor.Extract(r.HTMLDoc.Selection)
|
||||
if err != nil {
|
||||
log.Println("extraction error: ", err)
|
||||
continue
|
||||
|
Loading…
x
Reference in New Issue
Block a user