Extractors refactored to support pass by value. Documentation added for request and response.
This commit is contained in:
parent
71683ec6de
commit
da03567fae
@ -98,7 +98,6 @@ func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectD
|
|||||||
response := Response{
|
response := Response{
|
||||||
Response: resp,
|
Response: resp,
|
||||||
Body: body,
|
Body: body,
|
||||||
Meta: req.Meta,
|
|
||||||
Request: req,
|
Request: req,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -161,7 +160,6 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
|
|||||||
Header: ConvertMapToHeader(res.Headers),
|
Header: ConvertMapToHeader(res.Headers),
|
||||||
},
|
},
|
||||||
Body: []byte(body),
|
Body: []byte(body),
|
||||||
Meta: req.Meta,
|
|
||||||
Request: req,
|
Request: req,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -8,11 +8,24 @@ import (
|
|||||||
// Request is a small wrapper around *http.Request that contains Metadata and Rendering option
|
// Request is a small wrapper around *http.Request that contains Metadata and Rendering option
|
||||||
type Request struct {
|
type Request struct {
|
||||||
*http.Request
|
*http.Request
|
||||||
Meta map[string]interface{}
|
|
||||||
|
// Meta contains arbitrary data.
|
||||||
|
// Use this Meta map to store contextual data between your requests
|
||||||
|
Meta map[string]interface{}
|
||||||
|
|
||||||
|
// If true, requests will be synchronized
|
||||||
Synchronized bool
|
Synchronized bool
|
||||||
Rendered bool
|
|
||||||
Cancelled bool
|
// If true, the request will be opened in Chrome and
|
||||||
Encoding string
|
// fully rendered HTML DOM response will be returned as response
|
||||||
|
Rendered bool
|
||||||
|
|
||||||
|
// Optional response body encoding. Leave empty for automatic detection.
|
||||||
|
// If you're having issues with auto detection, set this.
|
||||||
|
Encoding string
|
||||||
|
|
||||||
|
// Set this to true to cancel the request. Should be used in middlewares.
|
||||||
|
Cancelled bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cancel request
|
// Cancel request
|
||||||
|
@ -11,9 +11,13 @@ import (
|
|||||||
// Contains parsed response data and Geziyor functions.
|
// Contains parsed response data and Geziyor functions.
|
||||||
type Response struct {
|
type Response struct {
|
||||||
*http.Response
|
*http.Response
|
||||||
Body []byte
|
|
||||||
|
// Response body
|
||||||
|
Body []byte
|
||||||
|
|
||||||
|
// Goquery Document object. If response IsHTML, it's non-nil.
|
||||||
HTMLDoc *goquery.Document
|
HTMLDoc *goquery.Document
|
||||||
Meta map[string]interface{}
|
|
||||||
Request *Request
|
Request *Request
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,8 +15,8 @@ type Attr struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Extract returns HTML attribute value of provided selector
|
// Extract returns HTML attribute value of provided selector
|
||||||
func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) {
|
func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
|
||||||
attr, exists := doc.Find(e.Selector).Attr(e.Attr)
|
attr, exists := sel.Find(e.Selector).Attr(e.Attr)
|
||||||
if !exists {
|
if !exists {
|
||||||
return nil, ErrAttrNotExists
|
return nil, ErrAttrNotExists
|
||||||
}
|
}
|
||||||
|
@ -4,5 +4,5 @@ import "github.com/PuerkitoBio/goquery"
|
|||||||
|
|
||||||
// Extractor interface is for extracting data from HTML document
|
// Extractor interface is for extracting data from HTML document
|
||||||
type Extractor interface {
|
type Extractor interface {
|
||||||
Extract(doc *goquery.Document) (interface{}, error)
|
Extract(sel *goquery.Selection) (interface{}, error)
|
||||||
}
|
}
|
||||||
|
@ -13,11 +13,11 @@ type HTML struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Extract extracts and returns the HTML from inside each element of the given selection.
|
// Extract extracts and returns the HTML from inside each element of the given selection.
|
||||||
func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) {
|
func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) {
|
||||||
var ret, h string
|
var ret, h string
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
doc.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
|
sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
|
||||||
h, err = s.Html()
|
h, err = s.Html()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false
|
return false
|
||||||
@ -40,9 +40,9 @@ type OuterHTML struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Extract extracts and returns the HTML of each element of the given selection.
|
// Extract extracts and returns the HTML of each element of the given selection.
|
||||||
func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) {
|
func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) {
|
||||||
output := bytes.NewBufferString("")
|
output := bytes.NewBufferString("")
|
||||||
for _, node := range doc.Find(e.Selector).Nodes {
|
for _, node := range sel.Find(e.Selector).Nodes {
|
||||||
if err := html.Render(output, node); err != nil {
|
if err := html.Render(output, node); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -1,14 +1,22 @@
|
|||||||
package extract
|
package extract
|
||||||
|
|
||||||
import "github.com/PuerkitoBio/goquery"
|
import (
|
||||||
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
// Text returns the combined text contents of provided selector.
|
// Text returns the combined text contents of provided selector.
|
||||||
type Text struct {
|
type Text struct {
|
||||||
Name string
|
Name string
|
||||||
Selector string
|
Selector string
|
||||||
|
TrimSpace bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract returns the combined text contents of provided selector.
|
// Extract returns the combined text contents of provided selector.
|
||||||
func (e *Text) Extract(doc *goquery.Document) (interface{}, error) {
|
func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
|
||||||
return map[string]string{e.Name: doc.Find(e.Selector).Text()}, nil
|
text := sel.Find(e.Selector).Text()
|
||||||
|
if e.TrimSpace {
|
||||||
|
text = strings.TrimSpace(text)
|
||||||
|
}
|
||||||
|
return map[string]string{e.Name: text}, nil
|
||||||
}
|
}
|
||||||
|
@ -162,13 +162,13 @@ func TestExtractor(t *testing.T) {
|
|||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||||
Extractors: []extract.Extractor{
|
Extractors: []extract.Extractor{
|
||||||
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||||
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||||
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||||
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||||
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
||||||
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||||
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||||
},
|
},
|
||||||
Exporters: []export.Exporter{&export.JSON{}},
|
Exporters: []export.Exporter{&export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
|
@ -106,7 +106,7 @@ func extractorsMiddleware(g *Geziyor, r *client.Response) {
|
|||||||
exports := map[string]interface{}{}
|
exports := map[string]interface{}{}
|
||||||
|
|
||||||
for _, extractor := range g.Opt.Extractors {
|
for _, extractor := range g.Opt.Extractors {
|
||||||
extracted, err := extractor.Extract(r.HTMLDoc)
|
extracted, err := extractor.Extract(r.HTMLDoc.Selection)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Println("extraction error: ", err)
|
log.Println("extraction error: ", err)
|
||||||
continue
|
continue
|
||||||
|
Loading…
x
Reference in New Issue
Block a user