Attribute extractor added. HTML extractor added. Outer HTML extractor added.
exporter package renamed to export, extractor package renamed to extract for simplicity.
This commit is contained in:
parent
7c383b175f
commit
0eda056065
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import "testing"
|
||||
|
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import (
|
||||
"encoding/json"
|
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import "testing"
|
||||
|
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import (
|
||||
"encoding/json"
|
24
extract/attr.go
Normal file
24
extract/attr.go
Normal file
@ -0,0 +1,24 @@
|
||||
package extract
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
var ErrAttrNotExists = errors.New("attribute not exist")
|
||||
|
||||
// Attr returns HTML attribute value of provided selector
|
||||
type Attr struct {
|
||||
Name string
|
||||
Selector string
|
||||
Attr string
|
||||
}
|
||||
|
||||
// Extract returns HTML attribute value of provided selector
|
||||
func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
attr, exists := doc.Find(e.Selector).Attr(e.Attr)
|
||||
if !exists {
|
||||
return nil, ErrAttrNotExists
|
||||
}
|
||||
return map[string]string{e.Name: attr}, nil
|
||||
}
|
53
extract/html.go
Normal file
53
extract/html.go
Normal file
@ -0,0 +1,53 @@
|
||||
package extract
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// HTML extracts and returns the HTML from inside each element of the given selection.
|
||||
type HTML struct {
|
||||
Name string
|
||||
Selector string
|
||||
}
|
||||
|
||||
// Extract extracts and returns the HTML from inside each element of the given selection.
|
||||
func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
var ret, h string
|
||||
var err error
|
||||
|
||||
doc.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
|
||||
h, err = s.Html()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
ret += h
|
||||
return true
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return map[string]string{e.Name: ret}, nil
|
||||
}
|
||||
|
||||
// OuterHTML extracts and returns the HTML of each element of the given selection.
|
||||
type OuterHTML struct {
|
||||
Name string
|
||||
Selector string
|
||||
}
|
||||
|
||||
// Extract extracts and returns the HTML of each element of the given selection.
|
||||
func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
output := bytes.NewBufferString("")
|
||||
for _, node := range doc.Find(e.Selector).Nodes {
|
||||
if err := html.Render(output, node); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return map[string]string{e.Name: output.String()}, nil
|
||||
}
|
14
extract/text.go
Normal file
14
extract/text.go
Normal file
@ -0,0 +1,14 @@
|
||||
package extract
|
||||
|
||||
import "github.com/PuerkitoBio/goquery"
|
||||
|
||||
// Text returns the combined text contents of provided selector.
|
||||
type Text struct {
|
||||
Name string
|
||||
Selector string
|
||||
}
|
||||
|
||||
// Extract returns the combined text contents of provided selector.
|
||||
func (e *Text) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
return map[string]string{e.Name: doc.Find(e.Selector).Text()}, nil
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
package extractor
|
||||
|
||||
import "github.com/PuerkitoBio/goquery"
|
||||
|
||||
// Text extracts texts from selected nodes
|
||||
type Text struct {
|
||||
Name string
|
||||
Selector string
|
||||
}
|
||||
|
||||
// Extract extracts texts from selected nodes
|
||||
func (e *Text) Extract(doc *goquery.Document) interface{} {
|
||||
return map[string]string{e.Name: doc.Find(e.Selector).Text()}
|
||||
}
|
@ -13,7 +13,7 @@ import (
|
||||
|
||||
// Extractor interface is for extracting data from HTML document
|
||||
type Extractor interface {
|
||||
Extract(doc *goquery.Document) interface{}
|
||||
Extract(doc *goquery.Document) (interface{}, error)
|
||||
}
|
||||
|
||||
// Exporter interface is for extracting data to external resources.
|
||||
|
@ -7,8 +7,8 @@ import (
|
||||
"github.com/fpfeng/httpcache"
|
||||
"github.com/geziyor/geziyor"
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"github.com/geziyor/geziyor/exporter"
|
||||
"github.com/geziyor/geziyor/extractor"
|
||||
"github.com/geziyor/geziyor/export"
|
||||
"github.com/geziyor/geziyor/extract"
|
||||
"github.com/geziyor/geziyor/metrics"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||
ParseFunc: quotesParse,
|
||||
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||
}).Start()
|
||||
}
|
||||
|
||||
@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
|
||||
}
|
||||
})
|
||||
},
|
||||
Exporters: []geziyor.Exporter{&exporter.CSV{}},
|
||||
Exporters: []geziyor.Exporter{&export.CSV{}},
|
||||
MetricsType: metrics.Prometheus,
|
||||
}).Start()
|
||||
}
|
||||
@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
|
||||
g.Exports <- s.AttrOr("href", "")
|
||||
})
|
||||
},
|
||||
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||
}).Start()
|
||||
}
|
||||
|
||||
@ -163,12 +163,15 @@ func TestExtractor(t *testing.T) {
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||
Extractors: []geziyor.Extractor{
|
||||
&extractor.Text{Name: "title", Selector: ".c-page-title"},
|
||||
&extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||
&extractor.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||
&extractor.Text{Name: "content", Selector: ".c-entry-content"},
|
||||
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
||||
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||
},
|
||||
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||
}).Start()
|
||||
}
|
||||
|
||||
|
@ -107,7 +107,11 @@ func extractorsMiddleware(g *Geziyor, r *client.Response) {
|
||||
exports := map[string]interface{}{}
|
||||
|
||||
for _, extractor := range g.Opt.Extractors {
|
||||
extracted := extractor.Extract(r.HTMLDoc)
|
||||
extracted, err := extractor.Extract(r.HTMLDoc)
|
||||
if err != nil {
|
||||
log.Println("extraction error: ", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check extracted data type and use it accordingly
|
||||
val := reflect.ValueOf(extracted)
|
||||
|
Loading…
x
Reference in New Issue
Block a user