Attribute extractor added. HTML extractor added. OuterHTML extractor added.
exporter package renamed to export, extractor package renamed to extract for simplicity.
This commit is contained in:
parent
7c383b175f
commit
0eda056065
@ -1,4 +1,4 @@
|
|||||||
package exporter
|
package export
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/csv"
|
"encoding/csv"
|
@ -1,4 +1,4 @@
|
|||||||
package exporter
|
package export
|
||||||
|
|
||||||
import "testing"
|
import "testing"
|
||||||
|
|
@ -1,4 +1,4 @@
|
|||||||
package exporter
|
package export
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
@ -1,4 +1,4 @@
|
|||||||
package exporter
|
package export
|
||||||
|
|
||||||
import "testing"
|
import "testing"
|
||||||
|
|
@ -1,4 +1,4 @@
|
|||||||
package exporter
|
package export
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
24
extract/attr.go
Normal file
24
extract/attr.go
Normal file
package extract

import (
	"errors"

	"github.com/PuerkitoBio/goquery"
)

// ErrAttrNotExists is returned when the selected element lacks the requested attribute.
var ErrAttrNotExists = errors.New("attribute not exist")

// Attr returns HTML attribute value of provided selector
type Attr struct {
	Name     string
	Selector string
	Attr     string
}

// Extract returns HTML attribute value of provided selector
func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) {
	value, ok := doc.Find(e.Selector).Attr(e.Attr)
	if !ok {
		return nil, ErrAttrNotExists
	}
	return map[string]string{e.Name: value}, nil
}
53
extract/html.go
Normal file
53
extract/html.go
Normal file
package extract

import (
	"bytes"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
)

// HTML extracts and returns the HTML from inside each element of the given selection.
type HTML struct {
	Name     string
	Selector string
}

// Extract extracts and returns the HTML from inside each element of the given
// selection, concatenated in document order and keyed by Name.
// It returns the first error encountered while reading a node's inner HTML.
func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) {
	// strings.Builder avoids the quadratic cost of += concatenation
	// when the selector matches many elements.
	var b strings.Builder
	var err error

	doc.Find(e.Selector).EachWithBreak(func(_ int, s *goquery.Selection) bool {
		var h string
		h, err = s.Html()
		if err != nil {
			return false // stop iterating on the first error
		}
		b.WriteString(h)
		return true
	})
	if err != nil {
		return nil, err
	}
	return map[string]string{e.Name: b.String()}, nil
}

// OuterHTML extracts and returns the HTML of each element of the given selection.
type OuterHTML struct {
	Name     string
	Selector string
}

// Extract extracts and returns the HTML of each element of the given selection
// (including the elements themselves), concatenated and keyed by Name.
func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) {
	var out bytes.Buffer // zero value is ready to use; no NewBufferString needed
	for _, node := range doc.Find(e.Selector).Nodes {
		if err := html.Render(&out, node); err != nil {
			return nil, err
		}
	}
	return map[string]string{e.Name: out.String()}, nil
}
14
extract/text.go
Normal file
14
extract/text.go
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
package extract
|
||||||
|
|
||||||
|
import "github.com/PuerkitoBio/goquery"
|
||||||
|
|
||||||
|
// Text returns the combined text contents of provided selector.
|
||||||
|
type Text struct {
|
||||||
|
Name string
|
||||||
|
Selector string
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract returns the combined text contents of provided selector.
|
||||||
|
func (e *Text) Extract(doc *goquery.Document) (interface{}, error) {
|
||||||
|
return map[string]string{e.Name: doc.Find(e.Selector).Text()}, nil
|
||||||
|
}
|
@ -1,14 +0,0 @@
|
|||||||
package extractor
|
|
||||||
|
|
||||||
import "github.com/PuerkitoBio/goquery"
|
|
||||||
|
|
||||||
// Text extracts texts from selected nodes
|
|
||||||
type Text struct {
|
|
||||||
Name string
|
|
||||||
Selector string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract extracts texts from selected nodes
|
|
||||||
func (e *Text) Extract(doc *goquery.Document) interface{} {
|
|
||||||
return map[string]string{e.Name: doc.Find(e.Selector).Text()}
|
|
||||||
}
|
|
@ -13,7 +13,7 @@ import (
|
|||||||
|
|
||||||
// Extractor interface is for extracting data from HTML document
|
// Extractor interface is for extracting data from HTML document
|
||||||
type Extractor interface {
|
type Extractor interface {
|
||||||
Extract(doc *goquery.Document) interface{}
|
Extract(doc *goquery.Document) (interface{}, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Exporter interface is for extracting data to external resources.
|
// Exporter interface is for extracting data to external resources.
|
||||||
|
@ -7,8 +7,8 @@ import (
|
|||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
"github.com/geziyor/geziyor"
|
"github.com/geziyor/geziyor"
|
||||||
"github.com/geziyor/geziyor/client"
|
"github.com/geziyor/geziyor/client"
|
||||||
"github.com/geziyor/geziyor/exporter"
|
"github.com/geziyor/geziyor/export"
|
||||||
"github.com/geziyor/geziyor/extractor"
|
"github.com/geziyor/geziyor/extract"
|
||||||
"github.com/geziyor/geziyor/metrics"
|
"github.com/geziyor/geziyor/metrics"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
|
|||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||||
ParseFunc: quotesParse,
|
ParseFunc: quotesParse,
|
||||||
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&exporter.CSV{}},
|
Exporters: []geziyor.Exporter{&export.CSV{}},
|
||||||
MetricsType: metrics.Prometheus,
|
MetricsType: metrics.Prometheus,
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
|
|||||||
g.Exports <- s.AttrOr("href", "")
|
g.Exports <- s.AttrOr("href", "")
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -163,12 +163,15 @@ func TestExtractor(t *testing.T) {
|
|||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||||
Extractors: []geziyor.Extractor{
|
Extractors: []geziyor.Extractor{
|
||||||
&extractor.Text{Name: "title", Selector: ".c-page-title"},
|
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||||
&extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"},
|
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||||
&extractor.Text{Name: "summary", Selector: ".c-entry-summary"},
|
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||||
&extractor.Text{Name: "content", Selector: ".c-entry-content"},
|
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||||
|
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
||||||
|
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||||
|
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -107,7 +107,11 @@ func extractorsMiddleware(g *Geziyor, r *client.Response) {
|
|||||||
exports := map[string]interface{}{}
|
exports := map[string]interface{}{}
|
||||||
|
|
||||||
for _, extractor := range g.Opt.Extractors {
|
for _, extractor := range g.Opt.Extractors {
|
||||||
extracted := extractor.Extract(r.HTMLDoc)
|
extracted, err := extractor.Extract(r.HTMLDoc)
|
||||||
|
if err != nil {
|
||||||
|
log.Println("extraction error: ", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
// Check extracted data type and use it accordingly
|
// Check extracted data type and use it accordingly
|
||||||
val := reflect.ValueOf(extracted)
|
val := reflect.ValueOf(extracted)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user