Attribute extractor added. HTML extractor added. Outer HTML extractor added.

The exporter package was renamed to export and the extractor package to extract, for simplicity.
This commit is contained in:
Musab Gültekin 2019-06-30 22:20:17 +03:00
parent 7c383b175f
commit 0eda056065
12 changed files with 115 additions and 31 deletions

View File

@ -1,4 +1,4 @@
package exporter package export
import ( import (
"encoding/csv" "encoding/csv"

View File

@ -1,4 +1,4 @@
package exporter package export
import "testing" import "testing"

View File

@ -1,4 +1,4 @@
package exporter package export
import ( import (
"encoding/json" "encoding/json"

View File

@ -1,4 +1,4 @@
package exporter package export
import "testing" import "testing"

View File

@ -1,4 +1,4 @@
package exporter package export
import ( import (
"encoding/json" "encoding/json"

24
extract/attr.go Normal file
View File

@ -0,0 +1,24 @@
package extract
import (
"errors"
"github.com/PuerkitoBio/goquery"
)
// ErrAttrNotExists is returned by Attr.Extract when the requested attribute
// is reported as not existing on the selected element.
var ErrAttrNotExists = errors.New("attribute not exist")
// Attr is an extractor that reads the value of an HTML attribute from the
// element(s) matched by a selector.
type Attr struct {
	// Name is the key under which the extracted value is returned.
	Name string
	// Selector is the selector string passed to the document's Find method.
	Selector string
	// Attr is the name of the HTML attribute to read (for example "href").
	Attr string
}
// Extract looks up the attribute e.Attr on the selection matched by
// e.Selector. On success it returns a map[string]string holding the value
// under the key e.Name. If the attribute is reported as missing, it returns
// a nil result together with ErrAttrNotExists.
func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) {
	selection := doc.Find(e.Selector)
	value, found := selection.Attr(e.Attr)
	if !found {
		return nil, ErrAttrNotExists
	}
	result := map[string]string{e.Name: value}
	return result, nil
}

53
extract/html.go Normal file
View File

@ -0,0 +1,53 @@
package extract
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
)
// HTML extracts and returns the HTML from inside each element of the given selection.
type HTML struct {
	// Name is the key under which the extracted HTML is returned.
	Name string
	// Selector is the selector string passed to the document's Find method.
	Selector string
}
// Extract concatenates the inner HTML of every element matched by
// e.Selector, in iteration order, and returns it as a map[string]string
// keyed by e.Name. If obtaining the HTML of any element fails, iteration
// stops and that error is returned with a nil result.
func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) {
	// Accumulate into a buffer instead of using string +=, which re-copies
	// the whole accumulated result for every matched element.
	var buf bytes.Buffer
	var err error
	doc.Find(e.Selector).EachWithBreak(func(_ int, s *goquery.Selection) bool {
		var inner string
		if inner, err = s.Html(); err != nil {
			// Returning false stops the iteration; err is reported below.
			return false
		}
		buf.WriteString(inner)
		return true
	})
	if err != nil {
		return nil, err
	}
	return map[string]string{e.Name: buf.String()}, nil
}
// OuterHTML extracts and returns the HTML of each element of the given selection.
type OuterHTML struct {
	// Name is the key under which the rendered HTML is returned.
	Name string
	// Selector is the selector string passed to the document's Find method.
	Selector string
}
// Extract renders every node matched by e.Selector with html.Render,
// appending each rendering to a single buffer, and returns the combined
// output as a map[string]string keyed by e.Name. The first rendering error
// aborts the loop and is returned with a nil result.
func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) {
	var rendered bytes.Buffer
	nodes := doc.Find(e.Selector).Nodes
	for i := range nodes {
		err := html.Render(&rendered, nodes[i])
		if err != nil {
			return nil, err
		}
	}
	result := map[string]string{e.Name: rendered.String()}
	return result, nil
}

14
extract/text.go Normal file
View File

@ -0,0 +1,14 @@
package extract
import "github.com/PuerkitoBio/goquery"
// Text returns the combined text contents of provided selector.
type Text struct {
	// Name is the key under which the extracted text is returned.
	Name string
	// Selector is the selector string passed to the document's Find method.
	Selector string
}
// Extract reads the text of the selection matched by e.Selector and returns
// it as a map[string]string keyed by e.Name. The returned error is always nil.
func (e *Text) Extract(doc *goquery.Document) (interface{}, error) {
	text := doc.Find(e.Selector).Text()
	result := map[string]string{e.Name: text}
	return result, nil
}

View File

@ -1,14 +0,0 @@
package extractor
import "github.com/PuerkitoBio/goquery"
// Text extracts texts from selected nodes
type Text struct {
	// Name is the key under which the extracted text is returned.
	Name string
	// Selector is the selector string passed to the document's Find method.
	Selector string
}
// Extract reads the text of the selection matched by e.Selector and returns
// it as a map[string]string keyed by e.Name.
func (e *Text) Extract(doc *goquery.Document) interface{} {
	text := doc.Find(e.Selector).Text()
	result := map[string]string{e.Name: text}
	return result
}

View File

@ -13,7 +13,7 @@ import (
// Extractor interface is for extracting data from HTML document // Extractor interface is for extracting data from HTML document
type Extractor interface { type Extractor interface {
Extract(doc *goquery.Document) interface{} Extract(doc *goquery.Document) (interface{}, error)
} }
// Exporter interface is for extracting data to external resources. // Exporter interface is for extracting data to external resources.

View File

@ -7,8 +7,8 @@ import (
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor" "github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/exporter" "github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/extractor" "github.com/geziyor/geziyor/extract"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"}, StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: quotesParse, ParseFunc: quotesParse,
Exporters: []geziyor.Exporter{&exporter.JSON{}}, Exporters: []geziyor.Exporter{&export.JSON{}},
}).Start() }).Start()
} }
@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
} }
}) })
}, },
Exporters: []geziyor.Exporter{&exporter.CSV{}}, Exporters: []geziyor.Exporter{&export.CSV{}},
MetricsType: metrics.Prometheus, MetricsType: metrics.Prometheus,
}).Start() }).Start()
} }
@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
g.Exports <- s.AttrOr("href", "") g.Exports <- s.AttrOr("href", "")
}) })
}, },
Exporters: []geziyor.Exporter{&exporter.JSON{}}, Exporters: []geziyor.Exporter{&export.JSON{}},
}).Start() }).Start()
} }
@ -163,12 +163,15 @@ func TestExtractor(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
Extractors: []geziyor.Extractor{ Extractors: []geziyor.Extractor{
&extractor.Text{Name: "title", Selector: ".c-page-title"}, &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
&extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"}, &extract.Text{Name: "title", Selector: ".c-page-title"},
&extractor.Text{Name: "summary", Selector: ".c-entry-summary"}, &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
&extractor.Text{Name: "content", Selector: ".c-entry-content"}, &extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
&extract.Text{Name: "content", Selector: ".c-entry-content"},
}, },
Exporters: []geziyor.Exporter{&exporter.JSON{}}, Exporters: []geziyor.Exporter{&export.JSON{}},
}).Start() }).Start()
} }

View File

@ -107,7 +107,11 @@ func extractorsMiddleware(g *Geziyor, r *client.Response) {
exports := map[string]interface{}{} exports := map[string]interface{}{}
for _, extractor := range g.Opt.Extractors { for _, extractor := range g.Opt.Extractors {
extracted := extractor.Extract(r.HTMLDoc) extracted, err := extractor.Extract(r.HTMLDoc)
if err != nil {
log.Println("extraction error: ", err)
continue
}
// Check extracted data type and use it accordingly // Check extracted data type and use it accordingly
val := reflect.ValueOf(extracted) val := reflect.ValueOf(extracted)