From 0eda056065f5b67f9e27a27b7a07aa2cd1d14bb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Sun, 30 Jun 2019 22:20:17 +0300 Subject: [PATCH] Attribute extractor added. HTML extractor added. Outer HTML Extractor added. exporter package renamed to export, extractor package renamed to extract for simplicity. --- {exporter => export}/csv.go | 2 +- {exporter => export}/csv_test.go | 2 +- {exporter => export}/json.go | 2 +- {exporter => export}/json_test.go | 2 +- {exporter => export}/pprint.go | 2 +- extract/attr.go | 24 ++++++++++++++ extract/html.go | 53 +++++++++++++++++++++++++++++++ extract/text.go | 14 ++++++++ extractor/text.go | 14 -------- geziyor.go | 2 +- geziyor_test.go | 23 ++++++++------ middleware.go | 6 +++- 12 files changed, 115 insertions(+), 31 deletions(-) rename {exporter => export}/csv.go (98%) rename {exporter => export}/csv_test.go (94%) rename {exporter => export}/json.go (98%) rename {exporter => export}/json_test.go (93%) rename {exporter => export}/pprint.go (95%) create mode 100644 extract/attr.go create mode 100644 extract/html.go create mode 100644 extract/text.go delete mode 100644 extractor/text.go diff --git a/exporter/csv.go b/export/csv.go similarity index 98% rename from exporter/csv.go rename to export/csv.go index 412bddd..e0e01c2 100644 --- a/exporter/csv.go +++ b/export/csv.go @@ -1,4 +1,4 @@ -package exporter +package export import ( "encoding/csv" diff --git a/exporter/csv_test.go b/export/csv_test.go similarity index 94% rename from exporter/csv_test.go rename to export/csv_test.go index 3b7ebea..f11a966 100644 --- a/exporter/csv_test.go +++ b/export/csv_test.go @@ -1,4 +1,4 @@ -package exporter +package export import "testing" diff --git a/exporter/json.go b/export/json.go similarity index 98% rename from exporter/json.go rename to export/json.go index aa01117..3c8d3d0 100644 --- a/exporter/json.go +++ b/export/json.go @@ -1,4 +1,4 @@ -package exporter +package export import ( "encoding/json" diff --git a/exporter/json_test.go b/export/json_test.go similarity index 93% rename from exporter/json_test.go rename to export/json_test.go index 405f86c..70db411 100644 --- a/exporter/json_test.go +++ b/export/json_test.go @@ -1,4 +1,4 @@ -package exporter +package export import "testing" diff --git a/exporter/pprint.go b/export/pprint.go similarity index 95% rename from exporter/pprint.go rename to export/pprint.go index 4deaad4..811c05d 100644 --- a/exporter/pprint.go +++ b/export/pprint.go @@ -1,4 +1,4 @@ -package exporter +package export import ( "encoding/json" diff --git a/extract/attr.go b/extract/attr.go new file mode 100644 index 0000000..c9405ac --- /dev/null +++ b/extract/attr.go @@ -0,0 +1,24 @@ +package extract + +import ( + "errors" + "github.com/PuerkitoBio/goquery" +) + +var ErrAttrNotExists = errors.New("attribute not exist") + +// Attr returns HTML attribute value of provided selector +type Attr struct { + Name string + Selector string + Attr string +} + +// Extract returns HTML attribute value of provided selector +func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) { + attr, exists := doc.Find(e.Selector).Attr(e.Attr) + if !exists { + return nil, ErrAttrNotExists + } + return map[string]string{e.Name: attr}, nil +} diff --git a/extract/html.go b/extract/html.go new file mode 100644 index 0000000..5878dd9 --- /dev/null +++ b/extract/html.go @@ -0,0 +1,53 @@ +package extract + +import ( + "bytes" + "github.com/PuerkitoBio/goquery" + "golang.org/x/net/html" +) + +// HTML extracts and returns the HTML from inside each element of the given selection. +type HTML struct { + Name string + Selector string +} + +// Extract extracts and returns the HTML from inside each element of the given selection. +func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) { + var ret, h string + var err error + + doc.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool { + h, err = s.Html() + if err != nil { + return false + } + + ret += h + return true + }) + + if err != nil { + return nil, err + } + + return map[string]string{e.Name: ret}, nil +} + +// OuterHTML extracts and returns the HTML of each element of the given selection. +type OuterHTML struct { + Name string + Selector string +} + +// Extract extracts and returns the HTML of each element of the given selection. +func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) { + output := bytes.NewBufferString("") + for _, node := range doc.Find(e.Selector).Nodes { + if err := html.Render(output, node); err != nil { + return nil, err + } + } + + return map[string]string{e.Name: output.String()}, nil +} diff --git a/extract/text.go b/extract/text.go new file mode 100644 index 0000000..e02803b --- /dev/null +++ b/extract/text.go @@ -0,0 +1,14 @@ +package extract + +import "github.com/PuerkitoBio/goquery" + +// Text returns the combined text contents of provided selector. +type Text struct { + Name string + Selector string +} + +// Extract returns the combined text contents of provided selector. +func (e *Text) Extract(doc *goquery.Document) (interface{}, error) { + return map[string]string{e.Name: doc.Find(e.Selector).Text()}, nil +} diff --git a/extractor/text.go b/extractor/text.go deleted file mode 100644 index c575c08..0000000 --- a/extractor/text.go +++ /dev/null @@ -1,14 +0,0 @@ -package extractor - -import "github.com/PuerkitoBio/goquery" - -// Text extracts texts from selected nodes -type Text struct { - Name string - Selector string -} - -// Extract extracts texts from selected nodes -func (e *Text) Extract(doc *goquery.Document) interface{} { - return map[string]string{e.Name: doc.Find(e.Selector).Text()} -} diff --git a/geziyor.go b/geziyor.go index 7aa9a1f..0eeb50f 100644 --- a/geziyor.go +++ b/geziyor.go @@ -13,7 +13,7 @@ import ( // Extractor interface is for extracting data from HTML document type Extractor interface { - Extract(doc *goquery.Document) interface{} + Extract(doc *goquery.Document) (interface{}, error) } // Exporter interface is for extracting data to external resources. diff --git a/geziyor_test.go b/geziyor_test.go index 7059e60..286fb39 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -7,8 +7,8 @@ import ( "github.com/fpfeng/httpcache" "github.com/geziyor/geziyor" "github.com/geziyor/geziyor/client" - "github.com/geziyor/geziyor/exporter" - "github.com/geziyor/geziyor/extractor" + "github.com/geziyor/geziyor/export" + "github.com/geziyor/geziyor/extract" "github.com/geziyor/geziyor/metrics" "net/http" "net/http/httptest" @@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) { geziyor.NewGeziyor(&geziyor.Options{ StartURLs: []string{"http://quotes.toscrape.com/"}, ParseFunc: quotesParse, - Exporters: []geziyor.Exporter{&exporter.JSON{}}, + Exporters: []geziyor.Exporter{&export.JSON{}}, }).Start() } @@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) { } }) }, - Exporters: []geziyor.Exporter{&exporter.CSV{}}, + Exporters: []geziyor.Exporter{&export.CSV{}}, MetricsType: metrics.Prometheus, }).Start() } @@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) { g.Exports <- s.AttrOr("href", "") }) }, - Exporters: []geziyor.Exporter{&exporter.JSON{}}, + Exporters: []geziyor.Exporter{&export.JSON{}}, }).Start() } @@ -163,12 +163,15 @@ func TestExtractor(t *testing.T) { geziyor.NewGeziyor(&geziyor.Options{ StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, Extractors: []geziyor.Extractor{ - &extractor.Text{Name: "title", Selector: ".c-page-title"}, - &extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"}, - &extractor.Text{Name: "summary", Selector: ".c-entry-summary"}, - &extractor.Text{Name: "content", Selector: ".c-entry-content"}, + &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"}, + &extract.Text{Name: "title", Selector: ".c-page-title"}, + &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"}, + &extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"}, + &extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"}, + &extract.Text{Name: "summary", Selector: ".c-entry-summary"}, + &extract.Text{Name: "content", Selector: ".c-entry-content"}, }, - Exporters: []geziyor.Exporter{&exporter.JSON{}}, + Exporters: []geziyor.Exporter{&export.JSON{}}, }).Start() } diff --git a/middleware.go b/middleware.go index e1a2cf2..58e7a60 100644 --- a/middleware.go +++ b/middleware.go @@ -107,7 +107,11 @@ func extractorsMiddleware(g *Geziyor, r *client.Response) { exports := map[string]interface{}{} for _, extractor := range g.Opt.Extractors { - extracted := extractor.Extract(r.HTMLDoc) + extracted, err := extractor.Extract(r.HTMLDoc) + if err != nil { + log.Println("extraction error: ", err) + continue + } // Check extracted data type and use it accordingly val := reflect.ValueOf(extracted)