Attribute extractor added. HTML extractor added. Outer HTML extractor added.
exporter package renamed to export, extractor package renamed to extract for simplicity.
This commit is contained in:
parent
7c383b175f
commit
0eda056065
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import "testing"
|
||||
|
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import (
|
||||
"encoding/json"
|
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import "testing"
|
||||
|
@ -1,4 +1,4 @@
|
||||
package exporter
|
||||
package export
|
||||
|
||||
import (
|
||||
"encoding/json"
|
24
extract/attr.go
Normal file
24
extract/attr.go
Normal file
@ -0,0 +1,24 @@
|
||||
package extract
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
var ErrAttrNotExists = errors.New("attribute not exist")
|
||||
|
||||
// Attr returns HTML attribute value of provided selector
|
||||
type Attr struct {
|
||||
Name string
|
||||
Selector string
|
||||
Attr string
|
||||
}
|
||||
|
||||
// Extract returns HTML attribute value of provided selector
|
||||
func (e *Attr) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
attr, exists := doc.Find(e.Selector).Attr(e.Attr)
|
||||
if !exists {
|
||||
return nil, ErrAttrNotExists
|
||||
}
|
||||
return map[string]string{e.Name: attr}, nil
|
||||
}
|
53
extract/html.go
Normal file
53
extract/html.go
Normal file
@ -0,0 +1,53 @@
|
||||
package extract
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// HTML extracts and returns the HTML from inside each element of the given selection.
|
||||
type HTML struct {
|
||||
Name string
|
||||
Selector string
|
||||
}
|
||||
|
||||
// Extract extracts and returns the HTML from inside each element of the given selection.
|
||||
func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
var ret, h string
|
||||
var err error
|
||||
|
||||
doc.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
|
||||
h, err = s.Html()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
ret += h
|
||||
return true
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return map[string]string{e.Name: ret}, nil
|
||||
}
|
||||
|
||||
// OuterHTML extracts and returns the HTML of each element of the given selection.
|
||||
type OuterHTML struct {
|
||||
Name string
|
||||
Selector string
|
||||
}
|
||||
|
||||
// Extract extracts and returns the HTML of each element of the given selection.
|
||||
func (e *OuterHTML) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
output := bytes.NewBufferString("")
|
||||
for _, node := range doc.Find(e.Selector).Nodes {
|
||||
if err := html.Render(output, node); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return map[string]string{e.Name: output.String()}, nil
|
||||
}
|
14
extract/text.go
Normal file
14
extract/text.go
Normal file
@ -0,0 +1,14 @@
|
||||
package extract
|
||||
|
||||
import "github.com/PuerkitoBio/goquery"
|
||||
|
||||
// Text returns the combined text contents of provided selector.
|
||||
type Text struct {
|
||||
Name string
|
||||
Selector string
|
||||
}
|
||||
|
||||
// Extract returns the combined text contents of provided selector.
|
||||
func (e *Text) Extract(doc *goquery.Document) (interface{}, error) {
|
||||
return map[string]string{e.Name: doc.Find(e.Selector).Text()}, nil
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
package extractor
|
||||
|
||||
import "github.com/PuerkitoBio/goquery"
|
||||
|
||||
// Text extracts texts from selected nodes
|
||||
type Text struct {
|
||||
Name string
|
||||
Selector string
|
||||
}
|
||||
|
||||
// Extract extracts texts from selected nodes
|
||||
func (e *Text) Extract(doc *goquery.Document) interface{} {
|
||||
return map[string]string{e.Name: doc.Find(e.Selector).Text()}
|
||||
}
|
@ -13,7 +13,7 @@ import (
|
||||
|
||||
// Extractor interface is for extracting data from HTML document
|
||||
type Extractor interface {
|
||||
Extract(doc *goquery.Document) interface{}
|
||||
Extract(doc *goquery.Document) (interface{}, error)
|
||||
}
|
||||
|
||||
// Exporter interface is for extracting data to external resources.
|
||||
|
@ -7,8 +7,8 @@ import (
|
||||
"github.com/fpfeng/httpcache"
|
||||
"github.com/geziyor/geziyor"
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"github.com/geziyor/geziyor/exporter"
|
||||
"github.com/geziyor/geziyor/extractor"
|
||||
"github.com/geziyor/geziyor/export"
|
||||
"github.com/geziyor/geziyor/extract"
|
||||
"github.com/geziyor/geziyor/metrics"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||
ParseFunc: quotesParse,
|
||||
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||
}).Start()
|
||||
}
|
||||
|
||||
@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
|
||||
}
|
||||
})
|
||||
},
|
||||
Exporters: []geziyor.Exporter{&exporter.CSV{}},
|
||||
Exporters: []geziyor.Exporter{&export.CSV{}},
|
||||
MetricsType: metrics.Prometheus,
|
||||
}).Start()
|
||||
}
|
||||
@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
|
||||
g.Exports <- s.AttrOr("href", "")
|
||||
})
|
||||
},
|
||||
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||
}).Start()
|
||||
}
|
||||
|
||||
@ -163,12 +163,15 @@ func TestExtractor(t *testing.T) {
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||
Extractors: []geziyor.Extractor{
|
||||
&extractor.Text{Name: "title", Selector: ".c-page-title"},
|
||||
&extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||
&extractor.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||
&extractor.Text{Name: "content", Selector: ".c-entry-content"},
|
||||
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
||||
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||
},
|
||||
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||
}).Start()
|
||||
}
|
||||
|
||||
|
@ -107,7 +107,11 @@ func extractorsMiddleware(g *Geziyor, r *client.Response) {
|
||||
exports := map[string]interface{}{}
|
||||
|
||||
for _, extractor := range g.Opt.Extractors {
|
||||
extracted := extractor.Extract(r.HTMLDoc)
|
||||
extracted, err := extractor.Extract(r.HTMLDoc)
|
||||
if err != nil {
|
||||
log.Println("extraction error: ", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check extracted data type and use it accordingly
|
||||
val := reflect.ValueOf(extracted)
|
||||
|
Loading…
x
Reference in New Issue
Block a user