diff --git a/README.md b/README.md
index 2f6c1b9..fcbfbf5 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ func main() {
     geziyor.NewGeziyor(&geziyor.Options{
         StartURLs: []string{"http://quotes.toscrape.com/"},
         ParseFunc: quotesParse,
-        Exporters: []geziyor.Exporter{export.JSON{}},
+        Exporters: []export.Exporter{export.JSON{}},
     }).Start()
 }
 
@@ -110,7 +110,7 @@ You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor)
 ```go
 geziyor.NewGeziyor(&geziyor.Options{
     StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
-    Extractors: []geziyor.Extractor{
+    Extractors: []extract.Extractor{
         &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
         &extract.Text{Name: "title", Selector: ".c-page-title"},
         &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
@@ -119,7 +119,7 @@ geziyor.NewGeziyor(&geziyor.Options{
         &extract.Text{Name: "summary", Selector: ".c-entry-summary"},
         &extract.Text{Name: "content", Selector: ".c-entry-content"},
     },
-    Exporters: []geziyor.Exporter{&export.JSON{}},
+    Exporters: []export.Exporter{&export.JSON{}},
 }).Start()
 ```
 
@@ -157,7 +157,7 @@ geziyor.NewGeziyor(&geziyor.Options{
             }
         })
     },
-    Exporters: []geziyor.Exporter{&export.JSON{}},
+    Exporters: []export.Exporter{&export.JSON{}},
 }).Start()
 ```
diff --git a/export/export.go b/export/export.go
new file mode 100644
index 0000000..6d8cbf8
--- /dev/null
+++ b/export/export.go
@@ -0,0 +1,8 @@
+package export
+
+// Exporter interface is for extracting data to external resources.
+// Geziyor calls every extractors Export functions before any scraping starts.
+// Export functions should wait for new data from exports chan.
+type Exporter interface {
+    Export(exports chan interface{})
+}
diff --git a/extract/extract.go b/extract/extract.go
new file mode 100644
index 0000000..b17de05
--- /dev/null
+++ b/extract/extract.go
@@ -0,0 +1,8 @@
+package extract
+
+import "github.com/PuerkitoBio/goquery"
+
+// Extractor interface is for extracting data from HTML document
+type Extractor interface {
+    Extract(doc *goquery.Document) (interface{}, error)
+}
diff --git a/geziyor.go b/geziyor.go
index 4967c4f..a60bfec 100644
--- a/geziyor.go
+++ b/geziyor.go
@@ -1,7 +1,6 @@
 package geziyor
 
 import (
-    "github.com/PuerkitoBio/goquery"
     "github.com/fpfeng/httpcache"
     "github.com/geziyor/geziyor/client"
     "github.com/geziyor/geziyor/metrics"
@@ -11,18 +10,6 @@ import (
     "sync"
 )
 
-// Extractor interface is for extracting data from HTML document
-type Extractor interface {
-    Extract(doc *goquery.Document) (interface{}, error)
-}
-
-// Exporter interface is for extracting data to external resources.
-// Geziyor calls every extractors Export functions before any scraping starts.
-// Export functions should wait for new data from exports chan.
-type Exporter interface {
-    Export(exports chan interface{})
-}
-
 // Geziyor is our main scraper type
 type Geziyor struct {
     Opt *Options
diff --git a/geziyor_test.go b/geziyor_test.go
index f36a8ad..6583081 100644
--- a/geziyor_test.go
+++ b/geziyor_test.go
@@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
     geziyor.NewGeziyor(&geziyor.Options{
         StartURLs: []string{"http://quotes.toscrape.com/"},
         ParseFunc: quotesParse,
-        Exporters: []geziyor.Exporter{&export.JSON{}},
+        Exporters: []export.Exporter{&export.JSON{}},
     }).Start()
 }
 
@@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
                 }
             })
         },
-        Exporters:   []geziyor.Exporter{&export.CSV{}},
+        Exporters:   []export.Exporter{&export.CSV{}},
         MetricsType: metrics.Prometheus,
     }).Start()
 }
@@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
                 g.Exports <- s.AttrOr("href", "")
             })
         },
-        Exporters: []geziyor.Exporter{&export.JSON{}},
+        Exporters: []export.Exporter{&export.JSON{}},
     }).Start()
 }
 
@@ -162,7 +162,7 @@ func TestBasicAuth(t *testing.T) {
 func TestExtractor(t *testing.T) {
     geziyor.NewGeziyor(&geziyor.Options{
         StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
-        Extractors: []geziyor.Extractor{
+        Extractors: []extract.Extractor{
             &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
             &extract.Text{Name: "title", Selector: ".c-page-title"},
             &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
@@ -171,7 +171,7 @@ func TestExtractor(t *testing.T) {
             &extract.Text{Name: "summary", Selector: ".c-entry-summary"},
             &extract.Text{Name: "content", Selector: ".c-entry-content"},
         },
-        Exporters: []geziyor.Exporter{&export.JSON{}},
+        Exporters: []export.Exporter{&export.JSON{}},
     }).Start()
 }
 
@@ -265,7 +265,7 @@ func BenchmarkWhole(b *testing.B) {
                 }
             })
         },
-        Exporters: []geziyor.Exporter{&export.CSV{}},
+        Exporters: []export.Exporter{&export.CSV{}},
         //MetricsType: metrics.Prometheus,
         LogDisabled: true,
     }).Start()
diff --git a/options.go b/options.go
index 285c64c..fa6c140 100644
--- a/options.go
+++ b/options.go
@@ -3,6 +3,8 @@ package geziyor
 import (
     "github.com/fpfeng/httpcache"
     "github.com/geziyor/geziyor/client"
+    "github.com/geziyor/geziyor/export"
+    "github.com/geziyor/geziyor/extract"
     "github.com/geziyor/geziyor/metrics"
     "time"
 )
@@ -23,7 +25,7 @@ type Options struct {
     ParseFunc func(g *Geziyor, r *client.Response)
 
     // Extractors extracts items from pages
-    Extractors []Extractor
+    Extractors []extract.Extractor
 
     // Timeout is global request timeout
     Timeout time.Duration
@@ -50,7 +52,7 @@ type Options struct {
     LogDisabled bool
 
     // For extracting data
-    Exporters []Exporter
+    Exporters []export.Exporter
 
     // Called before requests made to manipulate requests
     RequestMiddlewares []RequestMiddleware
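
For context, after this change both the built-in exporters (`export.JSON`, `export.CSV`) and any user-defined exporter satisfy the relocated `export.Exporter` interface, and the `Exporters` option is typed `[]export.Exporter`. Below is a minimal sketch of what a custom exporter could look like against the new package layout; the `stdoutExporter` type and the placeholder item pushed in `ParseFunc` are illustrative assumptions, not part of the diff.

```go
package main

import (
	"fmt"

	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/client"
	"github.com/geziyor/geziyor/export"
)

// stdoutExporter is a hypothetical exporter satisfying export.Exporter.
// Per the interface's doc comment, Export is started before scraping begins
// and should keep waiting for items on the exports channel; ranging over the
// channel does that (assuming Geziyor closes it when the crawl finishes).
type stdoutExporter struct{}

func (e *stdoutExporter) Export(exports chan interface{}) {
	for item := range exports {
		fmt.Println(item)
	}
}

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: []string{"http://quotes.toscrape.com/"},
		ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
			// Anything sent to g.Exports is delivered to every registered exporter.
			g.Exports <- map[string]interface{}{"scraped": true} // placeholder item
		},
		// The field is now typed []export.Exporter instead of []geziyor.Exporter,
		// so custom exporters depend on the export package rather than geziyor itself.
		Exporters: []export.Exporter{&stdoutExporter{}},
	}).Start()
}
```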