Exporter and Extractor interfaces moved to their own packages to simplify the main Geziyor package

This commit is contained in:
Musab Gültekin 2019-07-02 13:22:23 +03:00
parent c0dd0393e6
commit 4ab7cfd904
6 changed files with 30 additions and 25 deletions

View File

@ -40,7 +40,7 @@ func main() {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"}, StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: quotesParse, ParseFunc: quotesParse,
Exporters: []geziyor.Exporter{export.JSON{}}, Exporters: []export.Exporter{export.JSON{}},
}).Start() }).Start()
} }
@ -110,7 +110,7 @@ You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor)
```go ```go
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
Extractors: []geziyor.Extractor{ Extractors: []extract.Extractor{
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"}, &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
&extract.Text{Name: "title", Selector: ".c-page-title"}, &extract.Text{Name: "title", Selector: ".c-page-title"},
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"}, &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
@ -119,7 +119,7 @@ geziyor.NewGeziyor(&geziyor.Options{
&extract.Text{Name: "summary", Selector: ".c-entry-summary"}, &extract.Text{Name: "summary", Selector: ".c-entry-summary"},
&extract.Text{Name: "content", Selector: ".c-entry-content"}, &extract.Text{Name: "content", Selector: ".c-entry-content"},
}, },
Exporters: []geziyor.Exporter{&export.JSON{}}, Exporters: []export.Exporter{&export.JSON{}},
}).Start() }).Start()
``` ```
@ -157,7 +157,7 @@ geziyor.NewGeziyor(&geziyor.Options{
} }
}) })
}, },
Exporters: []geziyor.Exporter{&export.JSON{}}, Exporters: []export.Exporter{&export.JSON{}},
}).Start() }).Start()
``` ```

8
export/export.go Normal file
View File

@ -0,0 +1,8 @@
package export
// Exporter interface is for exporting data to external resources.
// Geziyor calls every exporter's Export function before any scraping starts.
// Export functions should wait for new data from the exports chan.
type Exporter interface {
// Export is handed the exports channel on which new data arrives.
Export(exports chan interface{})
}

8
extract/extract.go Normal file
View File

@ -0,0 +1,8 @@
package extract
import "github.com/PuerkitoBio/goquery"
// Extractor interface is for extracting data from an HTML document.
type Extractor interface {
// Extract takes a goquery document and returns the extracted value,
// or an error.
Extract(doc *goquery.Document) (interface{}, error)
}

View File

@ -1,7 +1,6 @@
package geziyor package geziyor
import ( import (
"github.com/PuerkitoBio/goquery"
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
@ -11,18 +10,6 @@ import (
"sync" "sync"
) )
// Extractor interface is for extracting data from HTML document
type Extractor interface {
Extract(doc *goquery.Document) (interface{}, error)
}
// Exporter interface is for extracting data to external resources.
// Geziyor calls every extractors Export functions before any scraping starts.
// Export functions should wait for new data from exports chan.
type Exporter interface {
Export(exports chan interface{})
}
// Geziyor is our main scraper type // Geziyor is our main scraper type
type Geziyor struct { type Geziyor struct {
Opt *Options Opt *Options

View File

@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"}, StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: quotesParse, ParseFunc: quotesParse,
Exporters: []geziyor.Exporter{&export.JSON{}}, Exporters: []export.Exporter{&export.JSON{}},
}).Start() }).Start()
} }
@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
} }
}) })
}, },
Exporters: []geziyor.Exporter{&export.CSV{}}, Exporters: []export.Exporter{&export.CSV{}},
MetricsType: metrics.Prometheus, MetricsType: metrics.Prometheus,
}).Start() }).Start()
} }
@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
g.Exports <- s.AttrOr("href", "") g.Exports <- s.AttrOr("href", "")
}) })
}, },
Exporters: []geziyor.Exporter{&export.JSON{}}, Exporters: []export.Exporter{&export.JSON{}},
}).Start() }).Start()
} }
@ -162,7 +162,7 @@ func TestBasicAuth(t *testing.T) {
func TestExtractor(t *testing.T) { func TestExtractor(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
Extractors: []geziyor.Extractor{ Extractors: []extract.Extractor{
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"}, &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
&extract.Text{Name: "title", Selector: ".c-page-title"}, &extract.Text{Name: "title", Selector: ".c-page-title"},
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"}, &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
@ -171,7 +171,7 @@ func TestExtractor(t *testing.T) {
&extract.Text{Name: "summary", Selector: ".c-entry-summary"}, &extract.Text{Name: "summary", Selector: ".c-entry-summary"},
&extract.Text{Name: "content", Selector: ".c-entry-content"}, &extract.Text{Name: "content", Selector: ".c-entry-content"},
}, },
Exporters: []geziyor.Exporter{&export.JSON{}}, Exporters: []export.Exporter{&export.JSON{}},
}).Start() }).Start()
} }
@ -265,7 +265,7 @@ func BenchmarkWhole(b *testing.B) {
} }
}) })
}, },
Exporters: []geziyor.Exporter{&export.CSV{}}, Exporters: []export.Exporter{&export.CSV{}},
//MetricsType: metrics.Prometheus, //MetricsType: metrics.Prometheus,
LogDisabled: true, LogDisabled: true,
}).Start() }).Start()

View File

@ -3,6 +3,8 @@ package geziyor
import ( import (
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/extract"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
"time" "time"
) )
@ -23,7 +25,7 @@ type Options struct {
ParseFunc func(g *Geziyor, r *client.Response) ParseFunc func(g *Geziyor, r *client.Response)
// Extractors extracts items from pages // Extractors extracts items from pages
Extractors []Extractor Extractors []extract.Extractor
// Timeout is global request timeout // Timeout is global request timeout
Timeout time.Duration Timeout time.Duration
@ -50,7 +52,7 @@ type Options struct {
LogDisabled bool LogDisabled bool
// For extracting data // For extracting data
Exporters []Exporter Exporters []export.Exporter
// Called before requests made to manipulate requests // Called before requests made to manipulate requests
RequestMiddlewares []RequestMiddleware RequestMiddlewares []RequestMiddleware