Exporter and Extractor interfaces moved to their own packages to simplify the main Geziyor package
parent c0dd0393e6
commit 4ab7cfd904
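
In short, the interfaces now live in github.com/geziyor/geziyor/export and github.com/geziyor/geziyor/extract, so downstream code only swaps the slice element types and imports. A minimal sketch of a caller after this commit (quotesParse is a stand-in for the README's parse callback, assumed here):

```go
package main

import (
    "github.com/geziyor/geziyor"
    "github.com/geziyor/geziyor/client"
    "github.com/geziyor/geziyor/export"
)

// quotesParse stands in for the README's parse callback; its body is omitted here.
func quotesParse(g *geziyor.Geziyor, r *client.Response) {}

func main() {
    geziyor.NewGeziyor(&geziyor.Options{
        StartURLs: []string{"http://quotes.toscrape.com/"},
        ParseFunc: quotesParse,
        // Before this commit: Exporters: []geziyor.Exporter{&export.JSON{}}
        Exporters: []export.Exporter{&export.JSON{}},
    }).Start()
}
```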

README.md

@@ -40,7 +40,7 @@ func main() {
     geziyor.NewGeziyor(&geziyor.Options{
         StartURLs: []string{"http://quotes.toscrape.com/"},
         ParseFunc: quotesParse,
-        Exporters: []geziyor.Exporter{export.JSON{}},
+        Exporters: []export.Exporter{export.JSON{}},
     }).Start()
 }
 
@@ -110,7 +110,7 @@ You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor)
 ```go
 geziyor.NewGeziyor(&geziyor.Options{
     StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
-    Extractors: []geziyor.Extractor{
+    Extractors: []extract.Extractor{
         &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
         &extract.Text{Name: "title", Selector: ".c-page-title"},
         &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
@@ -119,7 +119,7 @@ geziyor.NewGeziyor(&geziyor.Options{
         &extract.Text{Name: "summary", Selector: ".c-entry-summary"},
         &extract.Text{Name: "content", Selector: ".c-entry-content"},
     },
-    Exporters: []geziyor.Exporter{&export.JSON{}},
+    Exporters: []export.Exporter{&export.JSON{}},
 }).Start()
 ```
 
@@ -157,7 +157,7 @@ geziyor.NewGeziyor(&geziyor.Options{
             }
         })
     },
-    Exporters: []geziyor.Exporter{&export.JSON{}},
+    Exporters: []export.Exporter{&export.JSON{}},
 }).Start()
 ```
 

export/export.go (new file, 8 lines)

@@ -0,0 +1,8 @@
+package export
+
+// Exporter interface is for extracting data to external resources.
+// Geziyor calls every extractors Export functions before any scraping starts.
+// Export functions should wait for new data from exports chan.
+type Exporter interface {
+    Export(exports chan interface{})
+}
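
The interface keeps the same method set it had inside the geziyor package; a hedged sketch of a custom implementation against it (ConsoleExporter and the sample item below are hypothetical, not part of this commit):

```go
package main

import (
    "fmt"

    "github.com/geziyor/geziyor/export"
)

// ConsoleExporter is a hypothetical exporter used only for illustration;
// it satisfies the relocated export.Exporter interface.
type ConsoleExporter struct{}

// Export drains the exports channel and prints each item, returning once the
// channel is closed, as the interface comment asks.
func (ConsoleExporter) Export(exports chan interface{}) {
    for item := range exports {
        fmt.Println(item)
    }
}

// Compile-time check against the new interface.
var _ export.Exporter = ConsoleExporter{}

func main() {
    exports := make(chan interface{}, 1)
    exports <- map[string]string{"text": "hello"}
    close(exports)
    ConsoleExporter{}.Export(exports)
}
```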

extract/extract.go (new file, 8 lines)

@@ -0,0 +1,8 @@
+package extract
+
+import "github.com/PuerkitoBio/goquery"
+
+// Extractor interface is for extracting data from HTML document
+type Extractor interface {
+    Extract(doc *goquery.Document) (interface{}, error)
+}
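
Likewise on the extract side; a hedged sketch of a custom extractor against the relocated interface (LinkCount and the inline HTML are hypothetical, not part of this commit):

```go
package main

import (
    "fmt"
    "strings"

    "github.com/PuerkitoBio/goquery"
    "github.com/geziyor/geziyor/extract"
)

// LinkCount is a hypothetical extractor used only for illustration;
// it satisfies the relocated extract.Extractor interface.
type LinkCount struct{}

// Extract returns the number of <a> elements in the document.
func (LinkCount) Extract(doc *goquery.Document) (interface{}, error) {
    return map[string]int{"link_count": doc.Find("a").Length()}, nil
}

// Compile-time check against the new interface.
var _ extract.Extractor = LinkCount{}

func main() {
    doc, err := goquery.NewDocumentFromReader(strings.NewReader(`<a href="/a">a</a><a href="/b">b</a>`))
    if err != nil {
        panic(err)
    }
    result, _ := LinkCount{}.Extract(doc)
    fmt.Println(result) // map[link_count:2]
}
```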

geziyor.go (13 lines changed)

@@ -1,7 +1,6 @@
 package geziyor
 
 import (
-    "github.com/PuerkitoBio/goquery"
     "github.com/fpfeng/httpcache"
     "github.com/geziyor/geziyor/client"
     "github.com/geziyor/geziyor/metrics"
@@ -11,18 +10,6 @@ import (
     "sync"
 )
 
-// Extractor interface is for extracting data from HTML document
-type Extractor interface {
-    Extract(doc *goquery.Document) (interface{}, error)
-}
-
-// Exporter interface is for extracting data to external resources.
-// Geziyor calls every extractors Export functions before any scraping starts.
-// Export functions should wait for new data from exports chan.
-type Exporter interface {
-    Export(exports chan interface{})
-}
-
 // Geziyor is our main scraper type
 type Geziyor struct {
     Opt *Options

geziyor_test.go

@@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
     geziyor.NewGeziyor(&geziyor.Options{
         StartURLs: []string{"http://quotes.toscrape.com/"},
         ParseFunc: quotesParse,
-        Exporters: []geziyor.Exporter{&export.JSON{}},
+        Exporters: []export.Exporter{&export.JSON{}},
     }).Start()
 }
 
@@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
                 }
             })
         },
-        Exporters: []geziyor.Exporter{&export.CSV{}},
+        Exporters: []export.Exporter{&export.CSV{}},
         MetricsType: metrics.Prometheus,
     }).Start()
 }
@@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
                 g.Exports <- s.AttrOr("href", "")
             })
         },
-        Exporters: []geziyor.Exporter{&export.JSON{}},
+        Exporters: []export.Exporter{&export.JSON{}},
     }).Start()
 }
 
@@ -162,7 +162,7 @@ func TestBasicAuth(t *testing.T) {
 func TestExtractor(t *testing.T) {
     geziyor.NewGeziyor(&geziyor.Options{
         StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
-        Extractors: []geziyor.Extractor{
+        Extractors: []extract.Extractor{
             &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
             &extract.Text{Name: "title", Selector: ".c-page-title"},
             &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
@@ -171,7 +171,7 @@ func TestExtractor(t *testing.T) {
             &extract.Text{Name: "summary", Selector: ".c-entry-summary"},
             &extract.Text{Name: "content", Selector: ".c-entry-content"},
         },
-        Exporters: []geziyor.Exporter{&export.JSON{}},
+        Exporters: []export.Exporter{&export.JSON{}},
     }).Start()
 }
 
@@ -265,7 +265,7 @@ func BenchmarkWhole(b *testing.B) {
                 }
             })
         },
-        Exporters: []geziyor.Exporter{&export.CSV{}},
+        Exporters: []export.Exporter{&export.CSV{}},
         //MetricsType: metrics.Prometheus,
         LogDisabled: true,
     }).Start()

options.go

@@ -3,6 +3,8 @@ package geziyor
 import (
     "github.com/fpfeng/httpcache"
     "github.com/geziyor/geziyor/client"
+    "github.com/geziyor/geziyor/export"
+    "github.com/geziyor/geziyor/extract"
     "github.com/geziyor/geziyor/metrics"
     "time"
 )
@@ -23,7 +25,7 @@ type Options struct {
     ParseFunc func(g *Geziyor, r *client.Response)
 
     // Extractors extracts items from pages
-    Extractors []Extractor
+    Extractors []extract.Extractor
 
     // Timeout is global request timeout
     Timeout time.Duration
@@ -50,7 +52,7 @@ type Options struct {
     LogDisabled bool
 
     // For extracting data
-    Exporters []Exporter
+    Exporters []export.Exporter
 
     // Called before requests made to manipulate requests
     RequestMiddlewares []RequestMiddleware