Exporter and Extractor interfaces moved to their own packages to simplify the main Geziyor package

Musab Gültekin 2019-07-02 13:22:23 +03:00
parent c0dd0393e6
commit 4ab7cfd904
6 changed files with 30 additions and 25 deletions

View File

@ -40,7 +40,7 @@ func main() {
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: quotesParse,
Exporters: []geziyor.Exporter{export.JSON{}},
Exporters: []export.Exporter{export.JSON{}},
}).Start()
}
@ -110,7 +110,7 @@ You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor)
```go
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
Extractors: []geziyor.Extractor{
Extractors: []extract.Extractor{
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
&extract.Text{Name: "title", Selector: ".c-page-title"},
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
@ -119,7 +119,7 @@ geziyor.NewGeziyor(&geziyor.Options{
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
&extract.Text{Name: "content", Selector: ".c-entry-content"},
},
Exporters: []geziyor.Exporter{&export.JSON{}},
Exporters: []export.Exporter{&export.JSON{}},
}).Start()
```
@ -157,7 +157,7 @@ geziyor.NewGeziyor(&geziyor.Options{
}
})
},
Exporters: []geziyor.Exporter{&export.JSON{}},
Exporters: []export.Exporter{&export.JSON{}},
}).Start()
```

export/export.go Normal file
View File

@ -0,0 +1,8 @@
package export
// Exporter interface is for exporting data to external resources.
// Geziyor calls every exporter's Export function before any scraping starts.
// Export functions should wait for new data from the exports chan.
type Exporter interface {
Export(exports chan interface{})
}
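
The Exporter interface now lives in the export package. As a rough sketch of what a custom implementation might look like (the StdoutExporter type and its demo main below are hypothetical and not part of this commit), an exporter only needs to drain the exports channel:

```go
package main

import "fmt"

// StdoutExporter is a hypothetical exporter, shown only to illustrate the
// export.Exporter interface above; it is not part of Geziyor itself.
type StdoutExporter struct{}

// Export drains the exports channel and prints each scraped item. Geziyor
// starts exporters before scraping and feeds the channel as items arrive.
func (e *StdoutExporter) Export(exports chan interface{}) {
	for item := range exports {
		fmt.Printf("exported: %+v\n", item)
	}
}

func main() {
	// Stand-alone demonstration without the scraper: fill a channel by hand.
	exports := make(chan interface{}, 2)
	exports <- map[string]interface{}{"title": "example"}
	exports <- map[string]interface{}{"title": "another"}
	close(exports)
	(&StdoutExporter{}).Export(exports)
}
```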

extract/extract.go Normal file
View File

@ -0,0 +1,8 @@
package extract
import "github.com/PuerkitoBio/goquery"
// Extractor interface is for extracting data from an HTML document.
type Extractor interface {
Extract(doc *goquery.Document) (interface{}, error)
}
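
Likewise, the Extractor interface now lives in the extract package. A minimal hypothetical implementation (the LinkCount type below is illustrative only, not part of this commit) just has to satisfy the goquery-based signature:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// LinkCount is a hypothetical extractor, shown only to illustrate the
// extract.Extractor interface above; it is not part of Geziyor itself.
type LinkCount struct {
	Name string
}

// Extract counts the <a> elements in the parsed document and returns the
// result keyed by Name, matching the (interface{}, error) contract.
func (l *LinkCount) Extract(doc *goquery.Document) (interface{}, error) {
	return map[string]interface{}{l.Name: doc.Find("a").Length()}, nil
}

func main() {
	// Stand-alone demonstration on a small in-memory document.
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(
		`<html><body><a href="/a">a</a><a href="/b">b</a></body></html>`))
	if err != nil {
		panic(err)
	}
	out, _ := (&LinkCount{Name: "links"}).Extract(doc)
	fmt.Println(out) // map[links:2]
}
```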

View File

@ -1,7 +1,6 @@
package geziyor
import (
"github.com/PuerkitoBio/goquery"
"github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/metrics"
@ -11,18 +10,6 @@ import (
"sync"
)
// Extractor interface is for extracting data from an HTML document.
type Extractor interface {
Extract(doc *goquery.Document) (interface{}, error)
}
// Exporter interface is for exporting data to external resources.
// Geziyor calls every exporter's Export function before any scraping starts.
// Export functions should wait for new data from the exports chan.
type Exporter interface {
Export(exports chan interface{})
}
// Geziyor is our main scraper type
type Geziyor struct {
Opt *Options

View File

@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: quotesParse,
Exporters: []geziyor.Exporter{&export.JSON{}},
Exporters: []export.Exporter{&export.JSON{}},
}).Start()
}
@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
}
})
},
Exporters: []geziyor.Exporter{&export.CSV{}},
Exporters: []export.Exporter{&export.CSV{}},
MetricsType: metrics.Prometheus,
}).Start()
}
@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
g.Exports <- s.AttrOr("href", "")
})
},
Exporters: []geziyor.Exporter{&export.JSON{}},
Exporters: []export.Exporter{&export.JSON{}},
}).Start()
}
@ -162,7 +162,7 @@ func TestBasicAuth(t *testing.T) {
func TestExtractor(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
Extractors: []geziyor.Extractor{
Extractors: []extract.Extractor{
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
&extract.Text{Name: "title", Selector: ".c-page-title"},
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
@ -171,7 +171,7 @@ func TestExtractor(t *testing.T) {
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
&extract.Text{Name: "content", Selector: ".c-entry-content"},
},
Exporters: []geziyor.Exporter{&export.JSON{}},
Exporters: []export.Exporter{&export.JSON{}},
}).Start()
}
@ -265,7 +265,7 @@ func BenchmarkWhole(b *testing.B) {
}
})
},
Exporters: []geziyor.Exporter{&export.CSV{}},
Exporters: []export.Exporter{&export.CSV{}},
//MetricsType: metrics.Prometheus,
LogDisabled: true,
}).Start()

View File

@ -3,6 +3,8 @@ package geziyor
import (
"github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/extract"
"github.com/geziyor/geziyor/metrics"
"time"
)
@ -23,7 +25,7 @@ type Options struct {
ParseFunc func(g *Geziyor, r *client.Response)
// Extractors extracts items from pages
Extractors []Extractor
Extractors []extract.Extractor
// Timeout is global request timeout
Timeout time.Duration
@ -50,7 +52,7 @@ type Options struct {
LogDisabled bool
// For exporting data
Exporters []Exporter
Exporters []export.Exporter
// Called before requests made to manipulate requests
RequestMiddlewares []RequestMiddleware
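
With these Options fields retyped, caller code now references the new packages for both slices. A minimal sketch of post-commit usage, reusing the built-in extract.Text and export.JSON shown in the README (the "title" selector here is only a placeholder):

```go
package main

import (
	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/export"
	"github.com/geziyor/geziyor/extract"
)

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: []string{"http://quotes.toscrape.com/"},
		// Extractors and Exporters are now typed against the extract and
		// export packages instead of the root geziyor package.
		Extractors: []extract.Extractor{
			&extract.Text{Name: "title", Selector: "title"},
		},
		Exporters: []export.Exporter{&export.JSON{}},
	}).Start()
}
```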