Exporter and Extractor interfaces moved to their own packages to simplify the main Geziyor package
This commit is contained in:
parent
c0dd0393e6
commit
4ab7cfd904
@ -40,7 +40,7 @@ func main() {
|
|||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||||
ParseFunc: quotesParse,
|
ParseFunc: quotesParse,
|
||||||
Exporters: []geziyor.Exporter{export.JSON{}},
|
Exporters: []export.Exporter{export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -110,7 +110,7 @@ You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor)
|
|||||||
```go
|
```go
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||||
Extractors: []geziyor.Extractor{
|
Extractors: []extract.Extractor{
|
||||||
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||||
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||||
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||||
@ -119,7 +119,7 @@ geziyor.NewGeziyor(&geziyor.Options{
|
|||||||
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||||
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
Exporters: []export.Exporter{&export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -157,7 +157,7 @@ geziyor.NewGeziyor(&geziyor.Options{
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
Exporters: []export.Exporter{&export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
```
|
```
|
||||||
|
|
||||||
|
8
export/export.go
Normal file
8
export/export.go
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
package export
|
||||||
|
|
||||||
|
// Exporter interface is for exporting data to external resources.
|
||||||
|
// Geziyor calls every exporter's Export function before any scraping starts.
|
||||||
|
// Export functions should wait for new data from exports chan.
|
||||||
|
type Exporter interface {
|
||||||
|
Export(exports chan interface{})
|
||||||
|
}
|
8
extract/extract.go
Normal file
8
extract/extract.go
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
package extract
|
||||||
|
|
||||||
|
import "github.com/PuerkitoBio/goquery"
|
||||||
|
|
||||||
|
// Extractor interface is for extracting data from an HTML document.
|
||||||
|
type Extractor interface {
|
||||||
|
Extract(doc *goquery.Document) (interface{}, error)
|
||||||
|
}
|
13
geziyor.go
13
geziyor.go
@ -1,7 +1,6 @@
|
|||||||
package geziyor
|
package geziyor
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/PuerkitoBio/goquery"
|
|
||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
"github.com/geziyor/geziyor/client"
|
"github.com/geziyor/geziyor/client"
|
||||||
"github.com/geziyor/geziyor/metrics"
|
"github.com/geziyor/geziyor/metrics"
|
||||||
@ -11,18 +10,6 @@ import (
|
|||||||
"sync"
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Extractor interface is for extracting data from HTML document
|
|
||||||
type Extractor interface {
|
|
||||||
Extract(doc *goquery.Document) (interface{}, error)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Exporter interface is for extracting data to external resources.
|
|
||||||
// Geziyor calls every extractors Export functions before any scraping starts.
|
|
||||||
// Export functions should wait for new data from exports chan.
|
|
||||||
type Exporter interface {
|
|
||||||
Export(exports chan interface{})
|
|
||||||
}
|
|
||||||
|
|
||||||
// Geziyor is our main scraper type
|
// Geziyor is our main scraper type
|
||||||
type Geziyor struct {
|
type Geziyor struct {
|
||||||
Opt *Options
|
Opt *Options
|
||||||
|
@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
|
|||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||||
ParseFunc: quotesParse,
|
ParseFunc: quotesParse,
|
||||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
Exporters: []export.Exporter{&export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&export.CSV{}},
|
Exporters: []export.Exporter{&export.CSV{}},
|
||||||
MetricsType: metrics.Prometheus,
|
MetricsType: metrics.Prometheus,
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
|
|||||||
g.Exports <- s.AttrOr("href", "")
|
g.Exports <- s.AttrOr("href", "")
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
Exporters: []export.Exporter{&export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -162,7 +162,7 @@ func TestBasicAuth(t *testing.T) {
|
|||||||
func TestExtractor(t *testing.T) {
|
func TestExtractor(t *testing.T) {
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||||
Extractors: []geziyor.Extractor{
|
Extractors: []extract.Extractor{
|
||||||
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||||
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||||
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||||
@ -171,7 +171,7 @@ func TestExtractor(t *testing.T) {
|
|||||||
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||||
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
Exporters: []export.Exporter{&export.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -265,7 +265,7 @@ func BenchmarkWhole(b *testing.B) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&export.CSV{}},
|
Exporters: []export.Exporter{&export.CSV{}},
|
||||||
//MetricsType: metrics.Prometheus,
|
//MetricsType: metrics.Prometheus,
|
||||||
LogDisabled: true,
|
LogDisabled: true,
|
||||||
}).Start()
|
}).Start()
|
||||||
|
@ -3,6 +3,8 @@ package geziyor
|
|||||||
import (
|
import (
|
||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
"github.com/geziyor/geziyor/client"
|
"github.com/geziyor/geziyor/client"
|
||||||
|
"github.com/geziyor/geziyor/export"
|
||||||
|
"github.com/geziyor/geziyor/extract"
|
||||||
"github.com/geziyor/geziyor/metrics"
|
"github.com/geziyor/geziyor/metrics"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@ -23,7 +25,7 @@ type Options struct {
|
|||||||
ParseFunc func(g *Geziyor, r *client.Response)
|
ParseFunc func(g *Geziyor, r *client.Response)
|
||||||
|
|
||||||
// Extractors extracts items from pages
|
// Extractors extracts items from pages
|
||||||
Extractors []Extractor
|
Extractors []extract.Extractor
|
||||||
|
|
||||||
// Timeout is global request timeout
|
// Timeout is global request timeout
|
||||||
Timeout time.Duration
|
Timeout time.Duration
|
||||||
@ -50,7 +52,7 @@ type Options struct {
|
|||||||
LogDisabled bool
|
LogDisabled bool
|
||||||
|
|
||||||
// For exporting data
|
// For exporting data
|
||||||
Exporters []Exporter
|
Exporters []export.Exporter
|
||||||
|
|
||||||
// Called before requests made to manipulate requests
|
// Called before requests made to manipulate requests
|
||||||
RequestMiddlewares []RequestMiddleware
|
RequestMiddlewares []RequestMiddleware
|
||||||
|
Loading…
x
Reference in New Issue
Block a user