Extractors implemented. Exporters name simplified. README Updated for extracting data. Removed go 1.11 support
This commit is contained in:
parent
679fd8ab7a
commit
b000581c3d
3
.gitignore
vendored
3
.gitignore
vendored
@ -19,3 +19,6 @@
|
|||||||
|
|
||||||
# Output files
|
# Output files
|
||||||
out.*
|
out.*
|
||||||
|
|
||||||
|
# Cache files
|
||||||
|
.cache/
|
@ -1,8 +1,7 @@
|
|||||||
language: go
|
language: go
|
||||||
|
|
||||||
go:
|
go:
|
||||||
- 1.10.x
|
- 1.12.x
|
||||||
- 1.11.x
|
|
||||||
- tip
|
- tip
|
||||||
|
|
||||||
env:
|
env:
|
||||||
|
41
README.md
41
README.md
@ -39,7 +39,7 @@ func main() {
|
|||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||||
ParseFunc: quotesParse,
|
ParseFunc: quotesParse,
|
||||||
Exporters: []geziyor.Exporter{exporter.JSONExporter{}},
|
Exporters: []geziyor.Exporter{exporter.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -100,6 +100,43 @@ geziyor.NewGeziyor(&geziyor.Options{
|
|||||||
}).Start()
|
}).Start()
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Extracting Data
|
||||||
|
|
||||||
|
#### Extractors
|
||||||
|
You can add an [Extractor]() to the `Extractors` option to extract structured data.
|
||||||
|
Exporters need to be defined in order for extractors to work.
|
||||||
|
|
||||||
|
```go
|
||||||
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
|
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||||
|
Extractors: []geziyor.Extractor{
|
||||||
|
&extractor.Text{Name: "title", Selector: ".c-page-title"},
|
||||||
|
&extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||||
|
&extractor.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||||
|
&extractor.Text{Name: "content", Selector: ".c-entry-content"},
|
||||||
|
},
|
||||||
|
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||||
|
}).Start()
|
||||||
|
```
|
||||||
|
|
||||||
|
#### HTML selectors
|
||||||
|
|
||||||
|
We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).
|
||||||
|
|
||||||
|
HTMLDoc is accessible on Response if the response is HTML and can be parsed using Go's built-in HTML [parser](https://godoc.org/golang.org/x/net/html#Parse).
|
||||||
|
If the response isn't HTML, ```response.HTMLDoc``` will be ```nil```.
|
||||||
|
|
||||||
|
```go
|
||||||
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
|
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||||
|
ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
|
||||||
|
r.HTMLDoc.Find("div.quote").Each(func(_ int, s *goquery.Selection) {
|
||||||
|
log.Println(s.Find("span.text").Text(), s.Find("small.author").Text())
|
||||||
|
})
|
||||||
|
},
|
||||||
|
}).Start()
|
||||||
|
```
|
||||||
|
|
||||||
### Exporting Data
|
### Exporting Data
|
||||||
|
|
||||||
You can export data automatically using exporters. Just send data to ```Geziyor.Exports``` chan.
|
You can export data automatically using exporters. Just send data to ```Geziyor.Exports``` chan.
|
||||||
@ -116,7 +153,7 @@ geziyor.NewGeziyor(&geziyor.Options{
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&exporter.JSONExporter{}},
|
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -10,15 +10,15 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CSVExporter exports response data as CSV streaming file
|
// CSV exports response data as CSV streaming file
|
||||||
type CSVExporter struct {
|
type CSV struct {
|
||||||
FileName string
|
FileName string
|
||||||
Comma rune
|
Comma rune
|
||||||
UseCRLF bool
|
UseCRLF bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// Export exports response data as CSV streaming file
|
// Export exports response data as CSV streaming file
|
||||||
func (e *CSVExporter) Export(exports chan interface{}) {
|
func (e *CSV) Export(exports chan interface{}) {
|
||||||
|
|
||||||
// Create or append file
|
// Create or append file
|
||||||
file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||||
|
@ -6,7 +6,7 @@ func TestCSVExporter_Export(t *testing.T) {
|
|||||||
ch := make(chan interface{})
|
ch := make(chan interface{})
|
||||||
defer close(ch)
|
defer close(ch)
|
||||||
|
|
||||||
exporter := &CSVExporter{
|
exporter := &CSV{
|
||||||
FileName: "out.csv",
|
FileName: "out.csv",
|
||||||
Comma: ';',
|
Comma: ';',
|
||||||
}
|
}
|
||||||
|
@ -7,8 +7,8 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
)
|
)
|
||||||
|
|
||||||
// JSONExporter exports response data as JSON streaming file
|
// JSON exports response data as JSON streaming file
|
||||||
type JSONExporter struct {
|
type JSON struct {
|
||||||
FileName string
|
FileName string
|
||||||
EscapeHTML bool
|
EscapeHTML bool
|
||||||
Prefix string
|
Prefix string
|
||||||
@ -16,7 +16,7 @@ type JSONExporter struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Export exports response data as JSON streaming file
|
// Export exports response data as JSON streaming file
|
||||||
func (e *JSONExporter) Export(exports chan interface{}) {
|
func (e *JSON) Export(exports chan interface{}) {
|
||||||
|
|
||||||
// Create or append file
|
// Create or append file
|
||||||
file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||||
|
@ -6,7 +6,7 @@ func TestJSONExporter_Export(t *testing.T) {
|
|||||||
ch := make(chan interface{})
|
ch := make(chan interface{})
|
||||||
defer close(ch)
|
defer close(ch)
|
||||||
|
|
||||||
exporter := &JSONExporter{
|
exporter := &JSON{
|
||||||
FileName: "out.json",
|
FileName: "out.json",
|
||||||
Indent: " ",
|
Indent: " ",
|
||||||
}
|
}
|
||||||
|
14
extractor/text.go
Normal file
14
extractor/text.go
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
package extractor
|
||||||
|
|
||||||
|
import "github.com/PuerkitoBio/goquery"
|
||||||
|
|
||||||
|
// Text extracts texts from selected nodes
|
||||||
|
type Text struct {
|
||||||
|
Name string
|
||||||
|
Selector string
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract extracts texts from selected nodes
|
||||||
|
func (e *Text) Extract(doc *goquery.Document) interface{} {
|
||||||
|
return map[string]string{e.Name: doc.Find(e.Selector).Text()}
|
||||||
|
}
|
11
geziyor.go
11
geziyor.go
@ -2,6 +2,7 @@ package geziyor
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"github.com/PuerkitoBio/goquery"
|
||||||
"github.com/chromedp/cdproto/dom"
|
"github.com/chromedp/cdproto/dom"
|
||||||
"github.com/chromedp/cdproto/network"
|
"github.com/chromedp/cdproto/network"
|
||||||
"github.com/chromedp/chromedp"
|
"github.com/chromedp/chromedp"
|
||||||
@ -20,7 +21,14 @@ import (
|
|||||||
"sync"
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Exporter interface is for extracting data to external resources
|
// Extractor interface is for extracting data from HTML document
|
||||||
|
type Extractor interface {
|
||||||
|
Extract(doc *goquery.Document) interface{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exporter interface is for exporting data to external resources.
|
||||||
|
// Geziyor calls every exporter's Export function before any scraping starts.
|
||||||
|
// Export functions should wait for new data from exports chan.
|
||||||
type Exporter interface {
|
type Exporter interface {
|
||||||
Export(exports chan interface{})
|
Export(exports chan interface{})
|
||||||
}
|
}
|
||||||
@ -61,6 +69,7 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
responseMiddlewares: []ResponseMiddleware{
|
responseMiddlewares: []ResponseMiddleware{
|
||||||
parseHTMLMiddleware,
|
parseHTMLMiddleware,
|
||||||
metricsResponseMiddleware,
|
metricsResponseMiddleware,
|
||||||
|
extractorsMiddleware,
|
||||||
},
|
},
|
||||||
metrics: metrics.NewMetrics(opt.MetricsType),
|
metrics: metrics.NewMetrics(opt.MetricsType),
|
||||||
}
|
}
|
||||||
|
@ -7,10 +7,9 @@ import (
|
|||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
"github.com/geziyor/geziyor"
|
"github.com/geziyor/geziyor"
|
||||||
"github.com/geziyor/geziyor/exporter"
|
"github.com/geziyor/geziyor/exporter"
|
||||||
|
"github.com/geziyor/geziyor/extractor"
|
||||||
"github.com/geziyor/geziyor/metrics"
|
"github.com/geziyor/geziyor/metrics"
|
||||||
"math/rand"
|
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestSimple(t *testing.T) {
|
func TestSimple(t *testing.T) {
|
||||||
@ -41,7 +40,7 @@ func TestQuotes(t *testing.T) {
|
|||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||||
ParseFunc: quotesParse,
|
ParseFunc: quotesParse,
|
||||||
Exporters: []geziyor.Exporter{&exporter.JSONExporter{}},
|
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -78,20 +77,11 @@ func TestAllLinks(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&exporter.CSVExporter{}},
|
Exporters: []geziyor.Exporter{&exporter.CSV{}},
|
||||||
MetricsType: metrics.Prometheus,
|
MetricsType: metrics.Prometheus,
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRandomDelay(t *testing.T) {
|
|
||||||
rand.Seed(time.Now().UnixNano())
|
|
||||||
delay := time.Millisecond * 1000
|
|
||||||
min := float64(delay) * 0.5
|
|
||||||
max := float64(delay) * 1.5
|
|
||||||
randomDelay := rand.Intn(int(max-min)) + int(min)
|
|
||||||
fmt.Println(time.Duration(randomDelay))
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestStartRequestsFunc(t *testing.T) {
|
func TestStartRequestsFunc(t *testing.T) {
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartRequestsFunc: func(g *geziyor.Geziyor) {
|
StartRequestsFunc: func(g *geziyor.Geziyor) {
|
||||||
@ -102,7 +92,7 @@ func TestStartRequestsFunc(t *testing.T) {
|
|||||||
g.Exports <- s.AttrOr("href", "")
|
g.Exports <- s.AttrOr("href", "")
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
Exporters: []geziyor.Exporter{&exporter.JSONExporter{}},
|
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -161,3 +151,16 @@ func TestBasicAuth(t *testing.T) {
|
|||||||
MetricsType: metrics.ExpVar,
|
MetricsType: metrics.ExpVar,
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestExtractor(t *testing.T) {
|
||||||
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
|
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||||
|
Extractors: []geziyor.Extractor{
|
||||||
|
&extractor.Text{Name: "title", Selector: ".c-page-title"},
|
||||||
|
&extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||||
|
&extractor.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||||
|
&extractor.Text{Name: "content", Selector: ".c-entry-content"},
|
||||||
|
},
|
||||||
|
Exporters: []geziyor.Exporter{&exporter.JSON{}},
|
||||||
|
}).Start()
|
||||||
|
}
|
||||||
|
@ -2,12 +2,14 @@ package geziyor
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"fmt"
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"github.com/geziyor/geziyor/http"
|
"github.com/geziyor/geziyor/http"
|
||||||
"github.com/geziyor/geziyor/internal"
|
"github.com/geziyor/geziyor/internal"
|
||||||
"log"
|
"log"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"os"
|
"os"
|
||||||
|
"reflect"
|
||||||
"runtime/debug"
|
"runtime/debug"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@ -92,3 +94,27 @@ func parseHTMLMiddleware(g *Geziyor, r *Response) {
|
|||||||
func metricsResponseMiddleware(g *Geziyor, r *Response) {
|
func metricsResponseMiddleware(g *Geziyor, r *Response) {
|
||||||
g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
|
g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// extractorsMiddleware extracts data using the configured extractors and sends it to the exporters
|
||||||
|
func extractorsMiddleware(g *Geziyor, r *Response) {
|
||||||
|
|
||||||
|
// Check if we have extractors and exporters
|
||||||
|
if len(g.Opt.Extractors) != 0 && len(g.Opt.Exporters) != 0 {
|
||||||
|
exports := map[string]interface{}{}
|
||||||
|
|
||||||
|
for _, extractor := range g.Opt.Extractors {
|
||||||
|
extracted := extractor.Extract(r.HTMLDoc)
|
||||||
|
|
||||||
|
// Check extracted data type and use it accordingly
|
||||||
|
val := reflect.ValueOf(extracted)
|
||||||
|
switch val.Kind() {
|
||||||
|
case reflect.Map:
|
||||||
|
r := val.MapRange()
|
||||||
|
for r.Next() {
|
||||||
|
exports[fmt.Sprint(r.Key())] = r.Value().Interface()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g.Exports <- exports
|
||||||
|
}
|
||||||
|
}
|
||||||
|
17
middleware_test.go
Normal file
17
middleware_test.go
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
package geziyor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math/rand"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRandomDelay(t *testing.T) {
|
||||||
|
rand.Seed(time.Now().UnixNano())
|
||||||
|
delay := time.Millisecond * 1000
|
||||||
|
min := float64(delay) * 0.5
|
||||||
|
max := float64(delay) * 1.5
|
||||||
|
randomDelay := rand.Intn(int(max-min)) + int(min)
|
||||||
|
fmt.Println(time.Duration(randomDelay))
|
||||||
|
}
|
@ -21,6 +21,9 @@ type Options struct {
|
|||||||
// ParseFunc is callback of StartURLs response.
|
// ParseFunc is callback of StartURLs response.
|
||||||
ParseFunc func(g *Geziyor, r *Response)
|
ParseFunc func(g *Geziyor, r *Response)
|
||||||
|
|
||||||
|
// Extractors extracts items from pages
|
||||||
|
Extractors []Extractor
|
||||||
|
|
||||||
// Timeout is global request timeout
|
// Timeout is global request timeout
|
||||||
Timeout time.Duration
|
Timeout time.Duration
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user