Added middleware support and an option to disable HTML parsing.
Goroutine leaks are now tested using the leaktest library.
This commit is contained in:
parent
4799b0f7b4
commit
7b23596a2d
@ -4,6 +4,9 @@ go:
|
||||
- 1.11.x
|
||||
- tip
|
||||
|
||||
env:
|
||||
- GO111MODULE=on
|
||||
|
||||
before_install:
|
||||
- go get -t -v ./...
|
||||
|
||||
|
@ -16,7 +16,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
|
||||
See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
|
||||
|
||||
## Status
|
||||
Since the project is in **development phase**, **API may change in time**. Also, we highly recommend you to use Geziyor with go modules.
|
||||
Since the project is in the **development phase**, the **API may change over time**. Thus, we highly recommend using Geziyor with Go modules.
|
||||
|
||||
## Usage
|
||||
Simple usage
|
||||
|
@ -49,12 +49,11 @@ func (e *CSVExporter) Export(response *geziyor.Response) {
|
||||
values = append(values, fmt.Sprint(val.Index(i)))
|
||||
}
|
||||
|
||||
// TODO: Map type support is incomplete. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
|
||||
case reflect.Map:
|
||||
iter := val.MapRange()
|
||||
for iter.Next() {
|
||||
values = append(values, fmt.Sprint(iter.Value()))
|
||||
}
|
||||
//case reflect.Map:
|
||||
// iter := val.MapRange()
|
||||
// for iter.Next() {
|
||||
// values = append(values, fmt.Sprint(iter.Value()))
|
||||
// }
|
||||
}
|
||||
|
||||
// Write to file
|
||||
|
26
geziyor.go
26
geziyor.go
@ -23,9 +23,13 @@ import (
|
||||
|
||||
// Exporter interface is for extracting data to external resources
|
||||
type Exporter interface {
|
||||
Export(response *Response)
|
||||
Export(r *Response)
|
||||
}
|
||||
|
||||
// RequestMiddleware is called before a request is made.
|
||||
// Set request.Cancelled = true to cancel the request.
|
||||
type RequestMiddleware func(g *Geziyor, r *Request)
|
||||
|
||||
// Geziyor is our main scraper type
|
||||
type Geziyor struct {
|
||||
Opt Options
|
||||
@ -41,6 +45,7 @@ type Geziyor struct {
|
||||
sync.RWMutex
|
||||
visitedURLS []string
|
||||
}
|
||||
requestMiddlewaresBase []RequestMiddleware
|
||||
}
|
||||
|
||||
func init() {
|
||||
@ -68,7 +73,8 @@ func NewGeziyor(opt Options) *Geziyor {
|
||||
},
|
||||
Timeout: time.Second * 180, // Google's timeout
|
||||
},
|
||||
Opt: opt,
|
||||
Opt: opt,
|
||||
requestMiddlewaresBase: []RequestMiddleware{defaultHeadersMiddleware},
|
||||
}
|
||||
|
||||
if opt.Cache != nil {
|
||||
@ -168,6 +174,14 @@ func (g *Geziyor) do(req *Request, callback func(resp *Response)) {
|
||||
return
|
||||
}
|
||||
|
||||
// Request Middlewares
|
||||
for _, middlewareFunc := range g.requestMiddlewaresBase {
|
||||
middlewareFunc(g, req)
|
||||
}
|
||||
for _, middlewareFunc := range g.Opt.RequestMiddlewares {
|
||||
middlewareFunc(g, req)
|
||||
}
|
||||
|
||||
// Do request normal or Chrome and read response
|
||||
var response *Response
|
||||
var err error
|
||||
@ -180,7 +194,7 @@ func (g *Geziyor) do(req *Request, callback func(resp *Response)) {
|
||||
return
|
||||
}
|
||||
|
||||
if response.isHTML() {
|
||||
if !g.Opt.ParseHTMLDisabled && response.isHTML() {
|
||||
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body))
|
||||
}
|
||||
|
||||
@ -216,12 +230,6 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
|
||||
|
||||
g.delay()
|
||||
|
||||
// Modify Request
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
req.Header.Set("Accept-Charset", "utf-8")
|
||||
req.Header.Set("Accept-Language", "en")
|
||||
req.Header.Set("User-Agent", g.Opt.UserAgent)
|
||||
|
||||
log.Println("Fetching: ", req.URL.String())
|
||||
|
||||
// Do request
|
||||
|
@ -3,6 +3,7 @@ package geziyor_test
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/fortytw2/leaktest"
|
||||
"github.com/fpfeng/httpcache"
|
||||
"github.com/geziyor/geziyor"
|
||||
"github.com/geziyor/geziyor/exporter"
|
||||
@ -21,6 +22,7 @@ func TestSimple(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestSimpleCache(t *testing.T) {
|
||||
defer leaktest.Check(t)()
|
||||
gez := geziyor.NewGeziyor(geziyor.Options{
|
||||
StartURLs: []string{"http://api.ipify.org"},
|
||||
Cache: httpcache.NewMemoryCache(),
|
||||
@ -34,6 +36,7 @@ func TestSimpleCache(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestQuotes(t *testing.T) {
|
||||
defer leaktest.Check(t)()
|
||||
geziyor.NewGeziyor(geziyor.Options{
|
||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||
ParseFunc: quotesParse,
|
||||
@ -56,7 +59,7 @@ func quotesParse(r *geziyor.Response) {
|
||||
|
||||
// Next Page
|
||||
if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok {
|
||||
go r.Geziyor.Get(r.JoinURL(href), quotesParse)
|
||||
r.Geziyor.Get(r.JoinURL(href), quotesParse)
|
||||
}
|
||||
}
|
||||
|
||||
|
1
go.mod
1
go.mod
@ -6,6 +6,7 @@ require (
|
||||
github.com/PuerkitoBio/goquery v1.5.0
|
||||
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9
|
||||
github.com/chromedp/chromedp v0.3.0
|
||||
github.com/fortytw2/leaktest v1.3.0
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
||||
golang.org/x/net v0.0.0-20190522155817-f3200d17e092
|
||||
golang.org/x/text v0.3.2 // indirect
|
||||
|
2
go.sum
2
go.sum
@ -6,6 +6,8 @@ github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 h1:ARnDd2vEk91rLN
|
||||
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9/go.mod h1:xquOK9dIGFlLaIGI4c6IyfLI/Gz0LiYYuJtzhsUODgI=
|
||||
github.com/chromedp/chromedp v0.3.0 h1:7/pwrXFRq6/ym3sxCykm90DMoyw6VKXY48DgGRgUURA=
|
||||
github.com/chromedp/chromedp v0.3.0/go.mod h1:EktsZcC2iycVrRhC9fDmshBpCK9lNnZYi6x2q9uE7zI=
|
||||
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
|
||||
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
|
||||
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0=
|
||||
|
@ -47,12 +47,18 @@ type Options struct {
|
||||
// For extracting data
|
||||
Exporters []Exporter
|
||||
|
||||
// Called before requests are made, to manipulate them
|
||||
RequestMiddlewares []RequestMiddleware
|
||||
|
||||
// Max body reading size in bytes
|
||||
MaxBodySize int64
|
||||
|
||||
// Charset Detection disable
|
||||
CharsetDetectDisabled bool
|
||||
|
||||
// If true, HTML parsing is disabled to improve performance.
|
||||
ParseHTMLDisabled bool
|
||||
|
||||
// Revisiting same URLs is disabled by default
|
||||
URLRevisitEnabled bool
|
||||
}
|
||||
|
14
request.go
14
request.go
@ -10,3 +10,17 @@ type Request struct {
|
||||
Meta map[string]interface{}
|
||||
Rendered bool
|
||||
}
|
||||
|
||||
func defaultHeadersMiddleware(g *Geziyor, r *Request) {
|
||||
r.Header = headerSetDefault(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
r.Header = headerSetDefault(r.Header, "Accept-Charset", "utf-8")
|
||||
r.Header = headerSetDefault(r.Header, "Accept-Language", "en")
|
||||
r.Header = headerSetDefault(r.Header, "User-Agent", g.Opt.UserAgent)
|
||||
}
|
||||
|
||||
// headerSetDefault sets key to value on header only when header currently has
// no non-empty value for key, and returns the resulting header.
//
// A nil header is replaced by a newly allocated one before the write, because
// http.Header is a map and Set on a nil map panics. Callers must therefore use
// the returned header rather than assuming the argument was modified in place.
func headerSetDefault(header http.Header, key string, value string) http.Header {
	if header == nil {
		header = make(http.Header)
	}
	// Get returns "" both for a missing key and for an explicitly empty value;
	// in either case the default is applied.
	if header.Get(key) == "" {
		header.Set(key, value)
	}
	return header
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user