Middleware support added. HTML Parsing disable option added.

Goroutine leaks will be tested using leaktest lib.
This commit is contained in:
Musab Gültekin 2019-06-15 17:55:40 +03:00
parent 4799b0f7b4
commit 7b23596a2d
9 changed files with 53 additions and 17 deletions

View File

@ -4,6 +4,9 @@ go:
- 1.11.x
- tip
env:
- GO111MODULE=on
before_install:
- go get -t -v ./...

View File

@ -16,7 +16,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
## Status
Since the project is in **development phase**, **API may change in time**. Also, we highly recommend you to use Geziyor with go modules.
Since the project is in the **development phase**, the **API may change over time**. Thus, we highly recommend using Geziyor with Go modules.
## Usage
Simple usage

View File

@ -49,12 +49,11 @@ func (e *CSVExporter) Export(response *geziyor.Response) {
values = append(values, fmt.Sprint(val.Index(i)))
}
// TODO: Map type support is incomplete. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
case reflect.Map:
iter := val.MapRange()
for iter.Next() {
values = append(values, fmt.Sprint(iter.Value()))
}
//case reflect.Map:
// iter := val.MapRange()
// for iter.Next() {
// values = append(values, fmt.Sprint(iter.Value()))
// }
}
// Write to file

View File

@ -23,9 +23,13 @@ import (
// Exporter interface is for extracting data to external resources
type Exporter interface {
Export(response *Response)
Export(r *Response)
}
// RequestMiddleware is a function called before a request is made, giving it
// a chance to inspect or manipulate the request.
// Set request.Cancelled = true to cancel the request.
type RequestMiddleware func(g *Geziyor, r *Request)
// Geziyor is our main scraper type
type Geziyor struct {
Opt Options
@ -41,6 +45,7 @@ type Geziyor struct {
sync.RWMutex
visitedURLS []string
}
requestMiddlewaresBase []RequestMiddleware
}
func init() {
@ -69,6 +74,7 @@ func NewGeziyor(opt Options) *Geziyor {
Timeout: time.Second * 180, // Google's timeout
},
Opt: opt,
requestMiddlewaresBase: []RequestMiddleware{defaultHeadersMiddleware},
}
if opt.Cache != nil {
@ -168,6 +174,14 @@ func (g *Geziyor) do(req *Request, callback func(resp *Response)) {
return
}
// Request Middlewares
for _, middlewareFunc := range g.requestMiddlewaresBase {
middlewareFunc(g, req)
}
for _, middlewareFunc := range g.Opt.RequestMiddlewares {
middlewareFunc(g, req)
}
// Do request normal or Chrome and read response
var response *Response
var err error
@ -180,7 +194,7 @@ func (g *Geziyor) do(req *Request, callback func(resp *Response)) {
return
}
if response.isHTML() {
if !g.Opt.ParseHTMLDisabled && response.isHTML() {
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body))
}
@ -216,12 +230,6 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
g.delay()
// Modify Request
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Charset", "utf-8")
req.Header.Set("Accept-Language", "en")
req.Header.Set("User-Agent", g.Opt.UserAgent)
log.Println("Fetching: ", req.URL.String())
// Do request

View File

@ -3,6 +3,7 @@ package geziyor_test
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/fortytw2/leaktest"
"github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/exporter"
@ -21,6 +22,7 @@ func TestSimple(t *testing.T) {
}
func TestSimpleCache(t *testing.T) {
defer leaktest.Check(t)()
gez := geziyor.NewGeziyor(geziyor.Options{
StartURLs: []string{"http://api.ipify.org"},
Cache: httpcache.NewMemoryCache(),
@ -34,6 +36,7 @@ func TestSimpleCache(t *testing.T) {
}
func TestQuotes(t *testing.T) {
defer leaktest.Check(t)()
geziyor.NewGeziyor(geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: quotesParse,
@ -56,7 +59,7 @@ func quotesParse(r *geziyor.Response) {
// Next Page
if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok {
go r.Geziyor.Get(r.JoinURL(href), quotesParse)
r.Geziyor.Get(r.JoinURL(href), quotesParse)
}
}

1
go.mod
View File

@ -6,6 +6,7 @@ require (
github.com/PuerkitoBio/goquery v1.5.0
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9
github.com/chromedp/chromedp v0.3.0
github.com/fortytw2/leaktest v1.3.0
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
golang.org/x/net v0.0.0-20190522155817-f3200d17e092
golang.org/x/text v0.3.2 // indirect

2
go.sum
View File

@ -6,6 +6,8 @@ github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 h1:ARnDd2vEk91rLN
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9/go.mod h1:xquOK9dIGFlLaIGI4c6IyfLI/Gz0LiYYuJtzhsUODgI=
github.com/chromedp/chromedp v0.3.0 h1:7/pwrXFRq6/ym3sxCykm90DMoyw6VKXY48DgGRgUURA=
github.com/chromedp/chromedp v0.3.0/go.mod h1:EktsZcC2iycVrRhC9fDmshBpCK9lNnZYi6x2q9uE7zI=
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0=

View File

@ -47,12 +47,18 @@ type Options struct {
// For extracting data
Exporters []Exporter
// Called before requests made to manipulate requests
RequestMiddlewares []RequestMiddleware
// Max body reading size in bytes
MaxBodySize int64
// Charset Detection disable
CharsetDetectDisabled bool
// If true, HTML parsing is disabled to improve performance.
ParseHTMLDisabled bool
// Revisiting same URLs is disabled by default
URLRevisitEnabled bool
}

View File

@ -10,3 +10,17 @@ type Request struct {
Meta map[string]interface{}
Rendered bool
}
// defaultHeadersMiddleware is a request middleware that fills in the Accept,
// Accept-Charset, Accept-Language and User-Agent headers of r. Each header is
// only written when it does not already carry a non-empty value, so headers
// set by the caller are left as they are. The User-Agent default is taken
// from g.Opt.UserAgent.
func defaultHeadersMiddleware(g *Geziyor, r *Request) {
	// Header name / default value pairs, applied in this order.
	defaults := [...][2]string{
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
		{"Accept-Charset", "utf-8"},
		{"Accept-Language", "en"},
		{"User-Agent", g.Opt.UserAgent},
	}
	for _, pair := range defaults {
		r.Header = headerSetDefault(r.Header, pair[0], pair[1])
	}
}
// headerSetDefault sets key to value on header only when header has no
// non-empty value for key yet, and returns the resulting header.
//
// A nil header is replaced by a freshly allocated one before the write,
// because storing into a nil http.Header (a nil map) panics. Callers must
// therefore use the returned header rather than the one they passed in.
func headerSetDefault(header http.Header, key string, value string) http.Header {
	// Get is safe on a nil header and reports "" for a missing key.
	if header.Get(key) != "" {
		return header
	}
	if header == nil {
		header = make(http.Header)
	}
	header.Set(key, value)
	return header
}