Middlewares refactored into multiple files in the middleware package.

Extractors removed, as they introduce complexity to the scraper, both in learning and in development.
Musab Gültekin 2019-07-04 21:04:29 +03:00
parent 9adff75509
commit 2cab68d2ce
19 changed files with 202 additions and 304 deletions
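With extractors gone, the same structured-data extraction is done inside a `ParseFunc`, using `response.HTMLDoc` (a goquery Document) and the `Exports` channel. A minimal sketch of the replacement pattern, assuming the `Options` fields and `client.Response` API visible in this diff; the selectors are taken from the removed example, and the `ParseFunc` body itself is illustrative:

```go
geziyor.NewGeziyor(&geziyor.Options{
    StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
    ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
        // Select fields with goquery, as the removed extract.Text/extract.Attr did.
        g.Exports <- map[string]interface{}{
            "title":      r.HTMLDoc.Find(".c-page-title").Text(),
            "author_url": r.HTMLDoc.Find(".c-byline__item:nth-child(1) > a").AttrOr("href", ""),
        }
    },
    Exporters: []export.Exporter{&export.JSON{}},
}).Start()
```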

@@ -104,28 +104,6 @@ geziyor.NewGeziyor(&geziyor.Options{
 ### Extracting Data
-
-#### Extractors
-You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor) to ```[]Extractors``` option to extract structured data.
-```Exporters``` need to be defined in order extractors to work.
-
-```go
-geziyor.NewGeziyor(&geziyor.Options{
-    StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
-    Extractors: []extract.Extractor{
-        &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
-        &extract.Text{Name: "title", Selector: ".c-page-title"},
-        &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
-        &extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
-        &extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
-        &extract.Text{Name: "summary", Selector: ".c-entry-summary"},
-        &extract.Text{Name: "content", Selector: ".c-entry-content"},
-    },
-    Exporters: []export.Exporter{&export.JSON{}},
-}).Start()
-```
-
 #### HTML selectors
 We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).
 HTMLDoc can be accessible on Response if response is HTML and can be parsed using Go's built-in HTML [parser](https://godoc.org/golang.org/x/net/html#Parse)

@@ -183,7 +161,6 @@ ok github.com/geziyor/geziyor 22.861s
 If you're interested in helping this project, please consider these features:
 - Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
-- ~~Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))~~
 - Deploying Scrapers to Cloud
 - ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
 - Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))

@@ -1,7 +1,6 @@
 package export

 // Exporter interface is for extracting data to external resources.
-// Geziyor calls every extractors Export functions before any scraping starts.
 // Export functions should wait for new data from exports chan.
 type Exporter interface {
     Export(exports chan interface{})

@@ -1,24 +0,0 @@
-package extract
-
-import (
-    "errors"
-
-    "github.com/PuerkitoBio/goquery"
-)
-
-var ErrAttrNotExists = errors.New("attribute not exist")
-
-// Attr returns HTML attribute value of provided selector
-type Attr struct {
-    Name     string
-    Selector string
-    Attr     string
-}
-
-// Extract returns HTML attribute value of provided selector
-func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
-    attr, exists := sel.Find(e.Selector).Attr(e.Attr)
-    if !exists {
-        return nil, ErrAttrNotExists
-    }
-    return map[string]string{e.Name: attr}, nil
-}

@@ -1,8 +0,0 @@
-package extract
-
-import "github.com/PuerkitoBio/goquery"
-
-// Extractor interface is for extracting data from HTML document
-type Extractor interface {
-    Extract(sel *goquery.Selection) (interface{}, error)
-}

@@ -1,52 +0,0 @@
-package extract
-
-import (
-    "bytes"
-
-    "github.com/PuerkitoBio/goquery"
-    "golang.org/x/net/html"
-)
-
-// HTML extracts and returns the HTML from inside each element of the given selection.
-type HTML struct {
-    Name     string
-    Selector string
-}
-
-// Extract extracts and returns the HTML from inside each element of the given selection.
-func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) {
-    var ret, h string
-    var err error
-
-    sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
-        h, err = s.Html()
-        if err != nil {
-            return false
-        }
-        ret += h
-        return true
-    })
-    if err != nil {
-        return nil, err
-    }
-
-    return map[string]string{e.Name: ret}, nil
-}
-
-// OuterHTML extracts and returns the HTML of each element of the given selection.
-type OuterHTML struct {
-    Name     string
-    Selector string
-}
-
-// Extract extracts and returns the HTML of each element of the given selection.
-func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) {
-    output := bytes.NewBufferString("")
-    for _, node := range sel.Find(e.Selector).Nodes {
-        if err := html.Render(output, node); err != nil {
-            return nil, err
-        }
-    }
-    return map[string]string{e.Name: output.String()}, nil
-}

@@ -1,22 +0,0 @@
-package extract
-
-import (
-    "github.com/PuerkitoBio/goquery"
-    "strings"
-)
-
-// Text returns the combined text contents of provided selector.
-type Text struct {
-    Name      string
-    Selector  string
-    TrimSpace bool
-}
-
-// Extract returns the combined text contents of provided selector.
-func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
-    text := sel.Find(e.Selector).Text()
-    if e.TrimSpace {
-        text = strings.TrimSpace(text)
-    }
-    return map[string]string{e.Name: text}, nil
-}

@@ -4,9 +4,12 @@ import (
     "github.com/fpfeng/httpcache"
     "github.com/geziyor/geziyor/client"
     "github.com/geziyor/geziyor/metrics"
+    "github.com/geziyor/geziyor/middleware"
     "io/ioutil"
     "log"
     "net/http/cookiejar"
+    "os"
+    "runtime/debug"
     "sync"
 )

@@ -17,8 +20,8 @@ type Geziyor struct {
     Exports chan interface{}
     metrics *metrics.Metrics
-    requestMiddlewares  []RequestMiddleware
-    responseMiddlewares []ResponseMiddleware
+    requestMiddlewares  []middleware.RequestProcessor
+    responseMiddlewares []middleware.ResponseProcessor
     wgRequests  sync.WaitGroup
     wgExporters sync.WaitGroup
     semGlobal   chan struct{}

@@ -26,7 +29,6 @@ type Geziyor struct {
         sync.RWMutex
         hostSems map[string]chan struct{}
     }
-    visitedURLs sync.Map
 }

@@ -35,22 +37,23 @@ func NewGeziyor(opt *Options) *Geziyor {
     geziyor := &Geziyor{
         Opt:     opt,
         Exports: make(chan interface{}, 1),
-        requestMiddlewares: []RequestMiddleware{
-            allowedDomainsMiddleware,
-            duplicateRequestsMiddleware,
-            defaultHeadersMiddleware,
-            delayMiddleware,
-            logMiddleware,
-            metricsRequestMiddleware,
+        requestMiddlewares: []middleware.RequestProcessor{
+            &middleware.AllowedDomains{AllowedDomains: opt.AllowedDomains},
+            &middleware.DuplicateRequests{RevisitEnabled: opt.URLRevisitEnabled},
+            &middleware.Headers{UserAgent: opt.UserAgent},
+            middleware.NewDelay(opt.RequestDelayRandomize, opt.RequestDelay),
         },
-        responseMiddlewares: []ResponseMiddleware{
-            parseHTMLMiddleware,
-            metricsResponseMiddleware,
-            extractorsMiddleware,
+        responseMiddlewares: []middleware.ResponseProcessor{
+            &middleware.ParseHTML{ParseHTMLDisabled: opt.ParseHTMLDisabled},
+            &middleware.LogStats{LogDisabled: opt.LogDisabled},
         },
         metrics: metrics.NewMetrics(opt.MetricsType),
     }
+    metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
+    geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, metricsMiddleware)
+    geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, metricsMiddleware)

     // Default
     if opt.UserAgent == "" {
         opt.UserAgent = client.DefaultUserAgent

@@ -95,6 +98,8 @@ func NewGeziyor(opt *Options) *Geziyor {
     // Logging
     if opt.LogDisabled {
         log.SetOutput(ioutil.Discard)
+    } else {
+        log.SetOutput(os.Stdout)
     }

     return geziyor

@@ -193,10 +198,10 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Response)) {
     if !req.Synchronized {
         defer g.wgRequests.Done()
     }
-    defer recoverMiddleware(g, req)
+    defer g.recoverMe()

     for _, middlewareFunc := range g.requestMiddlewares {
-        middlewareFunc(g, req)
+        middlewareFunc.ProcessRequest(req)
         if req.Cancelled {
             return
         }

@@ -209,7 +214,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Response)) {
     }

     for _, middlewareFunc := range g.responseMiddlewares {
-        middlewareFunc(g, res)
+        middlewareFunc.ProcessResponse(res)
     }

     // Callbacks

@@ -248,3 +253,12 @@ func (g *Geziyor) releaseSem(req *client.Request) {
         <-g.semHosts.hostSems[req.Host]
     }
 }
+
+// recoverMe prevents scraping being crashed.
+// Logs error and stack trace
+func (g *Geziyor) recoverMe() {
+    if r := recover(); r != nil {
+        log.Println(r, string(debug.Stack()))
+        g.metrics.PanicCounter.Add(1)
+    }
+}

@@ -8,7 +8,6 @@ import (
     "github.com/geziyor/geziyor"
     "github.com/geziyor/geziyor/client"
     "github.com/geziyor/geziyor/export"
-    "github.com/geziyor/geziyor/extract"
     "github.com/geziyor/geziyor/metrics"
     "net/http"
     "net/http/httptest"

@@ -158,22 +157,6 @@ func TestBasicAuth(t *testing.T) {
     }).Start()
 }

-func TestExtractor(t *testing.T) {
-    geziyor.NewGeziyor(&geziyor.Options{
-        StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
-        Extractors: []extract.Extractor{
-            extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
-            extract.Text{Name: "title", Selector: ".c-page-title"},
-            extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
-            extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
-            extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
-            extract.Text{Name: "summary", Selector: ".c-entry-summary"},
-            extract.Text{Name: "content", Selector: ".c-entry-content"},
-        },
-        Exporters: []export.Exporter{&export.JSON{}},
-    }).Start()
-}
-
 func TestRedirect(t *testing.T) {
     defer leaktest.Check(t)()
     geziyor.NewGeziyor(&geziyor.Options{

@@ -1,131 +0,0 @@
-package geziyor
-
-import (
-    "bytes"
-    "fmt"
-    "github.com/PuerkitoBio/goquery"
-    "github.com/geziyor/geziyor/client"
-    "github.com/geziyor/geziyor/internal"
-    "log"
-    "math/rand"
-    "os"
-    "reflect"
-    "runtime/debug"
-    "time"
-)
-
-// RequestMiddleware called before requests made.
-// Set request.Cancelled = true to cancel request
-type RequestMiddleware func(g *Geziyor, r *client.Request)
-
-// ResponseMiddleware called after request response receive
-type ResponseMiddleware func(g *Geziyor, r *client.Response)
-
-func init() {
-    log.SetOutput(os.Stdout)
-    rand.Seed(time.Now().UnixNano())
-}
-
-// ---* REQUEST MIDDLEWARES *---
-
-// recoverMiddleware recovers scraping being crashed.
-// Logs error and stack trace
-func recoverMiddleware(g *Geziyor, r *client.Request) {
-    if r := recover(); r != nil {
-        log.Println(r, string(debug.Stack()))
-        g.metrics.PanicCounter.Add(1)
-    }
-}
-
-// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
-func allowedDomainsMiddleware(g *Geziyor, r *client.Request) {
-    if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
-        //log.Printf("Domain not allowed: %s\n", req.Host)
-        r.Cancel()
-        return
-    }
-}
-
-// duplicateRequestsMiddleware checks for already visited URLs
-func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) {
-    if !g.Opt.URLRevisitEnabled && r.Request.Method == "GET" {
-        if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
-            //log.Printf("URL already visited %s\n", rawURL)
-            r.Cancel()
-        }
-    }
-}
-
-// defaultHeadersMiddleware sets default request headers
-func defaultHeadersMiddleware(g *Geziyor, r *client.Request) {
-    r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
-    r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
-    r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
-    r.Header = client.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
-}
-
-// delayMiddleware delays requests
-func delayMiddleware(g *Geziyor, r *client.Request) {
-    if g.Opt.RequestDelayRandomize {
-        min := float64(g.Opt.RequestDelay) * 0.5
-        max := float64(g.Opt.RequestDelay) * 1.5
-        time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
-    } else {
-        time.Sleep(g.Opt.RequestDelay)
-    }
-}
-
-// logMiddleware logs requests
-func logMiddleware(g *Geziyor, r *client.Request) {
-    // LogDisabled check is not necessary, but done here for performance reasons
-    if !g.Opt.LogDisabled {
-        log.Println("Fetching: ", r.URL.String())
-    }
-}
-
-// metricsRequestMiddleware sets stats
-func metricsRequestMiddleware(g *Geziyor, r *client.Request) {
-    g.metrics.RequestCounter.With("method", r.Method).Add(1)
-}
-
-// ---* RESPONSE MIDDLEWARES *---
-
-// parseHTMLMiddleware parses response if response is HTML
-func parseHTMLMiddleware(g *Geziyor, r *client.Response) {
-    if !g.Opt.ParseHTMLDisabled && r.IsHTML() {
-        r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
-    }
-}
-
-// metricsResponseMiddleware sets stats
-func metricsResponseMiddleware(g *Geziyor, r *client.Response) {
-    g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
-}
-
-// extractorsMiddleware extracts data from loaders conf and exports it to exporters
-func extractorsMiddleware(g *Geziyor, r *client.Response) {
-    // Check if we have extractors and exporters
-    if len(g.Opt.Extractors) != 0 && len(g.Opt.Exporters) != 0 {
-        exports := map[string]interface{}{}
-        for _, extractor := range g.Opt.Extractors {
-            extracted, err := extractor.Extract(r.HTMLDoc.Selection)
-            if err != nil {
-                log.Println("extraction error: ", err)
-                continue
-            }
-
-            // Check extracted data type and use it accordingly
-            val := reflect.ValueOf(extracted)
-            switch val.Kind() {
-            case reflect.Map:
-                r := val.MapRange()
-                for r.Next() {
-                    exports[fmt.Sprint(r.Key())] = r.Value().Interface()
-                }
-            }
-        }
-        g.Exports <- exports
-    }
-}

@@ -0,0 +1,19 @@
+package middleware
+
+import (
+    "github.com/geziyor/geziyor/client"
+    "github.com/geziyor/geziyor/internal"
+)
+
+// AllowedDomains checks for request host if it exists in AllowedDomains
+type AllowedDomains struct {
+    AllowedDomains []string
+}
+
+func (a *AllowedDomains) ProcessRequest(r *client.Request) {
+    if len(a.AllowedDomains) != 0 && !internal.Contains(a.AllowedDomains, r.Host) {
+        //log.Printf("Domain not allowed: %s\n", req.Host)
+        r.Cancel()
+        return
+    }
+}

middleware/delay.go Normal file

@@ -0,0 +1,30 @@
+package middleware
+
+import (
+    "github.com/geziyor/geziyor/client"
+    "math/rand"
+    "time"
+)
+
+// delay delays requests
+type delay struct {
+    requestDelayRandomize bool
+    requestDelay          time.Duration
+}
+
+func NewDelay(requestDelayRandomize bool, requestDelay time.Duration) RequestProcessor {
+    if requestDelayRandomize {
+        rand.Seed(time.Now().UnixNano())
+    }
+    return &delay{requestDelayRandomize: requestDelayRandomize, requestDelay: requestDelay}
+}
+
+func (a *delay) ProcessRequest(r *client.Request) {
+    if a.requestDelayRandomize {
+        min := float64(a.requestDelay) * 0.5
+        max := float64(a.requestDelay) * 1.5
+        time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
+    } else {
+        time.Sleep(a.requestDelay)
+    }
+}

@@ -1,7 +1,7 @@
-package geziyor
+package middleware

 import (
-    "fmt"
+    "github.com/stretchr/testify/assert"
     "math/rand"
     "testing"
     "time"

@@ -13,5 +13,7 @@ func TestRandomDelay(t *testing.T) {
     min := float64(delay) * 0.5
     max := float64(delay) * 1.5
     randomDelay := rand.Intn(int(max-min)) + int(min)
-    fmt.Println(time.Duration(randomDelay))
+
+    assert.True(t, time.Duration(randomDelay).Seconds() < 1.5)
+    assert.True(t, time.Duration(randomDelay).Seconds() > 0.5)
 }

@@ -0,0 +1,21 @@
+package middleware
+
+import (
+    "github.com/geziyor/geziyor/client"
+    "sync"
+)
+
+// DuplicateRequests checks for already visited URLs
+type DuplicateRequests struct {
+    RevisitEnabled bool
+    visitedURLs    sync.Map
+}
+
+func (a *DuplicateRequests) ProcessRequest(r *client.Request) {
+    if !a.RevisitEnabled && r.Request.Method == "GET" {
+        if _, visited := a.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
+            //log.Printf("URL already visited %s\n")
+            r.Cancel()
+        }
+    }
+}

middleware/headers.go Normal file

@@ -0,0 +1,17 @@
+package middleware
+
+import (
+    "github.com/geziyor/geziyor/client"
+)
+
+// Headers sets default request headers
+type Headers struct {
+    UserAgent string
+}
+
+func (a *Headers) ProcessRequest(r *client.Request) {
+    r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+    r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
+    r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
+    r.Header = client.SetDefaultHeader(r.Header, "User-Agent", a.UserAgent)
+}

middleware/log_stats.go Normal file

@@ -0,0 +1,18 @@
+package middleware
+
+import (
+    "github.com/geziyor/geziyor/client"
+    "log"
+)
+
+// LogStats logs responses
+type LogStats struct {
+    LogDisabled bool
+}
+
+func (p *LogStats) ProcessResponse(r *client.Response) {
+    // LogDisabled check is not necessary, but done here for performance reasons
+    if !p.LogDisabled {
+        log.Printf("Crawled: (%d) <%s %s>", r.StatusCode, r.Request.Method, r.Request.URL.String())
+    }
+}

middleware/metrics.go Normal file

@@ -0,0 +1,19 @@
+package middleware
+
+import (
+    "github.com/geziyor/geziyor/client"
+    "github.com/geziyor/geziyor/metrics"
+)
+
+// Metrics sets stats for request and responses
+type Metrics struct {
+    Metrics *metrics.Metrics
+}
+
+func (a *Metrics) ProcessRequest(r *client.Request) {
+    a.Metrics.RequestCounter.With("method", r.Method).Add(1)
+}
+
+func (a *Metrics) ProcessResponse(r *client.Response) {
+    a.Metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
+}

middleware/middleware.go Normal file

@@ -0,0 +1,21 @@
+package middleware
+
+import (
+    "github.com/geziyor/geziyor/client"
+)
+
+type RequestResponseProcessor interface {
+    RequestProcessor
+    ResponseProcessor
+}
+
+// RequestProcessor called before requests made.
+// Set request.Cancelled = true to cancel request
+type RequestProcessor interface {
+    ProcessRequest(r *client.Request)
+}
+
+// ResponseProcessor called after request response receive
+type ResponseProcessor interface {
+    ProcessResponse(r *client.Response)
+}
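Any type satisfying one of these interfaces can act as a middleware. A minimal sketch of a user-defined request processor, assuming only the `RequestProcessor` interface above and the `client.SetDefaultHeader` helper used by `middleware/headers.go`; the type name and header value are hypothetical:

```go
// refererSetter is a hypothetical user-defined middleware that stamps a
// fixed Referer header on every outgoing request.
type refererSetter struct {
    referer string
}

func (m *refererSetter) ProcessRequest(r *client.Request) {
    // Reuse the same helper middleware/headers.go uses, which by its name
    // appears to set the header only as a default.
    r.Header = client.SetDefaultHeader(r.Header, "Referer", m.referer)
}
```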

middleware/parse_html.go Normal file

@@ -0,0 +1,18 @@
+package middleware
+
+import (
+    "bytes"
+    "github.com/PuerkitoBio/goquery"
+    "github.com/geziyor/geziyor/client"
+)
+
+// ParseHTML parses response if response is HTML
+type ParseHTML struct {
+    ParseHTMLDisabled bool
+}
+
+func (p *ParseHTML) ProcessResponse(r *client.Response) {
+    if !p.ParseHTMLDisabled && r.IsHTML() {
+        r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
+    }
+}

@@ -4,8 +4,8 @@ import (
     "github.com/fpfeng/httpcache"
     "github.com/geziyor/geziyor/client"
     "github.com/geziyor/geziyor/export"
-    "github.com/geziyor/geziyor/extract"
     "github.com/geziyor/geziyor/metrics"
+    "github.com/geziyor/geziyor/middleware"
     "time"
 )

@@ -24,9 +24,6 @@ type Options struct {
     // ParseFunc is callback of StartURLs response.
     ParseFunc func(g *Geziyor, r *client.Response)

-    // Extractors extracts items from pages
-    Extractors []extract.Extractor
-
     // Timeout is global request timeout
     Timeout time.Duration

@@ -56,10 +53,10 @@ type Options struct {
     Exporters []export.Exporter

     // Called before requests made to manipulate requests
-    RequestMiddlewares []RequestMiddleware
+    RequestMiddlewares []middleware.RequestProcessor

     // Called after response received
-    ResponseMiddlewares []ResponseMiddleware
+    ResponseMiddlewares []middleware.ResponseProcessor

     // Max body reading size in bytes. Default: 1GB
     MaxBodySize int64
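Custom processors plug in through these two Options fields. A sketch reusing the hypothetical refererSetter from the middleware/middleware.go note above; how user middlewares are ordered relative to the built-in ones is not shown in this diff:

```go
geziyor.NewGeziyor(&geziyor.Options{
    StartURLs: []string{"https://example.com"},
    RequestMiddlewares: []middleware.RequestProcessor{
        &refererSetter{referer: "https://example.com"},
    },
    ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
        // Export something trivial to show the pipeline end to end.
        g.Exports <- map[string]interface{}{"status": r.StatusCode}
    },
}).Start()
```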