Middlewares refactored into multiple files under the middleware package.
Extractors removed, as they added complexity to the scraper, both for learning and for development.
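In short: middlewares move from package-level function types coupled to *Geziyor to small interfaces in the new middleware package. Both declarations below are taken from the diff in this commit:

```go
// Before: a middleware was a bare function coupled to *Geziyor
// (from the deleted middleware.go).
type RequestMiddleware func(g *Geziyor, r *client.Request)

// After: a middleware is any type implementing a processor interface,
// with no dependency on *Geziyor (from the new middleware/middleware.go).
type RequestProcessor interface {
    ProcessRequest(r *client.Request)
}
```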
This commit is contained in:
parent 9adff75509
commit 2cab68d2ce
23 README.md
@@ -104,28 +104,6 @@ geziyor.NewGeziyor(&geziyor.Options{
### Extracting Data

#### Extractors

You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor) to the ```[]Extractors``` option to extract structured data.
```Exporters``` need to be defined in order for extractors to work.

```go
geziyor.NewGeziyor(&geziyor.Options{
    StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
    Extractors: []extract.Extractor{
        &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
        &extract.Text{Name: "title", Selector: ".c-page-title"},
        &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
        &extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
        &extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
        &extract.Text{Name: "summary", Selector: ".c-entry-summary"},
        &extract.Text{Name: "content", Selector: ".c-entry-content"},
    },
    Exporters: []export.Exporter{&export.JSON{}},
}).Start()
```

#### HTML selectors

We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).

HTMLDoc is accessible on the Response if the response is HTML and can be parsed by Go's built-in HTML [parser](https://godoc.org/golang.org/x/net/html#Parse).
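The ```response.HTMLDoc``` API the README keeps is a plain goquery Document, so the usual Find/Each selector calls apply. A minimal sketch (the start URL and the ```a``` selector are placeholders, not part of this commit):

```go
package main

import (
    "fmt"

    "github.com/PuerkitoBio/goquery"
    "github.com/geziyor/geziyor"
    "github.com/geziyor/geziyor/client"
)

func main() {
    geziyor.NewGeziyor(&geziyor.Options{
        StartURLs: []string{"http://quotes.toscrape.com/"},
        ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
            // HTMLDoc is a goquery Document; select and iterate as usual.
            r.HTMLDoc.Find("a").Each(func(_ int, s *goquery.Selection) {
                href, _ := s.Attr("href")
                fmt.Println(s.Text(), href)
            })
        },
    }).Start()
}
```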
@@ -183,7 +161,6 @@ ok github.com/geziyor/geziyor 22.861s
If you're interested in helping this project, please consider these features:

- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
- ~~Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))~~
- Deploying Scrapers to Cloud
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
@@ -1,7 +1,6 @@
package export

// Exporter interface is for exporting data to external resources.
// Geziyor calls every exporter's Export function before any scraping starts.
// Export functions should wait for new data from exports chan.
type Exporter interface {
    Export(exports chan interface{})
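Given that contract, a custom exporter stays small. A hedged sketch; the StdoutExporter type is hypothetical, not part of the repository:

```go
package exporters

import (
    "fmt"

    "github.com/geziyor/geziyor/export"
)

// StdoutExporter prints every exported item. Per the contract above,
// Geziyor starts Export before scraping begins, and the method must
// keep reading from the exports channel.
type StdoutExporter struct{}

func (e *StdoutExporter) Export(exports chan interface{}) {
    for item := range exports {
        fmt.Println(item)
    }
}

// Compile-time check that StdoutExporter satisfies export.Exporter.
var _ export.Exporter = (*StdoutExporter)(nil)
```

It would then be registered via ```Exporters: []export.Exporter{&StdoutExporter{}}```.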
@@ -1,24 +0,0 @@
package extract

import (
    "errors"
    "github.com/PuerkitoBio/goquery"
)

var ErrAttrNotExists = errors.New("attribute not exist")

// Attr returns HTML attribute value of provided selector
type Attr struct {
    Name     string
    Selector string
    Attr     string
}

// Extract returns HTML attribute value of provided selector
func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
    attr, exists := sel.Find(e.Selector).Attr(e.Attr)
    if !exists {
        return nil, ErrAttrNotExists
    }
    return map[string]string{e.Name: attr}, nil
}
@@ -1,8 +0,0 @@
package extract

import "github.com/PuerkitoBio/goquery"

// Extractor interface is for extracting data from HTML document
type Extractor interface {
    Extract(sel *goquery.Selection) (interface{}, error)
}
@@ -1,52 +0,0 @@
package extract

import (
    "bytes"
    "github.com/PuerkitoBio/goquery"
    "golang.org/x/net/html"
)

// HTML extracts and returns the HTML from inside each element of the given selection.
type HTML struct {
    Name     string
    Selector string
}

// Extract extracts and returns the HTML from inside each element of the given selection.
func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) {
    var ret, h string
    var err error

    sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
        h, err = s.Html()
        if err != nil {
            return false
        }

        ret += h
        return true
    })
    if err != nil {
        return nil, err
    }

    return map[string]string{e.Name: ret}, nil
}

// OuterHTML extracts and returns the HTML of each element of the given selection.
type OuterHTML struct {
    Name     string
    Selector string
}

// Extract extracts and returns the HTML of each element of the given selection.
func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) {
    output := bytes.NewBufferString("")
    for _, node := range sel.Find(e.Selector).Nodes {
        if err := html.Render(output, node); err != nil {
            return nil, err
        }
    }

    return map[string]string{e.Name: output.String()}, nil
}
@@ -1,22 +0,0 @@
package extract

import (
    "github.com/PuerkitoBio/goquery"
    "strings"
)

// Text returns the combined text contents of provided selector.
type Text struct {
    Name      string
    Selector  string
    TrimSpace bool
}

// Extract returns the combined text contents of provided selector.
func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
    text := sel.Find(e.Selector).Text()
    if e.TrimSpace {
        text = strings.TrimSpace(text)
    }
    return map[string]string{e.Name: text}, nil
}
48 geziyor.go
@@ -4,9 +4,12 @@ import (
    "github.com/fpfeng/httpcache"
    "github.com/geziyor/geziyor/client"
    "github.com/geziyor/geziyor/metrics"
    "github.com/geziyor/geziyor/middleware"
    "io/ioutil"
    "log"
    "net/http/cookiejar"
    "os"
    "runtime/debug"
    "sync"
)

@@ -17,8 +20,8 @@ type Geziyor struct {
    Exports chan interface{}

    metrics             *metrics.Metrics
    requestMiddlewares  []RequestMiddleware
    responseMiddlewares []ResponseMiddleware
    requestMiddlewares  []middleware.RequestProcessor
    responseMiddlewares []middleware.ResponseProcessor
    wgRequests          sync.WaitGroup
    wgExporters         sync.WaitGroup
    semGlobal           chan struct{}
@@ -26,7 +29,6 @@ type Geziyor struct {
        sync.RWMutex
        hostSems map[string]chan struct{}
    }
    visitedURLs sync.Map
}

// NewGeziyor creates new Geziyor with default values.
@@ -35,22 +37,23 @@ func NewGeziyor(opt *Options) *Geziyor {
    geziyor := &Geziyor{
        Opt:     opt,
        Exports: make(chan interface{}, 1),
        requestMiddlewares: []RequestMiddleware{
            allowedDomainsMiddleware,
            duplicateRequestsMiddleware,
            defaultHeadersMiddleware,
            delayMiddleware,
            logMiddleware,
            metricsRequestMiddleware,
        requestMiddlewares: []middleware.RequestProcessor{
            &middleware.AllowedDomains{AllowedDomains: opt.AllowedDomains},
            &middleware.DuplicateRequests{RevisitEnabled: opt.URLRevisitEnabled},
            &middleware.Headers{UserAgent: opt.UserAgent},
            middleware.NewDelay(opt.RequestDelayRandomize, opt.RequestDelay),
        },
        responseMiddlewares: []ResponseMiddleware{
            parseHTMLMiddleware,
            metricsResponseMiddleware,
            extractorsMiddleware,
        responseMiddlewares: []middleware.ResponseProcessor{
            &middleware.ParseHTML{ParseHTMLDisabled: opt.ParseHTMLDisabled},
            &middleware.LogStats{LogDisabled: opt.LogDisabled},
        },
        metrics: metrics.NewMetrics(opt.MetricsType),
    }

    metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
    geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, metricsMiddleware)
    geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, metricsMiddleware)

    // Default
    if opt.UserAgent == "" {
        opt.UserAgent = client.DefaultUserAgent
@@ -95,6 +98,8 @@ func NewGeziyor(opt *Options) *Geziyor {
    // Logging
    if opt.LogDisabled {
        log.SetOutput(ioutil.Discard)
    } else {
        log.SetOutput(os.Stdout)
    }

    return geziyor
@@ -193,10 +198,10 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
    if !req.Synchronized {
        defer g.wgRequests.Done()
    }
    defer recoverMiddleware(g, req)
    defer g.recoverMe()

    for _, middlewareFunc := range g.requestMiddlewares {
        middlewareFunc(g, req)
        middlewareFunc.ProcessRequest(req)
        if req.Cancelled {
            return
        }
@@ -209,7 +214,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
    }

    for _, middlewareFunc := range g.responseMiddlewares {
        middlewareFunc(g, res)
        middlewareFunc.ProcessResponse(res)
    }

    // Callbacks
@@ -248,3 +253,12 @@ func (g *Geziyor) releaseSem(req *client.Request) {
        <-g.semHosts.hostSems[req.Host]
    }
}

// recoverMe prevents the scraper from crashing.
// Logs error and stack trace
func (g *Geziyor) recoverMe() {
    if r := recover(); r != nil {
        log.Println(r, string(debug.Stack()))
        g.metrics.PanicCounter.Add(1)
    }
}
@@ -8,7 +8,6 @@ import (
    "github.com/geziyor/geziyor"
    "github.com/geziyor/geziyor/client"
    "github.com/geziyor/geziyor/export"
    "github.com/geziyor/geziyor/extract"
    "github.com/geziyor/geziyor/metrics"
    "net/http"
    "net/http/httptest"
@@ -158,22 +157,6 @@ func TestBasicAuth(t *testing.T) {
    }).Start()
}

func TestExtractor(t *testing.T) {
    geziyor.NewGeziyor(&geziyor.Options{
        StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
        Extractors: []extract.Extractor{
            extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
            extract.Text{Name: "title", Selector: ".c-page-title"},
            extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
            extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
            extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
            extract.Text{Name: "summary", Selector: ".c-entry-summary"},
            extract.Text{Name: "content", Selector: ".c-entry-content"},
        },
        Exporters: []export.Exporter{&export.JSON{}},
    }).Start()
}

func TestRedirect(t *testing.T) {
    defer leaktest.Check(t)()
    geziyor.NewGeziyor(&geziyor.Options{
131 middleware.go
@@ -1,131 +0,0 @@
package geziyor

import (
    "bytes"
    "fmt"
    "github.com/PuerkitoBio/goquery"
    "github.com/geziyor/geziyor/client"
    "github.com/geziyor/geziyor/internal"
    "log"
    "math/rand"
    "os"
    "reflect"
    "runtime/debug"
    "time"
)

// RequestMiddleware called before requests made.
// Set request.Cancelled = true to cancel request
type RequestMiddleware func(g *Geziyor, r *client.Request)

// ResponseMiddleware called after request response receive
type ResponseMiddleware func(g *Geziyor, r *client.Response)

func init() {
    log.SetOutput(os.Stdout)
    rand.Seed(time.Now().UnixNano())
}

// ---* REQUEST MIDDLEWARES *---

// recoverMiddleware recovers scraping being crashed.
// Logs error and stack trace
func recoverMiddleware(g *Geziyor, r *client.Request) {
    if r := recover(); r != nil {
        log.Println(r, string(debug.Stack()))
        g.metrics.PanicCounter.Add(1)
    }
}

// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
func allowedDomainsMiddleware(g *Geziyor, r *client.Request) {
    if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
        //log.Printf("Domain not allowed: %s\n", req.Host)
        r.Cancel()
        return
    }
}

// duplicateRequestsMiddleware checks for already visited URLs
func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) {
    if !g.Opt.URLRevisitEnabled && r.Request.Method == "GET" {
        if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
            //log.Printf("URL already visited %s\n", rawURL)
            r.Cancel()
        }
    }
}

// defaultHeadersMiddleware sets default request headers
func defaultHeadersMiddleware(g *Geziyor, r *client.Request) {
    r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
    r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
    r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
    r.Header = client.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
}

// delayMiddleware delays requests
func delayMiddleware(g *Geziyor, r *client.Request) {
    if g.Opt.RequestDelayRandomize {
        min := float64(g.Opt.RequestDelay) * 0.5
        max := float64(g.Opt.RequestDelay) * 1.5
        time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
    } else {
        time.Sleep(g.Opt.RequestDelay)
    }
}

// logMiddleware logs requests
func logMiddleware(g *Geziyor, r *client.Request) {
    // LogDisabled check is not necessary, but done here for performance reasons
    if !g.Opt.LogDisabled {
        log.Println("Fetching: ", r.URL.String())
    }
}

// metricsRequestMiddleware sets stats
func metricsRequestMiddleware(g *Geziyor, r *client.Request) {
    g.metrics.RequestCounter.With("method", r.Method).Add(1)
}

// ---* RESPONSE MIDDLEWARES *---

// parseHTMLMiddleware parses response if response is HTML
func parseHTMLMiddleware(g *Geziyor, r *client.Response) {
    if !g.Opt.ParseHTMLDisabled && r.IsHTML() {
        r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
    }
}

// metricsResponseMiddleware sets stats
func metricsResponseMiddleware(g *Geziyor, r *client.Response) {
    g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
}

// extractorsMiddleware extracts data from loaders conf and exports it to exporters
func extractorsMiddleware(g *Geziyor, r *client.Response) {

    // Check if we have extractors and exporters
    if len(g.Opt.Extractors) != 0 && len(g.Opt.Exporters) != 0 {
        exports := map[string]interface{}{}

        for _, extractor := range g.Opt.Extractors {
            extracted, err := extractor.Extract(r.HTMLDoc.Selection)
            if err != nil {
                log.Println("extraction error: ", err)
                continue
            }

            // Check extracted data type and use it accordingly
            val := reflect.ValueOf(extracted)
            switch val.Kind() {
            case reflect.Map:
                r := val.MapRange()
                for r.Next() {
                    exports[fmt.Sprint(r.Key())] = r.Value().Interface()
                }
            }
        }
        g.Exports <- exports
    }
}
19 middleware/allowed_domains.go Normal file
@@ -0,0 +1,19 @@
package middleware

import (
    "github.com/geziyor/geziyor/client"
    "github.com/geziyor/geziyor/internal"
)

// AllowedDomains checks whether the request host is in the AllowedDomains list
type AllowedDomains struct {
    AllowedDomains []string
}

func (a *AllowedDomains) ProcessRequest(r *client.Request) {
    if len(a.AllowedDomains) != 0 && !internal.Contains(a.AllowedDomains, r.Host) {
        //log.Printf("Domain not allowed: %s\n", req.Host)
        r.Cancel()
        return
    }
}
30 middleware/delay.go Normal file
@@ -0,0 +1,30 @@
package middleware

import (
    "github.com/geziyor/geziyor/client"
    "math/rand"
    "time"
)

// delay delays requests
type delay struct {
    requestDelayRandomize bool
    requestDelay          time.Duration
}

func NewDelay(requestDelayRandomize bool, requestDelay time.Duration) RequestProcessor {
    if requestDelayRandomize {
        rand.Seed(time.Now().UnixNano())
    }
    return &delay{requestDelayRandomize: requestDelayRandomize, requestDelay: requestDelay}
}

func (a *delay) ProcessRequest(r *client.Request) {
    if a.requestDelayRandomize {
        min := float64(a.requestDelay) * 0.5
        max := float64(a.requestDelay) * 1.5
        time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
    } else {
        time.Sleep(a.requestDelay)
    }
}
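The randomized branch sleeps for a uniformly distributed duration in [0.5×delay, 1.5×delay). A self-contained sketch of the same arithmetic, with an illustrative 1-second delay:

```go
package main

import (
    "fmt"
    "math/rand"
    "time"
)

func main() {
    // Same computation as delay.ProcessRequest above: for a 1s delay,
    // pick a uniformly random sleep in [500ms, 1500ms).
    requestDelay := 1 * time.Second
    min := float64(requestDelay) * 0.5
    max := float64(requestDelay) * 1.5
    sleep := time.Duration(rand.Intn(int(max-min)) + int(min))
    fmt.Println(sleep) // e.g. 1.2342s
}
```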
@@ -1,7 +1,7 @@
package geziyor
package middleware

import (
    "fmt"
    "github.com/stretchr/testify/assert"
    "math/rand"
    "testing"
    "time"
@@ -13,5 +13,7 @@ func TestRandomDelay(t *testing.T) {
    min := float64(delay) * 0.5
    max := float64(delay) * 1.5
    randomDelay := rand.Intn(int(max-min)) + int(min)
    fmt.Println(time.Duration(randomDelay))

    assert.True(t, time.Duration(randomDelay).Seconds() < 1.5)
    assert.True(t, time.Duration(randomDelay).Seconds() > 0.5)
}
21 middleware/duplicate_requests.go Normal file
@@ -0,0 +1,21 @@
package middleware

import (
    "github.com/geziyor/geziyor/client"
    "sync"
)

// DuplicateRequests checks for already visited URLs
type DuplicateRequests struct {
    RevisitEnabled bool
    visitedURLs    sync.Map
}

func (a *DuplicateRequests) ProcessRequest(r *client.Request) {
    if !a.RevisitEnabled && r.Request.Method == "GET" {
        if _, visited := a.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
            //log.Printf("URL already visited %s\n")
            r.Cancel()
        }
    }
}
17 middleware/headers.go Normal file
@@ -0,0 +1,17 @@
package middleware

import (
    "github.com/geziyor/geziyor/client"
)

// Headers sets default request headers
type Headers struct {
    UserAgent string
}

func (a *Headers) ProcessRequest(r *client.Request) {
    r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
    r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
    r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
    r.Header = client.SetDefaultHeader(r.Header, "User-Agent", a.UserAgent)
}
18 middleware/log_stats.go Normal file
@@ -0,0 +1,18 @@
package middleware

import (
    "github.com/geziyor/geziyor/client"
    "log"
)

// LogStats logs responses
type LogStats struct {
    LogDisabled bool
}

func (p *LogStats) ProcessResponse(r *client.Response) {
    // LogDisabled check is not necessary, but done here for performance reasons
    if !p.LogDisabled {
        log.Printf("Crawled: (%d) <%s %s>", r.StatusCode, r.Request.Method, r.Request.URL.String())
    }
}
19 middleware/metrics.go Normal file
@@ -0,0 +1,19 @@
package middleware

import (
    "github.com/geziyor/geziyor/client"
    "github.com/geziyor/geziyor/metrics"
)

// Metrics sets stats for request and responses
type Metrics struct {
    Metrics *metrics.Metrics
}

func (a *Metrics) ProcessRequest(r *client.Request) {
    a.Metrics.RequestCounter.With("method", r.Method).Add(1)
}

func (a *Metrics) ProcessResponse(r *client.Response) {
    a.Metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
}
21 middleware/middleware.go Normal file
@@ -0,0 +1,21 @@
package middleware

import (
    "github.com/geziyor/geziyor/client"
)

type RequestResponseProcessor interface {
    RequestProcessor
    ResponseProcessor
}

// RequestProcessor is called before requests are made.
// Set request.Cancelled = true to cancel the request.
type RequestProcessor interface {
    ProcessRequest(r *client.Request)
}

// ResponseProcessor is called after a response is received.
type ResponseProcessor interface {
    ProcessResponse(r *client.Response)
}
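A type implementing both methods satisfies RequestResponseProcessor, which is how the Metrics middleware above gets attached to both chains. A hedged sketch; the Tally type is hypothetical, and it uses atomics because Geziyor issues requests concurrently:

```go
package tally

import (
    "sync/atomic"

    "github.com/geziyor/geziyor/client"
    "github.com/geziyor/geziyor/middleware"
)

// Tally is a hypothetical middleware that counts requests and responses.
// Implementing both methods makes it usable on both middleware chains,
// exactly like the built-in Metrics type.
type Tally struct {
    Requests  int64
    Responses int64
}

func (t *Tally) ProcessRequest(r *client.Request)   { atomic.AddInt64(&t.Requests, 1) }
func (t *Tally) ProcessResponse(r *client.Response) { atomic.AddInt64(&t.Responses, 1) }

// Compile-time check that Tally satisfies both interfaces.
var _ middleware.RequestResponseProcessor = (*Tally)(nil)
```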
18 middleware/parse_html.go Normal file
@@ -0,0 +1,18 @@
package middleware

import (
    "bytes"
    "github.com/PuerkitoBio/goquery"
    "github.com/geziyor/geziyor/client"
)

// ParseHTML parses response if response is HTML
type ParseHTML struct {
    ParseHTMLDisabled bool
}

func (p *ParseHTML) ProcessResponse(r *client.Response) {
    if !p.ParseHTMLDisabled && r.IsHTML() {
        r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
    }
}
@@ -4,8 +4,8 @@ import (
    "github.com/fpfeng/httpcache"
    "github.com/geziyor/geziyor/client"
    "github.com/geziyor/geziyor/export"
    "github.com/geziyor/geziyor/extract"
    "github.com/geziyor/geziyor/metrics"
    "github.com/geziyor/geziyor/middleware"
    "time"
)

@@ -24,9 +24,6 @@ type Options struct {
    // ParseFunc is callback of StartURLs response.
    ParseFunc func(g *Geziyor, r *client.Response)

    // Extractors extracts items from pages
    Extractors []extract.Extractor

    // Timeout is global request timeout
    Timeout time.Duration

@@ -56,10 +53,10 @@ type Options struct {
    Exporters []export.Exporter

    // Called before requests made to manipulate requests
    RequestMiddlewares []RequestMiddleware
    RequestMiddlewares []middleware.RequestProcessor

    // Called after response received
    ResponseMiddlewares []ResponseMiddleware
    ResponseMiddlewares []middleware.ResponseProcessor

    // Max body reading size in bytes. Default: 1GB
    MaxBodySize int64
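Custom processors plug into these exported options the same way the built-ins do. A minimal end-to-end sketch; the skipLogin type and its URL check are hypothetical:

```go
package main

import (
    "strings"

    "github.com/geziyor/geziyor"
    "github.com/geziyor/geziyor/client"
    "github.com/geziyor/geziyor/middleware"
)

// skipLogin is a hypothetical RequestProcessor that cancels requests
// to login pages, the same way AllowedDomains cancels disallowed hosts.
type skipLogin struct{}

func (s *skipLogin) ProcessRequest(r *client.Request) {
    if strings.Contains(r.URL.Path, "/login") {
        r.Cancel()
    }
}

func main() {
    geziyor.NewGeziyor(&geziyor.Options{
        StartURLs:          []string{"http://quotes.toscrape.com/"},
        RequestMiddlewares: []middleware.RequestProcessor{&skipLogin{}},
    }).Start()
}
```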