Middlewares refactored into multiple files in the middleware package.
Extractors removed because they add complexity to the scraper, both for learning it and for developing it.
This commit is contained in:
parent
9adff75509
commit
2cab68d2ce
23
README.md
23
README.md
@ -104,28 +104,6 @@ geziyor.NewGeziyor(&geziyor.Options{
|
|||||||
|
|
||||||
### Extracting Data
|
### Extracting Data
|
||||||
|
|
||||||
#### Extractors
|
|
||||||
You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor) to ```[]Extractors``` option to extract structured data.
|
|
||||||
```Exporters``` need to be defined in order extractors to work.
|
|
||||||
|
|
||||||
```go
|
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
|
||||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
|
||||||
Extractors: []extract.Extractor{
|
|
||||||
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
|
||||||
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
|
||||||
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
|
||||||
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
|
||||||
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
|
||||||
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
|
||||||
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
|
||||||
},
|
|
||||||
Exporters: []export.Exporter{&export.JSON{}},
|
|
||||||
}).Start()
|
|
||||||
```
|
|
||||||
|
|
||||||
#### HTML selectors
|
|
||||||
|
|
||||||
We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).
|
We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).
|
||||||
|
|
||||||
HTMLDoc is accessible on Response if the response is HTML and can be parsed using Go's built-in HTML [parser](https://godoc.org/golang.org/x/net/html#Parse)
|
HTMLDoc is accessible on Response if the response is HTML and can be parsed using Go's built-in HTML [parser](https://godoc.org/golang.org/x/net/html#Parse)
|
||||||
@ -183,7 +161,6 @@ ok github.com/geziyor/geziyor 22.861s
|
|||||||
If you're interested in helping this project, please consider these features:
|
If you're interested in helping this project, please consider these features:
|
||||||
|
|
||||||
- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
|
- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
|
||||||
- ~~Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))~~
|
|
||||||
- Deploying Scrapers to Cloud
|
- Deploying Scrapers to Cloud
|
||||||
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
|
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
|
||||||
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
|
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
package export
|
package export
|
||||||
|
|
||||||
// Exporter interface is for exporting data to external resources.
|
// Exporter interface is for exporting data to external resources.
|
||||||
// Geziyor calls every extractors Export functions before any scraping starts.
|
|
||||||
// Export functions should wait for new data from exports chan.
|
// Export functions should wait for new data from exports chan.
|
||||||
type Exporter interface {
|
type Exporter interface {
|
||||||
Export(exports chan interface{})
|
Export(exports chan interface{})
|
||||||
|
@ -1,24 +0,0 @@
|
|||||||
package extract
|
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
|
||||||
)
|
|
||||||
|
|
||||||
var ErrAttrNotExists = errors.New("attribute not exist")
|
|
||||||
|
|
||||||
// Attr returns HTML attribute value of provided selector
|
|
||||||
type Attr struct {
|
|
||||||
Name string
|
|
||||||
Selector string
|
|
||||||
Attr string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract returns HTML attribute value of provided selector
|
|
||||||
func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
|
|
||||||
attr, exists := sel.Find(e.Selector).Attr(e.Attr)
|
|
||||||
if !exists {
|
|
||||||
return nil, ErrAttrNotExists
|
|
||||||
}
|
|
||||||
return map[string]string{e.Name: attr}, nil
|
|
||||||
}
|
|
@ -1,8 +0,0 @@
|
|||||||
package extract
|
|
||||||
|
|
||||||
import "github.com/PuerkitoBio/goquery"
|
|
||||||
|
|
||||||
// Extractor interface is for extracting data from HTML document
|
|
||||||
type Extractor interface {
|
|
||||||
Extract(sel *goquery.Selection) (interface{}, error)
|
|
||||||
}
|
|
@ -1,52 +0,0 @@
|
|||||||
package extract
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
|
||||||
"golang.org/x/net/html"
|
|
||||||
)
|
|
||||||
|
|
||||||
// HTML extracts and returns the HTML from inside each element of the given selection.
|
|
||||||
type HTML struct {
|
|
||||||
Name string
|
|
||||||
Selector string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract extracts and returns the HTML from inside each element of the given selection.
|
|
||||||
func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) {
|
|
||||||
var ret, h string
|
|
||||||
var err error
|
|
||||||
|
|
||||||
sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
|
|
||||||
h, err = s.Html()
|
|
||||||
if err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
ret += h
|
|
||||||
return true
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return map[string]string{e.Name: ret}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// OuterHTML extracts and returns the HTML of each element of the given selection.
|
|
||||||
type OuterHTML struct {
|
|
||||||
Name string
|
|
||||||
Selector string
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract extracts and returns the HTML of each element of the given selection.
|
|
||||||
func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) {
|
|
||||||
output := bytes.NewBufferString("")
|
|
||||||
for _, node := range sel.Find(e.Selector).Nodes {
|
|
||||||
if err := html.Render(output, node); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return map[string]string{e.Name: output.String()}, nil
|
|
||||||
}
|
|
@ -1,22 +0,0 @@
|
|||||||
package extract
|
|
||||||
|
|
||||||
import (
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
|
||||||
"strings"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Text returns the combined text contents of provided selector.
|
|
||||||
type Text struct {
|
|
||||||
Name string
|
|
||||||
Selector string
|
|
||||||
TrimSpace bool
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract returns the combined text contents of provided selector.
|
|
||||||
func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
|
|
||||||
text := sel.Find(e.Selector).Text()
|
|
||||||
if e.TrimSpace {
|
|
||||||
text = strings.TrimSpace(text)
|
|
||||||
}
|
|
||||||
return map[string]string{e.Name: text}, nil
|
|
||||||
}
|
|
48
geziyor.go
48
geziyor.go
@ -4,9 +4,12 @@ import (
|
|||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
"github.com/geziyor/geziyor/client"
|
"github.com/geziyor/geziyor/client"
|
||||||
"github.com/geziyor/geziyor/metrics"
|
"github.com/geziyor/geziyor/metrics"
|
||||||
|
"github.com/geziyor/geziyor/middleware"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
"net/http/cookiejar"
|
"net/http/cookiejar"
|
||||||
|
"os"
|
||||||
|
"runtime/debug"
|
||||||
"sync"
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -17,8 +20,8 @@ type Geziyor struct {
|
|||||||
Exports chan interface{}
|
Exports chan interface{}
|
||||||
|
|
||||||
metrics *metrics.Metrics
|
metrics *metrics.Metrics
|
||||||
requestMiddlewares []RequestMiddleware
|
requestMiddlewares []middleware.RequestProcessor
|
||||||
responseMiddlewares []ResponseMiddleware
|
responseMiddlewares []middleware.ResponseProcessor
|
||||||
wgRequests sync.WaitGroup
|
wgRequests sync.WaitGroup
|
||||||
wgExporters sync.WaitGroup
|
wgExporters sync.WaitGroup
|
||||||
semGlobal chan struct{}
|
semGlobal chan struct{}
|
||||||
@ -26,7 +29,6 @@ type Geziyor struct {
|
|||||||
sync.RWMutex
|
sync.RWMutex
|
||||||
hostSems map[string]chan struct{}
|
hostSems map[string]chan struct{}
|
||||||
}
|
}
|
||||||
visitedURLs sync.Map
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewGeziyor creates new Geziyor with default values.
|
// NewGeziyor creates new Geziyor with default values.
|
||||||
@ -35,22 +37,23 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
geziyor := &Geziyor{
|
geziyor := &Geziyor{
|
||||||
Opt: opt,
|
Opt: opt,
|
||||||
Exports: make(chan interface{}, 1),
|
Exports: make(chan interface{}, 1),
|
||||||
requestMiddlewares: []RequestMiddleware{
|
requestMiddlewares: []middleware.RequestProcessor{
|
||||||
allowedDomainsMiddleware,
|
&middleware.AllowedDomains{AllowedDomains: opt.AllowedDomains},
|
||||||
duplicateRequestsMiddleware,
|
&middleware.DuplicateRequests{RevisitEnabled: opt.URLRevisitEnabled},
|
||||||
defaultHeadersMiddleware,
|
&middleware.Headers{UserAgent: opt.UserAgent},
|
||||||
delayMiddleware,
|
middleware.NewDelay(opt.RequestDelayRandomize, opt.RequestDelay),
|
||||||
logMiddleware,
|
|
||||||
metricsRequestMiddleware,
|
|
||||||
},
|
},
|
||||||
responseMiddlewares: []ResponseMiddleware{
|
responseMiddlewares: []middleware.ResponseProcessor{
|
||||||
parseHTMLMiddleware,
|
&middleware.ParseHTML{ParseHTMLDisabled: opt.ParseHTMLDisabled},
|
||||||
metricsResponseMiddleware,
|
&middleware.LogStats{LogDisabled: opt.LogDisabled},
|
||||||
extractorsMiddleware,
|
|
||||||
},
|
},
|
||||||
metrics: metrics.NewMetrics(opt.MetricsType),
|
metrics: metrics.NewMetrics(opt.MetricsType),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
|
||||||
|
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, metricsMiddleware)
|
||||||
|
geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, metricsMiddleware)
|
||||||
|
|
||||||
// Default
|
// Default
|
||||||
if opt.UserAgent == "" {
|
if opt.UserAgent == "" {
|
||||||
opt.UserAgent = client.DefaultUserAgent
|
opt.UserAgent = client.DefaultUserAgent
|
||||||
@ -95,6 +98,8 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
// Logging
|
// Logging
|
||||||
if opt.LogDisabled {
|
if opt.LogDisabled {
|
||||||
log.SetOutput(ioutil.Discard)
|
log.SetOutput(ioutil.Discard)
|
||||||
|
} else {
|
||||||
|
log.SetOutput(os.Stdout)
|
||||||
}
|
}
|
||||||
|
|
||||||
return geziyor
|
return geziyor
|
||||||
@ -193,10 +198,10 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
|
|||||||
if !req.Synchronized {
|
if !req.Synchronized {
|
||||||
defer g.wgRequests.Done()
|
defer g.wgRequests.Done()
|
||||||
}
|
}
|
||||||
defer recoverMiddleware(g, req)
|
defer g.recoverMe()
|
||||||
|
|
||||||
for _, middlewareFunc := range g.requestMiddlewares {
|
for _, middlewareFunc := range g.requestMiddlewares {
|
||||||
middlewareFunc(g, req)
|
middlewareFunc.ProcessRequest(req)
|
||||||
if req.Cancelled {
|
if req.Cancelled {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -209,7 +214,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, middlewareFunc := range g.responseMiddlewares {
|
for _, middlewareFunc := range g.responseMiddlewares {
|
||||||
middlewareFunc(g, res)
|
middlewareFunc.ProcessResponse(res)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Callbacks
|
// Callbacks
|
||||||
@ -248,3 +253,12 @@ func (g *Geziyor) releaseSem(req *client.Request) {
|
|||||||
<-g.semHosts.hostSems[req.Host]
|
<-g.semHosts.hostSems[req.Host]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// recoverMe recovers from a panic so that it does not crash the scraper.
|
||||||
|
// Logs error and stack trace
|
||||||
|
func (g *Geziyor) recoverMe() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
log.Println(r, string(debug.Stack()))
|
||||||
|
g.metrics.PanicCounter.Add(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -8,7 +8,6 @@ import (
|
|||||||
"github.com/geziyor/geziyor"
|
"github.com/geziyor/geziyor"
|
||||||
"github.com/geziyor/geziyor/client"
|
"github.com/geziyor/geziyor/client"
|
||||||
"github.com/geziyor/geziyor/export"
|
"github.com/geziyor/geziyor/export"
|
||||||
"github.com/geziyor/geziyor/extract"
|
|
||||||
"github.com/geziyor/geziyor/metrics"
|
"github.com/geziyor/geziyor/metrics"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
@ -158,22 +157,6 @@ func TestBasicAuth(t *testing.T) {
|
|||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestExtractor(t *testing.T) {
|
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
|
||||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
|
||||||
Extractors: []extract.Extractor{
|
|
||||||
extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
|
||||||
extract.Text{Name: "title", Selector: ".c-page-title"},
|
|
||||||
extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
|
||||||
extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
|
||||||
extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
|
||||||
extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
|
||||||
extract.Text{Name: "content", Selector: ".c-entry-content"},
|
|
||||||
},
|
|
||||||
Exporters: []export.Exporter{&export.JSON{}},
|
|
||||||
}).Start()
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestRedirect(t *testing.T) {
|
func TestRedirect(t *testing.T) {
|
||||||
defer leaktest.Check(t)()
|
defer leaktest.Check(t)()
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
|
131
middleware.go
131
middleware.go
@ -1,131 +0,0 @@
|
|||||||
package geziyor
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"fmt"
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
|
||||||
"github.com/geziyor/geziyor/client"
|
|
||||||
"github.com/geziyor/geziyor/internal"
|
|
||||||
"log"
|
|
||||||
"math/rand"
|
|
||||||
"os"
|
|
||||||
"reflect"
|
|
||||||
"runtime/debug"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
// RequestMiddleware called before requests made.
|
|
||||||
// Set request.Cancelled = true to cancel request
|
|
||||||
type RequestMiddleware func(g *Geziyor, r *client.Request)
|
|
||||||
|
|
||||||
// ResponseMiddleware called after request response receive
|
|
||||||
type ResponseMiddleware func(g *Geziyor, r *client.Response)
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
log.SetOutput(os.Stdout)
|
|
||||||
rand.Seed(time.Now().UnixNano())
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---* REQUEST MIDDLEWARES *---
|
|
||||||
|
|
||||||
// recoverMiddleware recovers scraping being crashed.
|
|
||||||
// Logs error and stack trace
|
|
||||||
func recoverMiddleware(g *Geziyor, r *client.Request) {
|
|
||||||
if r := recover(); r != nil {
|
|
||||||
log.Println(r, string(debug.Stack()))
|
|
||||||
g.metrics.PanicCounter.Add(1)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
|
|
||||||
func allowedDomainsMiddleware(g *Geziyor, r *client.Request) {
|
|
||||||
if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
|
|
||||||
//log.Printf("Domain not allowed: %s\n", req.Host)
|
|
||||||
r.Cancel()
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// duplicateRequestsMiddleware checks for already visited URLs
|
|
||||||
func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) {
|
|
||||||
if !g.Opt.URLRevisitEnabled && r.Request.Method == "GET" {
|
|
||||||
if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
|
|
||||||
//log.Printf("URL already visited %s\n", rawURL)
|
|
||||||
r.Cancel()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// defaultHeadersMiddleware sets default request headers
|
|
||||||
func defaultHeadersMiddleware(g *Geziyor, r *client.Request) {
|
|
||||||
r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
|
||||||
r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
|
|
||||||
r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
|
|
||||||
r.Header = client.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
|
|
||||||
}
|
|
||||||
|
|
||||||
// delayMiddleware delays requests
|
|
||||||
func delayMiddleware(g *Geziyor, r *client.Request) {
|
|
||||||
if g.Opt.RequestDelayRandomize {
|
|
||||||
min := float64(g.Opt.RequestDelay) * 0.5
|
|
||||||
max := float64(g.Opt.RequestDelay) * 1.5
|
|
||||||
time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
|
|
||||||
} else {
|
|
||||||
time.Sleep(g.Opt.RequestDelay)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// logMiddleware logs requests
|
|
||||||
func logMiddleware(g *Geziyor, r *client.Request) {
|
|
||||||
// LogDisabled check is not necessary, but done here for performance reasons
|
|
||||||
if !g.Opt.LogDisabled {
|
|
||||||
log.Println("Fetching: ", r.URL.String())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// metricsRequestMiddleware sets stats
|
|
||||||
func metricsRequestMiddleware(g *Geziyor, r *client.Request) {
|
|
||||||
g.metrics.RequestCounter.With("method", r.Method).Add(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---* RESPONSE MIDDLEWARES *---
|
|
||||||
|
|
||||||
// parseHTMLMiddleware parses response if response is HTML
|
|
||||||
func parseHTMLMiddleware(g *Geziyor, r *client.Response) {
|
|
||||||
if !g.Opt.ParseHTMLDisabled && r.IsHTML() {
|
|
||||||
r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// metricsResponseMiddleware sets stats
|
|
||||||
func metricsResponseMiddleware(g *Geziyor, r *client.Response) {
|
|
||||||
g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
// extractorsMiddleware extracts data from loaders conf and exports it to exporters
|
|
||||||
func extractorsMiddleware(g *Geziyor, r *client.Response) {
|
|
||||||
|
|
||||||
// Check if we have extractors and exporters
|
|
||||||
if len(g.Opt.Extractors) != 0 && len(g.Opt.Exporters) != 0 {
|
|
||||||
exports := map[string]interface{}{}
|
|
||||||
|
|
||||||
for _, extractor := range g.Opt.Extractors {
|
|
||||||
extracted, err := extractor.Extract(r.HTMLDoc.Selection)
|
|
||||||
if err != nil {
|
|
||||||
log.Println("extraction error: ", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check extracted data type and use it accordingly
|
|
||||||
val := reflect.ValueOf(extracted)
|
|
||||||
switch val.Kind() {
|
|
||||||
case reflect.Map:
|
|
||||||
r := val.MapRange()
|
|
||||||
for r.Next() {
|
|
||||||
exports[fmt.Sprint(r.Key())] = r.Value().Interface()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
g.Exports <- exports
|
|
||||||
}
|
|
||||||
}
|
|
19
middleware/allowed_domains.go
Normal file
19
middleware/allowed_domains.go
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/geziyor/geziyor/client"
|
||||||
|
"github.com/geziyor/geziyor/internal"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AllowedDomains checks for request host if it exists in AllowedDomains
|
||||||
|
type AllowedDomains struct {
|
||||||
|
AllowedDomains []string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *AllowedDomains) ProcessRequest(r *client.Request) {
|
||||||
|
if len(a.AllowedDomains) != 0 && !internal.Contains(a.AllowedDomains, r.Host) {
|
||||||
|
//log.Printf("Domain not allowed: %s\n", req.Host)
|
||||||
|
r.Cancel()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
30
middleware/delay.go
Normal file
30
middleware/delay.go
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/geziyor/geziyor/client"
|
||||||
|
"math/rand"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// delay delays requests
|
||||||
|
type delay struct {
|
||||||
|
requestDelayRandomize bool
|
||||||
|
requestDelay time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewDelay(requestDelayRandomize bool, requestDelay time.Duration) RequestProcessor {
|
||||||
|
if requestDelayRandomize {
|
||||||
|
rand.Seed(time.Now().UnixNano())
|
||||||
|
}
|
||||||
|
return &delay{requestDelayRandomize: requestDelayRandomize, requestDelay: requestDelay}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *delay) ProcessRequest(r *client.Request) {
|
||||||
|
if a.requestDelayRandomize {
|
||||||
|
min := float64(a.requestDelay) * 0.5
|
||||||
|
max := float64(a.requestDelay) * 1.5
|
||||||
|
time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
|
||||||
|
} else {
|
||||||
|
time.Sleep(a.requestDelay)
|
||||||
|
}
|
||||||
|
}
|
@ -1,7 +1,7 @@
|
|||||||
package geziyor
|
package middleware
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"github.com/stretchr/testify/assert"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
@ -13,5 +13,7 @@ func TestRandomDelay(t *testing.T) {
|
|||||||
min := float64(delay) * 0.5
|
min := float64(delay) * 0.5
|
||||||
max := float64(delay) * 1.5
|
max := float64(delay) * 1.5
|
||||||
randomDelay := rand.Intn(int(max-min)) + int(min)
|
randomDelay := rand.Intn(int(max-min)) + int(min)
|
||||||
fmt.Println(time.Duration(randomDelay))
|
|
||||||
|
assert.True(t, time.Duration(randomDelay).Seconds() < 1.5)
|
||||||
|
assert.True(t, time.Duration(randomDelay).Seconds() > 0.5)
|
||||||
}
|
}
|
21
middleware/duplicate_requests.go
Normal file
21
middleware/duplicate_requests.go
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/geziyor/geziyor/client"
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DuplicateRequests checks for already visited URLs
|
||||||
|
type DuplicateRequests struct {
|
||||||
|
RevisitEnabled bool
|
||||||
|
visitedURLs sync.Map
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *DuplicateRequests) ProcessRequest(r *client.Request) {
|
||||||
|
if !a.RevisitEnabled && r.Request.Method == "GET" {
|
||||||
|
if _, visited := a.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
|
||||||
|
//log.Printf("URL already visited %s\n")
|
||||||
|
r.Cancel()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
17
middleware/headers.go
Normal file
17
middleware/headers.go
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/geziyor/geziyor/client"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Headers sets default request headers
|
||||||
|
type Headers struct {
|
||||||
|
UserAgent string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *Headers) ProcessRequest(r *client.Request) {
|
||||||
|
r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||||
|
r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
|
||||||
|
r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
|
||||||
|
r.Header = client.SetDefaultHeader(r.Header, "User-Agent", a.UserAgent)
|
||||||
|
}
|
18
middleware/log_stats.go
Normal file
18
middleware/log_stats.go
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/geziyor/geziyor/client"
|
||||||
|
"log"
|
||||||
|
)
|
||||||
|
|
||||||
|
// LogStats logs responses
|
||||||
|
type LogStats struct {
|
||||||
|
LogDisabled bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *LogStats) ProcessResponse(r *client.Response) {
|
||||||
|
// LogDisabled check is not necessary, but done here for performance reasons
|
||||||
|
if !p.LogDisabled {
|
||||||
|
log.Printf("Crawled: (%d) <%s %s>", r.StatusCode, r.Request.Method, r.Request.URL.String())
|
||||||
|
}
|
||||||
|
}
|
19
middleware/metrics.go
Normal file
19
middleware/metrics.go
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/geziyor/geziyor/client"
|
||||||
|
"github.com/geziyor/geziyor/metrics"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Metrics sets stats for request and responses
|
||||||
|
type Metrics struct {
|
||||||
|
Metrics *metrics.Metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *Metrics) ProcessRequest(r *client.Request) {
|
||||||
|
a.Metrics.RequestCounter.With("method", r.Method).Add(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *Metrics) ProcessResponse(r *client.Response) {
|
||||||
|
a.Metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
|
||||||
|
}
|
21
middleware/middleware.go
Normal file
21
middleware/middleware.go
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/geziyor/geziyor/client"
|
||||||
|
)
|
||||||
|
|
||||||
|
type RequestResponseProcessor interface {
|
||||||
|
RequestProcessor
|
||||||
|
ResponseProcessor
|
||||||
|
}
|
||||||
|
|
||||||
|
// RequestProcessor called before requests made.
|
||||||
|
// Set request.Cancelled = true to cancel request
|
||||||
|
type RequestProcessor interface {
|
||||||
|
ProcessRequest(r *client.Request)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResponseProcessor is called after a response is received.
|
||||||
|
type ResponseProcessor interface {
|
||||||
|
ProcessResponse(r *client.Response)
|
||||||
|
}
|
18
middleware/parse_html.go
Normal file
18
middleware/parse_html.go
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
"github.com/geziyor/geziyor/client"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ParseHTML parses response if response is HTML
|
||||||
|
type ParseHTML struct {
|
||||||
|
ParseHTMLDisabled bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *ParseHTML) ProcessResponse(r *client.Response) {
|
||||||
|
if !p.ParseHTMLDisabled && r.IsHTML() {
|
||||||
|
r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
|
||||||
|
}
|
||||||
|
}
|
@ -4,8 +4,8 @@ import (
|
|||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
"github.com/geziyor/geziyor/client"
|
"github.com/geziyor/geziyor/client"
|
||||||
"github.com/geziyor/geziyor/export"
|
"github.com/geziyor/geziyor/export"
|
||||||
"github.com/geziyor/geziyor/extract"
|
|
||||||
"github.com/geziyor/geziyor/metrics"
|
"github.com/geziyor/geziyor/metrics"
|
||||||
|
"github.com/geziyor/geziyor/middleware"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -24,9 +24,6 @@ type Options struct {
|
|||||||
// ParseFunc is callback of StartURLs response.
|
// ParseFunc is callback of StartURLs response.
|
||||||
ParseFunc func(g *Geziyor, r *client.Response)
|
ParseFunc func(g *Geziyor, r *client.Response)
|
||||||
|
|
||||||
// Extractors extracts items from pages
|
|
||||||
Extractors []extract.Extractor
|
|
||||||
|
|
||||||
// Timeout is global request timeout
|
// Timeout is global request timeout
|
||||||
Timeout time.Duration
|
Timeout time.Duration
|
||||||
|
|
||||||
@ -56,10 +53,10 @@ type Options struct {
|
|||||||
Exporters []export.Exporter
|
Exporters []export.Exporter
|
||||||
|
|
||||||
// Called before requests made to manipulate requests
|
// Called before requests made to manipulate requests
|
||||||
RequestMiddlewares []RequestMiddleware
|
RequestMiddlewares []middleware.RequestProcessor
|
||||||
|
|
||||||
// Called after response received
|
// Called after response received
|
||||||
ResponseMiddlewares []ResponseMiddleware
|
ResponseMiddlewares []middleware.ResponseProcessor
|
||||||
|
|
||||||
// Max body reading size in bytes. Default: 1GB
|
// Max body reading size in bytes. Default: 1GB
|
||||||
MaxBodySize int64
|
MaxBodySize int64
|
||||||
|
Loading…
x
Reference in New Issue
Block a user