Middlewares refactored to multiple files in middleware package.
Extractors removed, as they add complexity to the scraper, both for learning it and for developing it.
This commit is contained in:
		
							
								
								
									
										23
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										23
									
								
								README.md
									
									
									
									
									
								
							@@ -104,28 +104,6 @@ geziyor.NewGeziyor(&geziyor.Options{
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
### Extracting Data
 | 
					### Extracting Data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#### Extractors
 | 
					 | 
				
			||||||
You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor) to ```[]Extractors``` option to extract structured data. 
 | 
					 | 
				
			||||||
```Exporters``` need to be defined in order extractors to work.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
```go
 | 
					 | 
				
			||||||
geziyor.NewGeziyor(&geziyor.Options{
 | 
					 | 
				
			||||||
    StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
 | 
					 | 
				
			||||||
    Extractors: []extract.Extractor{
 | 
					 | 
				
			||||||
            &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
 | 
					 | 
				
			||||||
            &extract.Text{Name: "title", Selector: ".c-page-title"},
 | 
					 | 
				
			||||||
            &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
 | 
					 | 
				
			||||||
            &extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
 | 
					 | 
				
			||||||
            &extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
 | 
					 | 
				
			||||||
            &extract.Text{Name: "summary", Selector: ".c-entry-summary"},
 | 
					 | 
				
			||||||
            &extract.Text{Name: "content", Selector: ".c-entry-content"},
 | 
					 | 
				
			||||||
    },
 | 
					 | 
				
			||||||
    Exporters: []export.Exporter{&export.JSON{}},
 | 
					 | 
				
			||||||
}).Start()
 | 
					 | 
				
			||||||
```    
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#### HTML selectors
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).
 | 
					We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
HTMLDoc is accessible on Response if the response is HTML and can be parsed using Go's built-in HTML [parser](https://godoc.org/golang.org/x/net/html#Parse)
 | 
					HTMLDoc is accessible on Response if the response is HTML and can be parsed using Go's built-in HTML [parser](https://godoc.org/golang.org/x/net/html#Parse)
 | 
				
			||||||
@@ -183,7 +161,6 @@ ok  	github.com/geziyor/geziyor	22.861s
 | 
				
			|||||||
If you're interested in helping this project, please consider these features:
 | 
					If you're interested in helping this project, please consider these features:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
 | 
					- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
 | 
				
			||||||
- ~~Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))~~
 | 
					 | 
				
			||||||
- Deploying Scrapers to Cloud
 | 
					- Deploying Scrapers to Cloud
 | 
				
			||||||
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~ 
 | 
					- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~ 
 | 
				
			||||||
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
 | 
					- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,7 +1,6 @@
 | 
				
			|||||||
package export
 | 
					package export
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Exporter interface is for exporting data to external resources.
 | 
					// Exporter interface is for exporting data to external resources.
 | 
				
			||||||
// Geziyor calls every extractors Export functions before any scraping starts.
 | 
					 | 
				
			||||||
// Export functions should wait for new data from exports chan.
 | 
					// Export functions should wait for new data from exports chan.
 | 
				
			||||||
type Exporter interface {
 | 
					type Exporter interface {
 | 
				
			||||||
	Export(exports chan interface{})
 | 
						Export(exports chan interface{})
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,24 +0,0 @@
 | 
				
			|||||||
package extract
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import (
 | 
					 | 
				
			||||||
	"errors"
 | 
					 | 
				
			||||||
	"github.com/PuerkitoBio/goquery"
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
var ErrAttrNotExists = errors.New("attribute not exist")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Attr returns HTML attribute value of provided selector
 | 
					 | 
				
			||||||
type Attr struct {
 | 
					 | 
				
			||||||
	Name     string
 | 
					 | 
				
			||||||
	Selector string
 | 
					 | 
				
			||||||
	Attr     string
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Extract returns HTML attribute value of provided selector
 | 
					 | 
				
			||||||
func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
 | 
					 | 
				
			||||||
	attr, exists := sel.Find(e.Selector).Attr(e.Attr)
 | 
					 | 
				
			||||||
	if !exists {
 | 
					 | 
				
			||||||
		return nil, ErrAttrNotExists
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	return map[string]string{e.Name: attr}, nil
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,8 +0,0 @@
 | 
				
			|||||||
package extract
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import "github.com/PuerkitoBio/goquery"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Extractor interface is for extracting data from HTML document
 | 
					 | 
				
			||||||
type Extractor interface {
 | 
					 | 
				
			||||||
	Extract(sel *goquery.Selection) (interface{}, error)
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,52 +0,0 @@
 | 
				
			|||||||
package extract
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import (
 | 
					 | 
				
			||||||
	"bytes"
 | 
					 | 
				
			||||||
	"github.com/PuerkitoBio/goquery"
 | 
					 | 
				
			||||||
	"golang.org/x/net/html"
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// HTML extracts and returns the HTML from inside each element of the given selection.
 | 
					 | 
				
			||||||
type HTML struct {
 | 
					 | 
				
			||||||
	Name     string
 | 
					 | 
				
			||||||
	Selector string
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Extract extracts and returns the HTML from inside each element of the given selection.
 | 
					 | 
				
			||||||
func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) {
 | 
					 | 
				
			||||||
	var ret, h string
 | 
					 | 
				
			||||||
	var err error
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool {
 | 
					 | 
				
			||||||
		h, err = s.Html()
 | 
					 | 
				
			||||||
		if err != nil {
 | 
					 | 
				
			||||||
			return false
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		ret += h
 | 
					 | 
				
			||||||
		return true
 | 
					 | 
				
			||||||
	})
 | 
					 | 
				
			||||||
	if err != nil {
 | 
					 | 
				
			||||||
		return nil, err
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	return map[string]string{e.Name: ret}, nil
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// OuterHTML extracts and returns the HTML of each element of the given selection.
 | 
					 | 
				
			||||||
type OuterHTML struct {
 | 
					 | 
				
			||||||
	Name     string
 | 
					 | 
				
			||||||
	Selector string
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Extract extracts and returns the HTML of each element of the given selection.
 | 
					 | 
				
			||||||
func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) {
 | 
					 | 
				
			||||||
	output := bytes.NewBufferString("")
 | 
					 | 
				
			||||||
	for _, node := range sel.Find(e.Selector).Nodes {
 | 
					 | 
				
			||||||
		if err := html.Render(output, node); err != nil {
 | 
					 | 
				
			||||||
			return nil, err
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	return map[string]string{e.Name: output.String()}, nil
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
@@ -1,22 +0,0 @@
 | 
				
			|||||||
package extract
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import (
 | 
					 | 
				
			||||||
	"github.com/PuerkitoBio/goquery"
 | 
					 | 
				
			||||||
	"strings"
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Text returns the combined text contents of provided selector.
 | 
					 | 
				
			||||||
type Text struct {
 | 
					 | 
				
			||||||
	Name      string
 | 
					 | 
				
			||||||
	Selector  string
 | 
					 | 
				
			||||||
	TrimSpace bool
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Extract returns the combined text contents of provided selector.
 | 
					 | 
				
			||||||
func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
 | 
					 | 
				
			||||||
	text := sel.Find(e.Selector).Text()
 | 
					 | 
				
			||||||
	if e.TrimSpace {
 | 
					 | 
				
			||||||
		text = strings.TrimSpace(text)
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	return map[string]string{e.Name: text}, nil
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
							
								
								
									
										48
									
								
								geziyor.go
									
									
									
									
									
								
							
							
						
						
									
										48
									
								
								geziyor.go
									
									
									
									
									
								
							@@ -4,9 +4,12 @@ import (
 | 
				
			|||||||
	"github.com/fpfeng/httpcache"
 | 
						"github.com/fpfeng/httpcache"
 | 
				
			||||||
	"github.com/geziyor/geziyor/client"
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
	"github.com/geziyor/geziyor/metrics"
 | 
						"github.com/geziyor/geziyor/metrics"
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/middleware"
 | 
				
			||||||
	"io/ioutil"
 | 
						"io/ioutil"
 | 
				
			||||||
	"log"
 | 
						"log"
 | 
				
			||||||
	"net/http/cookiejar"
 | 
						"net/http/cookiejar"
 | 
				
			||||||
 | 
						"os"
 | 
				
			||||||
 | 
						"runtime/debug"
 | 
				
			||||||
	"sync"
 | 
						"sync"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -17,8 +20,8 @@ type Geziyor struct {
 | 
				
			|||||||
	Exports chan interface{}
 | 
						Exports chan interface{}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	metrics             *metrics.Metrics
 | 
						metrics             *metrics.Metrics
 | 
				
			||||||
	requestMiddlewares  []RequestMiddleware
 | 
						requestMiddlewares  []middleware.RequestProcessor
 | 
				
			||||||
	responseMiddlewares []ResponseMiddleware
 | 
						responseMiddlewares []middleware.ResponseProcessor
 | 
				
			||||||
	wgRequests          sync.WaitGroup
 | 
						wgRequests          sync.WaitGroup
 | 
				
			||||||
	wgExporters         sync.WaitGroup
 | 
						wgExporters         sync.WaitGroup
 | 
				
			||||||
	semGlobal           chan struct{}
 | 
						semGlobal           chan struct{}
 | 
				
			||||||
@@ -26,7 +29,6 @@ type Geziyor struct {
 | 
				
			|||||||
		sync.RWMutex
 | 
							sync.RWMutex
 | 
				
			||||||
		hostSems map[string]chan struct{}
 | 
							hostSems map[string]chan struct{}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	visitedURLs sync.Map
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// NewGeziyor creates new Geziyor with default values.
 | 
					// NewGeziyor creates new Geziyor with default values.
 | 
				
			||||||
@@ -35,22 +37,23 @@ func NewGeziyor(opt *Options) *Geziyor {
 | 
				
			|||||||
	geziyor := &Geziyor{
 | 
						geziyor := &Geziyor{
 | 
				
			||||||
		Opt:     opt,
 | 
							Opt:     opt,
 | 
				
			||||||
		Exports: make(chan interface{}, 1),
 | 
							Exports: make(chan interface{}, 1),
 | 
				
			||||||
		requestMiddlewares: []RequestMiddleware{
 | 
							requestMiddlewares: []middleware.RequestProcessor{
 | 
				
			||||||
			allowedDomainsMiddleware,
 | 
								&middleware.AllowedDomains{AllowedDomains: opt.AllowedDomains},
 | 
				
			||||||
			duplicateRequestsMiddleware,
 | 
								&middleware.DuplicateRequests{RevisitEnabled: opt.URLRevisitEnabled},
 | 
				
			||||||
			defaultHeadersMiddleware,
 | 
								&middleware.Headers{UserAgent: opt.UserAgent},
 | 
				
			||||||
			delayMiddleware,
 | 
								middleware.NewDelay(opt.RequestDelayRandomize, opt.RequestDelay),
 | 
				
			||||||
			logMiddleware,
 | 
					 | 
				
			||||||
			metricsRequestMiddleware,
 | 
					 | 
				
			||||||
		},
 | 
							},
 | 
				
			||||||
		responseMiddlewares: []ResponseMiddleware{
 | 
							responseMiddlewares: []middleware.ResponseProcessor{
 | 
				
			||||||
			parseHTMLMiddleware,
 | 
								&middleware.ParseHTML{ParseHTMLDisabled: opt.ParseHTMLDisabled},
 | 
				
			||||||
			metricsResponseMiddleware,
 | 
								&middleware.LogStats{LogDisabled: opt.LogDisabled},
 | 
				
			||||||
			extractorsMiddleware,
 | 
					 | 
				
			||||||
		},
 | 
							},
 | 
				
			||||||
		metrics: metrics.NewMetrics(opt.MetricsType),
 | 
							metrics: metrics.NewMetrics(opt.MetricsType),
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics}
 | 
				
			||||||
 | 
						geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, metricsMiddleware)
 | 
				
			||||||
 | 
						geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, metricsMiddleware)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Default
 | 
						// Default
 | 
				
			||||||
	if opt.UserAgent == "" {
 | 
						if opt.UserAgent == "" {
 | 
				
			||||||
		opt.UserAgent = client.DefaultUserAgent
 | 
							opt.UserAgent = client.DefaultUserAgent
 | 
				
			||||||
@@ -95,6 +98,8 @@ func NewGeziyor(opt *Options) *Geziyor {
 | 
				
			|||||||
	// Logging
 | 
						// Logging
 | 
				
			||||||
	if opt.LogDisabled {
 | 
						if opt.LogDisabled {
 | 
				
			||||||
		log.SetOutput(ioutil.Discard)
 | 
							log.SetOutput(ioutil.Discard)
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							log.SetOutput(os.Stdout)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return geziyor
 | 
						return geziyor
 | 
				
			||||||
@@ -193,10 +198,10 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
 | 
				
			|||||||
	if !req.Synchronized {
 | 
						if !req.Synchronized {
 | 
				
			||||||
		defer g.wgRequests.Done()
 | 
							defer g.wgRequests.Done()
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	defer recoverMiddleware(g, req)
 | 
						defer g.recoverMe()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for _, middlewareFunc := range g.requestMiddlewares {
 | 
						for _, middlewareFunc := range g.requestMiddlewares {
 | 
				
			||||||
		middlewareFunc(g, req)
 | 
							middlewareFunc.ProcessRequest(req)
 | 
				
			||||||
		if req.Cancelled {
 | 
							if req.Cancelled {
 | 
				
			||||||
			return
 | 
								return
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
@@ -209,7 +214,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
 | 
				
			|||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for _, middlewareFunc := range g.responseMiddlewares {
 | 
						for _, middlewareFunc := range g.responseMiddlewares {
 | 
				
			||||||
		middlewareFunc(g, res)
 | 
							middlewareFunc.ProcessResponse(res)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Callbacks
 | 
						// Callbacks
 | 
				
			||||||
@@ -248,3 +253,12 @@ func (g *Geziyor) releaseSem(req *client.Request) {
 | 
				
			|||||||
		<-g.semHosts.hostSems[req.Host]
 | 
							<-g.semHosts.hostSems[req.Host]
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// recoverMe recovers from a panic so that it does not crash the scraper.
 | 
				
			||||||
 | 
					// It logs the recovered value with a stack trace and increments the panic counter.
 | 
				
			||||||
 | 
					func (g *Geziyor) recoverMe() {
 | 
				
			||||||
 | 
						if r := recover(); r != nil {
 | 
				
			||||||
 | 
							log.Println(r, string(debug.Stack()))
 | 
				
			||||||
 | 
							g.metrics.PanicCounter.Add(1)
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -8,7 +8,6 @@ import (
 | 
				
			|||||||
	"github.com/geziyor/geziyor"
 | 
						"github.com/geziyor/geziyor"
 | 
				
			||||||
	"github.com/geziyor/geziyor/client"
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
	"github.com/geziyor/geziyor/export"
 | 
						"github.com/geziyor/geziyor/export"
 | 
				
			||||||
	"github.com/geziyor/geziyor/extract"
 | 
					 | 
				
			||||||
	"github.com/geziyor/geziyor/metrics"
 | 
						"github.com/geziyor/geziyor/metrics"
 | 
				
			||||||
	"net/http"
 | 
						"net/http"
 | 
				
			||||||
	"net/http/httptest"
 | 
						"net/http/httptest"
 | 
				
			||||||
@@ -158,22 +157,6 @@ func TestBasicAuth(t *testing.T) {
 | 
				
			|||||||
	}).Start()
 | 
						}).Start()
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func TestExtractor(t *testing.T) {
 | 
					 | 
				
			||||||
	geziyor.NewGeziyor(&geziyor.Options{
 | 
					 | 
				
			||||||
		StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
 | 
					 | 
				
			||||||
		Extractors: []extract.Extractor{
 | 
					 | 
				
			||||||
			extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
 | 
					 | 
				
			||||||
			extract.Text{Name: "title", Selector: ".c-page-title"},
 | 
					 | 
				
			||||||
			extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
 | 
					 | 
				
			||||||
			extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
 | 
					 | 
				
			||||||
			extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
 | 
					 | 
				
			||||||
			extract.Text{Name: "summary", Selector: ".c-entry-summary"},
 | 
					 | 
				
			||||||
			extract.Text{Name: "content", Selector: ".c-entry-content"},
 | 
					 | 
				
			||||||
		},
 | 
					 | 
				
			||||||
		Exporters: []export.Exporter{&export.JSON{}},
 | 
					 | 
				
			||||||
	}).Start()
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
func TestRedirect(t *testing.T) {
 | 
					func TestRedirect(t *testing.T) {
 | 
				
			||||||
	defer leaktest.Check(t)()
 | 
						defer leaktest.Check(t)()
 | 
				
			||||||
	geziyor.NewGeziyor(&geziyor.Options{
 | 
						geziyor.NewGeziyor(&geziyor.Options{
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										131
									
								
								middleware.go
									
									
									
									
									
								
							
							
						
						
									
										131
									
								
								middleware.go
									
									
									
									
									
								
							@@ -1,131 +0,0 @@
 | 
				
			|||||||
package geziyor
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import (
 | 
					 | 
				
			||||||
	"bytes"
 | 
					 | 
				
			||||||
	"fmt"
 | 
					 | 
				
			||||||
	"github.com/PuerkitoBio/goquery"
 | 
					 | 
				
			||||||
	"github.com/geziyor/geziyor/client"
 | 
					 | 
				
			||||||
	"github.com/geziyor/geziyor/internal"
 | 
					 | 
				
			||||||
	"log"
 | 
					 | 
				
			||||||
	"math/rand"
 | 
					 | 
				
			||||||
	"os"
 | 
					 | 
				
			||||||
	"reflect"
 | 
					 | 
				
			||||||
	"runtime/debug"
 | 
					 | 
				
			||||||
	"time"
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// RequestMiddleware called before requests made.
 | 
					 | 
				
			||||||
// Set request.Cancelled = true to cancel request
 | 
					 | 
				
			||||||
type RequestMiddleware func(g *Geziyor, r *client.Request)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// ResponseMiddleware called after request response receive
 | 
					 | 
				
			||||||
type ResponseMiddleware func(g *Geziyor, r *client.Response)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
func init() {
 | 
					 | 
				
			||||||
	log.SetOutput(os.Stdout)
 | 
					 | 
				
			||||||
	rand.Seed(time.Now().UnixNano())
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// ---* REQUEST MIDDLEWARES *---
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// recoverMiddleware recovers scraping being crashed.
 | 
					 | 
				
			||||||
// Logs error and stack trace
 | 
					 | 
				
			||||||
func recoverMiddleware(g *Geziyor, r *client.Request) {
 | 
					 | 
				
			||||||
	if r := recover(); r != nil {
 | 
					 | 
				
			||||||
		log.Println(r, string(debug.Stack()))
 | 
					 | 
				
			||||||
		g.metrics.PanicCounter.Add(1)
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
 | 
					 | 
				
			||||||
func allowedDomainsMiddleware(g *Geziyor, r *client.Request) {
 | 
					 | 
				
			||||||
	if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
 | 
					 | 
				
			||||||
		//log.Printf("Domain not allowed: %s\n", req.Host)
 | 
					 | 
				
			||||||
		r.Cancel()
 | 
					 | 
				
			||||||
		return
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// duplicateRequestsMiddleware checks for already visited URLs
 | 
					 | 
				
			||||||
func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) {
 | 
					 | 
				
			||||||
	if !g.Opt.URLRevisitEnabled && r.Request.Method == "GET" {
 | 
					 | 
				
			||||||
		if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
 | 
					 | 
				
			||||||
			//log.Printf("URL already visited %s\n", rawURL)
 | 
					 | 
				
			||||||
			r.Cancel()
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// defaultHeadersMiddleware sets default request headers
 | 
					 | 
				
			||||||
func defaultHeadersMiddleware(g *Geziyor, r *client.Request) {
 | 
					 | 
				
			||||||
	r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
 | 
					 | 
				
			||||||
	r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
 | 
					 | 
				
			||||||
	r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
 | 
					 | 
				
			||||||
	r.Header = client.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// delayMiddleware delays requests
 | 
					 | 
				
			||||||
func delayMiddleware(g *Geziyor, r *client.Request) {
 | 
					 | 
				
			||||||
	if g.Opt.RequestDelayRandomize {
 | 
					 | 
				
			||||||
		min := float64(g.Opt.RequestDelay) * 0.5
 | 
					 | 
				
			||||||
		max := float64(g.Opt.RequestDelay) * 1.5
 | 
					 | 
				
			||||||
		time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
 | 
					 | 
				
			||||||
	} else {
 | 
					 | 
				
			||||||
		time.Sleep(g.Opt.RequestDelay)
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// logMiddleware logs requests
 | 
					 | 
				
			||||||
func logMiddleware(g *Geziyor, r *client.Request) {
 | 
					 | 
				
			||||||
	// LogDisabled check is not necessary, but done here for performance reasons
 | 
					 | 
				
			||||||
	if !g.Opt.LogDisabled {
 | 
					 | 
				
			||||||
		log.Println("Fetching: ", r.URL.String())
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// metricsRequestMiddleware sets stats
 | 
					 | 
				
			||||||
func metricsRequestMiddleware(g *Geziyor, r *client.Request) {
 | 
					 | 
				
			||||||
	g.metrics.RequestCounter.With("method", r.Method).Add(1)
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// ---* RESPONSE MIDDLEWARES *---
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// parseHTMLMiddleware parses response if response is HTML
 | 
					 | 
				
			||||||
func parseHTMLMiddleware(g *Geziyor, r *client.Response) {
 | 
					 | 
				
			||||||
	if !g.Opt.ParseHTMLDisabled && r.IsHTML() {
 | 
					 | 
				
			||||||
		r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// metricsResponseMiddleware sets stats
 | 
					 | 
				
			||||||
func metricsResponseMiddleware(g *Geziyor, r *client.Response) {
 | 
					 | 
				
			||||||
	g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// extractorsMiddleware extracts data from loaders conf and exports it to exporters
 | 
					 | 
				
			||||||
func extractorsMiddleware(g *Geziyor, r *client.Response) {
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	// Check if we have extractors and exporters
 | 
					 | 
				
			||||||
	if len(g.Opt.Extractors) != 0 && len(g.Opt.Exporters) != 0 {
 | 
					 | 
				
			||||||
		exports := map[string]interface{}{}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		for _, extractor := range g.Opt.Extractors {
 | 
					 | 
				
			||||||
			extracted, err := extractor.Extract(r.HTMLDoc.Selection)
 | 
					 | 
				
			||||||
			if err != nil {
 | 
					 | 
				
			||||||
				log.Println("extraction error: ", err)
 | 
					 | 
				
			||||||
				continue
 | 
					 | 
				
			||||||
			}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
			// Check extracted data type and use it accordingly
 | 
					 | 
				
			||||||
			val := reflect.ValueOf(extracted)
 | 
					 | 
				
			||||||
			switch val.Kind() {
 | 
					 | 
				
			||||||
			case reflect.Map:
 | 
					 | 
				
			||||||
				r := val.MapRange()
 | 
					 | 
				
			||||||
				for r.Next() {
 | 
					 | 
				
			||||||
					exports[fmt.Sprint(r.Key())] = r.Value().Interface()
 | 
					 | 
				
			||||||
				}
 | 
					 | 
				
			||||||
			}
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
		g.Exports <- exports
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
							
								
								
									
										19
									
								
								middleware/allowed_domains.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								middleware/allowed_domains.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,19 @@
 | 
				
			|||||||
 | 
					package middleware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import (
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/internal"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// AllowedDomains cancels requests whose host is not in the AllowedDomains list; an empty list allows every host.
 | 
				
			||||||
 | 
					type AllowedDomains struct {
 | 
				
			||||||
 | 
						AllowedDomains []string
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (a *AllowedDomains) ProcessRequest(r *client.Request) {
 | 
				
			||||||
 | 
						if len(a.AllowedDomains) != 0 && !internal.Contains(a.AllowedDomains, r.Host) {
 | 
				
			||||||
 | 
							//log.Printf("Domain not allowed: %s\n", req.Host)
 | 
				
			||||||
 | 
							r.Cancel()
 | 
				
			||||||
 | 
							return
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										30
									
								
								middleware/delay.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								middleware/delay.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,30 @@
 | 
				
			|||||||
 | 
					package middleware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import (
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
 | 
						"math/rand"
 | 
				
			||||||
 | 
						"time"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// delay delays requests
 | 
				
			||||||
 | 
					type delay struct {
 | 
				
			||||||
 | 
						requestDelayRandomize bool
 | 
				
			||||||
 | 
						requestDelay          time.Duration
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func NewDelay(requestDelayRandomize bool, requestDelay time.Duration) RequestProcessor {
 | 
				
			||||||
 | 
						if requestDelayRandomize {
 | 
				
			||||||
 | 
							rand.Seed(time.Now().UnixNano())
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						return &delay{requestDelayRandomize: requestDelayRandomize, requestDelay: requestDelay}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (a *delay) ProcessRequest(r *client.Request) {
 | 
				
			||||||
 | 
						if a.requestDelayRandomize {
 | 
				
			||||||
 | 
							min := float64(a.requestDelay) * 0.5
 | 
				
			||||||
 | 
							max := float64(a.requestDelay) * 1.5
 | 
				
			||||||
 | 
							time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							time.Sleep(a.requestDelay)
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -1,7 +1,7 @@
 | 
				
			|||||||
package geziyor
 | 
					package middleware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import (
 | 
					import (
 | 
				
			||||||
	"fmt"
 | 
						"github.com/stretchr/testify/assert"
 | 
				
			||||||
	"math/rand"
 | 
						"math/rand"
 | 
				
			||||||
	"testing"
 | 
						"testing"
 | 
				
			||||||
	"time"
 | 
						"time"
 | 
				
			||||||
@@ -13,5 +13,7 @@ func TestRandomDelay(t *testing.T) {
 | 
				
			|||||||
	min := float64(delay) * 0.5
 | 
						min := float64(delay) * 0.5
 | 
				
			||||||
	max := float64(delay) * 1.5
 | 
						max := float64(delay) * 1.5
 | 
				
			||||||
	randomDelay := rand.Intn(int(max-min)) + int(min)
 | 
						randomDelay := rand.Intn(int(max-min)) + int(min)
 | 
				
			||||||
	fmt.Println(time.Duration(randomDelay))
 | 
					
 | 
				
			||||||
 | 
						assert.True(t, time.Duration(randomDelay).Seconds() < 1.5)
 | 
				
			||||||
 | 
						assert.True(t, time.Duration(randomDelay).Seconds() > 0.5)
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
							
								
								
									
										21
									
								
								middleware/duplicate_requests.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								middleware/duplicate_requests.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,21 @@
 | 
				
			|||||||
 | 
					package middleware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import (
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
 | 
						"sync"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// DuplicateRequests checks for already visited URLs
 | 
				
			||||||
 | 
					type DuplicateRequests struct {
 | 
				
			||||||
 | 
						RevisitEnabled bool
 | 
				
			||||||
 | 
						visitedURLs    sync.Map
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (a *DuplicateRequests) ProcessRequest(r *client.Request) {
 | 
				
			||||||
 | 
						if !a.RevisitEnabled && r.Request.Method == "GET" {
 | 
				
			||||||
 | 
							if _, visited := a.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
 | 
				
			||||||
 | 
								//log.Printf("URL already visited %s\n")
 | 
				
			||||||
 | 
								r.Cancel()
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										17
									
								
								middleware/headers.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								middleware/headers.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,17 @@
 | 
				
			|||||||
 | 
					package middleware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import (
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Headers sets default request headers
 | 
				
			||||||
 | 
					type Headers struct {
 | 
				
			||||||
 | 
						UserAgent string
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (a *Headers) ProcessRequest(r *client.Request) {
 | 
				
			||||||
 | 
						r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
 | 
				
			||||||
 | 
						r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
 | 
				
			||||||
 | 
						r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
 | 
				
			||||||
 | 
						r.Header = client.SetDefaultHeader(r.Header, "User-Agent", a.UserAgent)
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										18
									
								
								middleware/log_stats.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								middleware/log_stats.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,18 @@
 | 
				
			|||||||
 | 
					package middleware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import (
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
 | 
						"log"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// LogStats logs responses
 | 
				
			||||||
 | 
					type LogStats struct {
 | 
				
			||||||
 | 
						LogDisabled bool
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (p *LogStats) ProcessResponse(r *client.Response) {
 | 
				
			||||||
 | 
						// LogDisabled check is not necessary, but done here for performance reasons
 | 
				
			||||||
 | 
						if !p.LogDisabled {
 | 
				
			||||||
 | 
							log.Printf("Crawled: (%d) <%s %s>", r.StatusCode, r.Request.Method, r.Request.URL.String())
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										19
									
								
								middleware/metrics.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								middleware/metrics.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,19 @@
 | 
				
			|||||||
 | 
					package middleware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import (
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/metrics"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Metrics sets stats for request and responses
 | 
				
			||||||
 | 
					type Metrics struct {
 | 
				
			||||||
 | 
						Metrics *metrics.Metrics
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (a *Metrics) ProcessRequest(r *client.Request) {
 | 
				
			||||||
 | 
						a.Metrics.RequestCounter.With("method", r.Method).Add(1)
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (a *Metrics) ProcessResponse(r *client.Response) {
 | 
				
			||||||
 | 
						a.Metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										21
									
								
								middleware/middleware.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								middleware/middleware.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,21 @@
 | 
				
			|||||||
 | 
					package middleware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import (
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					type RequestResponseProcessor interface {
 | 
				
			||||||
 | 
						RequestProcessor
 | 
				
			||||||
 | 
						ResponseProcessor
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// RequestProcessor called before requests made.
 | 
				
			||||||
 | 
					// Set request.Cancelled = true to cancel request
 | 
				
			||||||
 | 
					type RequestProcessor interface {
 | 
				
			||||||
 | 
						ProcessRequest(r *client.Request)
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// ResponseProcessor called after request response receive
 | 
				
			||||||
 | 
					type ResponseProcessor interface {
 | 
				
			||||||
 | 
						ProcessResponse(r *client.Response)
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										18
									
								
								middleware/parse_html.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								middleware/parse_html.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,18 @@
 | 
				
			|||||||
 | 
					package middleware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import (
 | 
				
			||||||
 | 
						"bytes"
 | 
				
			||||||
 | 
						"github.com/PuerkitoBio/goquery"
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// ParseHTML parses response if response is HTML
 | 
				
			||||||
 | 
					type ParseHTML struct {
 | 
				
			||||||
 | 
						ParseHTMLDisabled bool
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func (p *ParseHTML) ProcessResponse(r *client.Response) {
 | 
				
			||||||
 | 
						if !p.ParseHTMLDisabled && r.IsHTML() {
 | 
				
			||||||
 | 
							r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -4,8 +4,8 @@ import (
 | 
				
			|||||||
	"github.com/fpfeng/httpcache"
 | 
						"github.com/fpfeng/httpcache"
 | 
				
			||||||
	"github.com/geziyor/geziyor/client"
 | 
						"github.com/geziyor/geziyor/client"
 | 
				
			||||||
	"github.com/geziyor/geziyor/export"
 | 
						"github.com/geziyor/geziyor/export"
 | 
				
			||||||
	"github.com/geziyor/geziyor/extract"
 | 
					 | 
				
			||||||
	"github.com/geziyor/geziyor/metrics"
 | 
						"github.com/geziyor/geziyor/metrics"
 | 
				
			||||||
 | 
						"github.com/geziyor/geziyor/middleware"
 | 
				
			||||||
	"time"
 | 
						"time"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -24,9 +24,6 @@ type Options struct {
 | 
				
			|||||||
	// ParseFunc is callback of StartURLs response.
 | 
						// ParseFunc is callback of StartURLs response.
 | 
				
			||||||
	ParseFunc func(g *Geziyor, r *client.Response)
 | 
						ParseFunc func(g *Geziyor, r *client.Response)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Extractors extracts items from pages
 | 
					 | 
				
			||||||
	Extractors []extract.Extractor
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	// Timeout is global request timeout
 | 
						// Timeout is global request timeout
 | 
				
			||||||
	Timeout time.Duration
 | 
						Timeout time.Duration
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -56,10 +53,10 @@ type Options struct {
 | 
				
			|||||||
	Exporters []export.Exporter
 | 
						Exporters []export.Exporter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Called before requests made to manipulate requests
 | 
						// Called before requests made to manipulate requests
 | 
				
			||||||
	RequestMiddlewares []RequestMiddleware
 | 
						RequestMiddlewares []middleware.RequestProcessor
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Called after response received
 | 
						// Called after response received
 | 
				
			||||||
	ResponseMiddlewares []ResponseMiddleware
 | 
						ResponseMiddlewares []middleware.ResponseProcessor
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Max body reading size in bytes. Default: 1GB
 | 
						// Max body reading size in bytes. Default: 1GB
 | 
				
			||||||
	MaxBodySize int64
 | 
						MaxBodySize int64
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user