Middlewares refactored to multiple files in middleware package.
Extractors removed, as they add complexity to the scraper, both when learning it and when developing it.
This commit is contained in:
		
							
								
								
									
										19
									
								
								middleware/allowed_domains.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								middleware/allowed_domains.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | ||||
| package middleware | ||||
|  | ||||
| import ( | ||||
| 	"github.com/geziyor/geziyor/client" | ||||
| 	"github.com/geziyor/geziyor/internal" | ||||
| ) | ||||
|  | ||||
| // AllowedDomains checks for request host if it exists in AllowedDomains | ||||
| type AllowedDomains struct { | ||||
| 	AllowedDomains []string | ||||
| } | ||||
|  | ||||
| func (a *AllowedDomains) ProcessRequest(r *client.Request) { | ||||
| 	if len(a.AllowedDomains) != 0 && !internal.Contains(a.AllowedDomains, r.Host) { | ||||
| 		//log.Printf("Domain not allowed: %s\n", req.Host) | ||||
| 		r.Cancel() | ||||
| 		return | ||||
| 	} | ||||
| } | ||||
							
								
								
									
										30
									
								
								middleware/delay.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								middleware/delay.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | ||||
| package middleware | ||||
|  | ||||
| import ( | ||||
| 	"github.com/geziyor/geziyor/client" | ||||
| 	"math/rand" | ||||
| 	"time" | ||||
| ) | ||||
|  | ||||
| // delay delays requests | ||||
| type delay struct { | ||||
| 	requestDelayRandomize bool | ||||
| 	requestDelay          time.Duration | ||||
| } | ||||
|  | ||||
| func NewDelay(requestDelayRandomize bool, requestDelay time.Duration) RequestProcessor { | ||||
| 	if requestDelayRandomize { | ||||
| 		rand.Seed(time.Now().UnixNano()) | ||||
| 	} | ||||
| 	return &delay{requestDelayRandomize: requestDelayRandomize, requestDelay: requestDelay} | ||||
| } | ||||
|  | ||||
| func (a *delay) ProcessRequest(r *client.Request) { | ||||
| 	if a.requestDelayRandomize { | ||||
| 		min := float64(a.requestDelay) * 0.5 | ||||
| 		max := float64(a.requestDelay) * 1.5 | ||||
| 		time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min))) | ||||
| 	} else { | ||||
| 		time.Sleep(a.requestDelay) | ||||
| 	} | ||||
| } | ||||
							
								
								
									
										19
									
								
								middleware/delay_test.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								middleware/delay_test.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | ||||
| package middleware | ||||
|  | ||||
| import ( | ||||
| 	"github.com/stretchr/testify/assert" | ||||
| 	"math/rand" | ||||
| 	"testing" | ||||
| 	"time" | ||||
| ) | ||||
|  | ||||
| func TestRandomDelay(t *testing.T) { | ||||
| 	rand.Seed(time.Now().UnixNano()) | ||||
| 	delay := time.Millisecond * 1000 | ||||
| 	min := float64(delay) * 0.5 | ||||
| 	max := float64(delay) * 1.5 | ||||
| 	randomDelay := rand.Intn(int(max-min)) + int(min) | ||||
|  | ||||
| 	assert.True(t, time.Duration(randomDelay).Seconds() < 1.5) | ||||
| 	assert.True(t, time.Duration(randomDelay).Seconds() > 0.5) | ||||
| } | ||||
							
								
								
									
										21
									
								
								middleware/duplicate_requests.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								middleware/duplicate_requests.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
| package middleware | ||||
|  | ||||
| import ( | ||||
| 	"github.com/geziyor/geziyor/client" | ||||
| 	"sync" | ||||
| ) | ||||
|  | ||||
| // DuplicateRequests checks for already visited URLs | ||||
| type DuplicateRequests struct { | ||||
| 	RevisitEnabled bool | ||||
| 	visitedURLs    sync.Map | ||||
| } | ||||
|  | ||||
| func (a *DuplicateRequests) ProcessRequest(r *client.Request) { | ||||
| 	if !a.RevisitEnabled && r.Request.Method == "GET" { | ||||
| 		if _, visited := a.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited { | ||||
| 			//log.Printf("URL already visited %s\n") | ||||
| 			r.Cancel() | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
							
								
								
									
										17
									
								
								middleware/headers.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								middleware/headers.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,17 @@ | ||||
| package middleware | ||||
|  | ||||
| import ( | ||||
| 	"github.com/geziyor/geziyor/client" | ||||
| ) | ||||
|  | ||||
| // Headers sets default request headers | ||||
| type Headers struct { | ||||
| 	UserAgent string | ||||
| } | ||||
|  | ||||
| func (a *Headers) ProcessRequest(r *client.Request) { | ||||
| 	r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") | ||||
| 	r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8") | ||||
| 	r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en") | ||||
| 	r.Header = client.SetDefaultHeader(r.Header, "User-Agent", a.UserAgent) | ||||
| } | ||||
							
								
								
									
										18
									
								
								middleware/log_stats.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								middleware/log_stats.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,18 @@ | ||||
| package middleware | ||||
|  | ||||
| import ( | ||||
| 	"github.com/geziyor/geziyor/client" | ||||
| 	"log" | ||||
| ) | ||||
|  | ||||
| // LogStats logs responses | ||||
| type LogStats struct { | ||||
| 	LogDisabled bool | ||||
| } | ||||
|  | ||||
| func (p *LogStats) ProcessResponse(r *client.Response) { | ||||
| 	// LogDisabled check is not necessary, but done here for performance reasons | ||||
| 	if !p.LogDisabled { | ||||
| 		log.Printf("Crawled: (%d) <%s %s>", r.StatusCode, r.Request.Method, r.Request.URL.String()) | ||||
| 	} | ||||
| } | ||||
							
								
								
									
										19
									
								
								middleware/metrics.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								middleware/metrics.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | ||||
| package middleware | ||||
|  | ||||
| import ( | ||||
| 	"github.com/geziyor/geziyor/client" | ||||
| 	"github.com/geziyor/geziyor/metrics" | ||||
| ) | ||||
|  | ||||
| // Metrics sets stats for request and responses | ||||
| type Metrics struct { | ||||
| 	Metrics *metrics.Metrics | ||||
| } | ||||
|  | ||||
| func (a *Metrics) ProcessRequest(r *client.Request) { | ||||
| 	a.Metrics.RequestCounter.With("method", r.Method).Add(1) | ||||
| } | ||||
|  | ||||
| func (a *Metrics) ProcessResponse(r *client.Response) { | ||||
| 	a.Metrics.ResponseCounter.With("method", r.Request.Method).Add(1) | ||||
| } | ||||
							
								
								
									
										21
									
								
								middleware/middleware.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								middleware/middleware.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
| package middleware | ||||
|  | ||||
| import ( | ||||
| 	"github.com/geziyor/geziyor/client" | ||||
| ) | ||||
|  | ||||
| type RequestResponseProcessor interface { | ||||
| 	RequestProcessor | ||||
| 	ResponseProcessor | ||||
| } | ||||
|  | ||||
| // RequestProcessor called before requests made. | ||||
| // Set request.Cancelled = true to cancel request | ||||
| type RequestProcessor interface { | ||||
| 	ProcessRequest(r *client.Request) | ||||
| } | ||||
|  | ||||
| // ResponseProcessor called after request response receive | ||||
| type ResponseProcessor interface { | ||||
| 	ProcessResponse(r *client.Response) | ||||
| } | ||||
							
								
								
									
										18
									
								
								middleware/parse_html.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								middleware/parse_html.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,18 @@ | ||||
| package middleware | ||||
|  | ||||
| import ( | ||||
| 	"bytes" | ||||
| 	"github.com/PuerkitoBio/goquery" | ||||
| 	"github.com/geziyor/geziyor/client" | ||||
| ) | ||||
|  | ||||
| // ParseHTML parses response if response is HTML | ||||
| type ParseHTML struct { | ||||
| 	ParseHTMLDisabled bool | ||||
| } | ||||
|  | ||||
| func (p *ParseHTML) ProcessResponse(r *client.Response) { | ||||
| 	if !p.ParseHTMLDisabled && r.IsHTML() { | ||||
| 		r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) | ||||
| 	} | ||||
| } | ||||
		Reference in New Issue
	
	Block a user