Middlewares refactored to multiple files in middleware package.

Extractors removed because they add complexity to the scraper, both for learning it and for developing it.
This commit is contained in:
Musab Gültekin
2019-07-04 21:04:29 +03:00
parent 9adff75509
commit 2cab68d2ce
19 changed files with 202 additions and 304 deletions

View File

@ -0,0 +1,19 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal"
)
// AllowedDomains is a request middleware that checks whether the request
// host exists in its AllowedDomains list.
type AllowedDomains struct {
// AllowedDomains lists the hosts requests may target. ProcessRequest skips
// the check entirely when the list is empty.
AllowedDomains []string
}
// ProcessRequest cancels r when an allow-list is configured and
// internal.Contains does not report r.Host as part of it. With an empty
// allow-list every request passes through untouched.
func (a *AllowedDomains) ProcessRequest(r *client.Request) {
	if len(a.AllowedDomains) == 0 {
		return
	}
	if internal.Contains(a.AllowedDomains, r.Host) {
		return
	}
	// Host is not on the allow-list.
	r.Cancel()
}

30
middleware/delay.go Normal file
View File

@ -0,0 +1,30 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"math/rand"
"time"
)
// delay is a request middleware that sleeps before each request, either for
// a fixed duration or for a randomized one derived from it.
type delay struct {
// requestDelayRandomize selects the randomized sleep in ProcessRequest.
requestDelayRandomize bool
// requestDelay is the fixed sleep, or the base value from which the
// randomized sleep (0.5x up to 1.5x of it) is computed.
requestDelay time.Duration
}
// NewDelay builds the delay middleware and returns it as a RequestProcessor.
// When requestDelayRandomize is true, the global math/rand source is seeded
// from the current time before returning.
func NewDelay(requestDelayRandomize bool, requestDelay time.Duration) RequestProcessor {
	d := &delay{
		requestDelayRandomize: requestDelayRandomize,
		requestDelay:          requestDelay,
	}
	if d.requestDelayRandomize {
		rand.Seed(time.Now().UnixNano())
	}
	return d
}
// ProcessRequest sleeps before the request proceeds.
//
// With randomization off it sleeps for exactly requestDelay. With it on, the
// sleep is drawn from [0.5*requestDelay, 1.5*requestDelay). r is neither read
// nor modified.
func (a *delay) ProcessRequest(r *client.Request) {
	wait := a.requestDelay
	if a.requestDelayRandomize {
		lo := float64(a.requestDelay) * 0.5
		hi := float64(a.requestDelay) * 1.5
		// rand.Intn panics when its argument is <= 0, which is the case for
		// a zero or negative requestDelay; keep the plain delay then.
		if span := int(hi - lo); span > 0 {
			wait = time.Duration(rand.Intn(span) + int(lo))
		}
	}
	time.Sleep(wait)
}

19
middleware/delay_test.go Normal file
View File

@ -0,0 +1,19 @@
package middleware
import (
"github.com/stretchr/testify/assert"
"math/rand"
"testing"
"time"
)
// TestRandomDelay checks that the randomized delay formula, which mirrors the
// one in the delay middleware, stays within [0.5s, 1.5s) for a 1s base delay.
func TestRandomDelay(t *testing.T) {
	rand.Seed(time.Now().UnixNano())
	delay := time.Millisecond * 1000
	min := float64(delay) * 0.5
	max := float64(delay) * 1.5
	randomDelay := rand.Intn(int(max-min)) + int(min)
	assert.True(t, time.Duration(randomDelay).Seconds() < 1.5)
	// rand.Intn may return 0, so the lower bound is inclusive.
	assert.True(t, time.Duration(randomDelay).Seconds() >= 0.5)
}

View File

@ -0,0 +1,21 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"sync"
)
// DuplicateRequests is a request middleware that checks for already visited
// URLs and cancels repeated GET requests.
type DuplicateRequests struct {
// RevisitEnabled turns the duplicate check off when true.
RevisitEnabled bool
// visitedURLs records the URL strings of the GET requests seen so far.
visitedURLs sync.Map
}
// ProcessRequest records the URL of each GET request and cancels r when that
// URL was already recorded. It does nothing when RevisitEnabled is set or the
// request method is not GET.
func (a *DuplicateRequests) ProcessRequest(r *client.Request) {
	if a.RevisitEnabled || r.Request.Method != "GET" {
		return
	}
	key := r.Request.URL.String()
	_, seen := a.visitedURLs.LoadOrStore(key, struct{}{})
	if seen {
		// URL was stored by an earlier request.
		r.Cancel()
	}
}

17
middleware/headers.go Normal file
View File

@ -0,0 +1,17 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
)
// Headers is a request middleware that sets default request headers.
type Headers struct {
// UserAgent is the value ProcessRequest passes as the default for the
// User-Agent header.
UserAgent string
}
// ProcessRequest runs r.Header through client.SetDefaultHeader for Accept,
// Accept-Charset, Accept-Language and User-Agent, in that order, storing the
// returned header back on r after each call.
func (a *Headers) ProcessRequest(r *client.Request) {
	defaults := []struct {
		key   string
		value string
	}{
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
		{"Accept-Charset", "utf-8"},
		{"Accept-Language", "en"},
		{"User-Agent", a.UserAgent},
	}
	for _, d := range defaults {
		r.Header = client.SetDefaultHeader(r.Header, d.key, d.value)
	}
}

18
middleware/log_stats.go Normal file
View File

@ -0,0 +1,18 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"log"
)
// LogStats is a response middleware that logs each crawled response.
type LogStats struct {
// LogDisabled suppresses the log line written by ProcessResponse when true.
LogDisabled bool
}
// ProcessResponse writes one "Crawled" log line with the status code, request
// method and request URL of r, unless LogDisabled is set.
func (p *LogStats) ProcessResponse(r *client.Response) {
	// The author's original note: this check is not strictly necessary and
	// is kept only for performance reasons.
	if p.LogDisabled {
		return
	}
	method, target := r.Request.Method, r.Request.URL.String()
	log.Printf("Crawled: (%d) <%s %s>", r.StatusCode, method, target)
}

19
middleware/metrics.go Normal file
View File

@ -0,0 +1,19 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/metrics"
)
// Metrics is a request/response middleware that updates request and response
// counters on the wrapped metrics.Metrics.
type Metrics struct {
// Metrics holds the RequestCounter and ResponseCounter incremented by
// ProcessRequest and ProcessResponse; both dereference it without a nil
// check.
Metrics *metrics.Metrics
}
// ProcessRequest adds 1 to RequestCounter, selected with the "method" label
// set to the method of r.
func (a *Metrics) ProcessRequest(r *client.Request) {
	counter := a.Metrics.RequestCounter.With("method", r.Method)
	counter.Add(1)
}
// ProcessResponse adds 1 to ResponseCounter, selected with the "method" label
// set to the method of the request that produced r.
func (a *Metrics) ProcessResponse(r *client.Response) {
	counter := a.Metrics.ResponseCounter.With("method", r.Request.Method)
	counter.Add(1)
}

21
middleware/middleware.go Normal file
View File

@ -0,0 +1,21 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
)
// RequestResponseProcessor combines RequestProcessor and ResponseProcessor
// for middlewares that implement both ProcessRequest and ProcessResponse.
type RequestResponseProcessor interface {
RequestProcessor
ResponseProcessor
}
// RequestProcessor is called before requests are made.
// To cancel a request, call r.Cancel(), as the middlewares in this package do.
// NOTE(review): this comment previously said to set request.Cancelled = true;
// confirm whether that field still exists on client.Request.
type RequestProcessor interface {
ProcessRequest(r *client.Request)
}
// ResponseProcessor is called after the response to a request has been
// received.
type ResponseProcessor interface {
ProcessResponse(r *client.Response)
}

18
middleware/parse_html.go Normal file
View File

@ -0,0 +1,18 @@
package middleware
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/client"
)
// ParseHTML is a response middleware that parses the response body into a
// goquery document when the response is HTML.
type ParseHTML struct {
// ParseHTMLDisabled makes ProcessResponse skip parsing when true.
ParseHTMLDisabled bool
}
// ProcessResponse stores a goquery document built from r.Body in r.HTMLDoc
// when parsing is enabled and r.IsHTML() reports true. The error returned by
// goquery.NewDocumentFromReader is discarded; r.HTMLDoc receives whatever
// document value that call returned.
func (p *ParseHTML) ProcessResponse(r *client.Response) {
	if p.ParseHTMLDisabled || !r.IsHTML() {
		return
	}
	doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
	r.HTMLDoc = doc
}