Middlewares refactored into multiple files in the middleware package.
Extractors removed because they add complexity to the scraper, both for learning it and for developing it.
This commit is contained in:
19
middleware/allowed_domains.go
Normal file
19
middleware/allowed_domains.go
Normal file
@ -0,0 +1,19 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"github.com/geziyor/geziyor/internal"
|
||||
)
|
||||
|
||||
// AllowedDomains is a request middleware that restricts requests to a
// configured set of hosts.
type AllowedDomains struct {
	// AllowedDomains lists the hosts a request may target. When the list is
	// empty, ProcessRequest lets every request through.
	AllowedDomains []string
}
|
||||
|
||||
func (a *AllowedDomains) ProcessRequest(r *client.Request) {
|
||||
if len(a.AllowedDomains) != 0 && !internal.Contains(a.AllowedDomains, r.Host) {
|
||||
//log.Printf("Domain not allowed: %s\n", req.Host)
|
||||
r.Cancel()
|
||||
return
|
||||
}
|
||||
}
|
30
middleware/delay.go
Normal file
30
middleware/delay.go
Normal file
@ -0,0 +1,30 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"math/rand"
|
||||
"time"
|
||||
)
|
||||
|
||||
// delay is a request middleware that sleeps before each request.
type delay struct {
	// requestDelayRandomize selects a randomized sleep instead of a fixed one.
	requestDelayRandomize bool
	// requestDelay is the fixed sleep, or the base of the randomized range.
	requestDelay time.Duration
}
|
||||
|
||||
func NewDelay(requestDelayRandomize bool, requestDelay time.Duration) RequestProcessor {
|
||||
if requestDelayRandomize {
|
||||
rand.Seed(time.Now().UnixNano())
|
||||
}
|
||||
return &delay{requestDelayRandomize: requestDelayRandomize, requestDelay: requestDelay}
|
||||
}
|
||||
|
||||
func (a *delay) ProcessRequest(r *client.Request) {
|
||||
if a.requestDelayRandomize {
|
||||
min := float64(a.requestDelay) * 0.5
|
||||
max := float64(a.requestDelay) * 1.5
|
||||
time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
|
||||
} else {
|
||||
time.Sleep(a.requestDelay)
|
||||
}
|
||||
}
|
19
middleware/delay_test.go
Normal file
19
middleware/delay_test.go
Normal file
@ -0,0 +1,19 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"math/rand"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestRandomDelay(t *testing.T) {
|
||||
rand.Seed(time.Now().UnixNano())
|
||||
delay := time.Millisecond * 1000
|
||||
min := float64(delay) * 0.5
|
||||
max := float64(delay) * 1.5
|
||||
randomDelay := rand.Intn(int(max-min)) + int(min)
|
||||
|
||||
assert.True(t, time.Duration(randomDelay).Seconds() < 1.5)
|
||||
assert.True(t, time.Duration(randomDelay).Seconds() > 0.5)
|
||||
}
|
21
middleware/duplicate_requests.go
Normal file
21
middleware/duplicate_requests.go
Normal file
@ -0,0 +1,21 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// DuplicateRequests is a request middleware that cancels GET requests whose
// URL has already been seen.
type DuplicateRequests struct {
	// RevisitEnabled turns the duplicate check off when true.
	RevisitEnabled bool
	// visitedURLs records the URL strings of the GET requests seen so far.
	visitedURLs sync.Map
}
|
||||
|
||||
func (a *DuplicateRequests) ProcessRequest(r *client.Request) {
|
||||
if !a.RevisitEnabled && r.Request.Method == "GET" {
|
||||
if _, visited := a.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
|
||||
//log.Printf("URL already visited %s\n")
|
||||
r.Cancel()
|
||||
}
|
||||
}
|
||||
}
|
17
middleware/headers.go
Normal file
17
middleware/headers.go
Normal file
@ -0,0 +1,17 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"github.com/geziyor/geziyor/client"
|
||||
)
|
||||
|
||||
// Headers is a request middleware that applies default request headers.
type Headers struct {
	// UserAgent is the value applied as the default "User-Agent" header.
	UserAgent string
}
|
||||
|
||||
func (a *Headers) ProcessRequest(r *client.Request) {
|
||||
r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
|
||||
r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
|
||||
r.Header = client.SetDefaultHeader(r.Header, "User-Agent", a.UserAgent)
|
||||
}
|
18
middleware/log_stats.go
Normal file
18
middleware/log_stats.go
Normal file
@ -0,0 +1,18 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"log"
|
||||
)
|
||||
|
||||
// LogStats is a response middleware that logs each crawled response.
type LogStats struct {
	// LogDisabled suppresses the log line when true.
	LogDisabled bool
}
|
||||
|
||||
func (p *LogStats) ProcessResponse(r *client.Response) {
|
||||
// LogDisabled check is not necessary, but done here for performance reasons
|
||||
if !p.LogDisabled {
|
||||
log.Printf("Crawled: (%d) <%s %s>", r.StatusCode, r.Request.Method, r.Request.URL.String())
|
||||
}
|
||||
}
|
19
middleware/metrics.go
Normal file
19
middleware/metrics.go
Normal file
@ -0,0 +1,19 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"github.com/geziyor/geziyor/metrics"
|
||||
)
|
||||
|
||||
// Metrics sets stats for request and responses
|
||||
type Metrics struct {
|
||||
Metrics *metrics.Metrics
|
||||
}
|
||||
|
||||
func (a *Metrics) ProcessRequest(r *client.Request) {
|
||||
a.Metrics.RequestCounter.With("method", r.Method).Add(1)
|
||||
}
|
||||
|
||||
func (a *Metrics) ProcessResponse(r *client.Response) {
|
||||
a.Metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
|
||||
}
|
21
middleware/middleware.go
Normal file
21
middleware/middleware.go
Normal file
@ -0,0 +1,21 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"github.com/geziyor/geziyor/client"
|
||||
)
|
||||
|
||||
type RequestResponseProcessor interface {
|
||||
RequestProcessor
|
||||
ResponseProcessor
|
||||
}
|
||||
|
||||
// RequestProcessor called before requests made.
|
||||
// Set request.Cancelled = true to cancel request
|
||||
type RequestProcessor interface {
|
||||
ProcessRequest(r *client.Request)
|
||||
}
|
||||
|
||||
// ResponseProcessor called after request response receive
|
||||
type ResponseProcessor interface {
|
||||
ProcessResponse(r *client.Response)
|
||||
}
|
18
middleware/parse_html.go
Normal file
18
middleware/parse_html.go
Normal file
@ -0,0 +1,18 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/geziyor/geziyor/client"
|
||||
)
|
||||
|
||||
// ParseHTML is a response middleware that parses HTML responses into a
// goquery document.
type ParseHTML struct {
	// ParseHTMLDisabled skips parsing when true.
	ParseHTMLDisabled bool
}
|
||||
|
||||
func (p *ParseHTML) ProcessResponse(r *client.Response) {
|
||||
if !p.ParseHTMLDisabled && r.IsHTML() {
|
||||
r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user