Delays and logs refactored as middlewares.

This commit is contained in:
Musab Gültekin 2019-06-20 09:54:30 +03:00
parent 514fe2e8d2
commit f88b88986c
2 changed files with 31 additions and 32 deletions

View File

@ -11,13 +11,10 @@ import (
"io" "io"
"io/ioutil" "io/ioutil"
"log" "log"
"math/rand"
"net/http" "net/http"
"net/http/cookiejar" "net/http/cookiejar"
"net/url" "net/url"
"os"
"sync" "sync"
"time"
) )
// Exporter interface is for extracting data to external resources // Exporter interface is for extracting data to external resources
@ -42,11 +39,6 @@ type Geziyor struct {
responseMiddlewares []ResponseMiddleware responseMiddlewares []ResponseMiddleware
} }
// init sets package-wide defaults when the package is loaded: it seeds the
// global math/rand source from the current time, so pseudo-random values
// differ between runs, and routes the standard logger to stdout.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
	log.SetOutput(os.Stdout)
}
// NewGeziyor creates new Geziyor with default values. // NewGeziyor creates new Geziyor with default values.
// If options provided, options // If options provided, options
func NewGeziyor(opt *Options) *Geziyor { func NewGeziyor(opt *Options) *Geziyor {
@ -58,6 +50,8 @@ func NewGeziyor(opt *Options) *Geziyor {
allowedDomainsMiddleware, allowedDomainsMiddleware,
duplicateRequestsMiddleware, duplicateRequestsMiddleware,
defaultHeadersMiddleware, defaultHeadersMiddleware,
delayMiddleware,
logMiddleware,
}, },
responseMiddlewares: []ResponseMiddleware{ responseMiddlewares: []ResponseMiddleware{
parseHTMLMiddleware, parseHTMLMiddleware,
@ -168,6 +162,8 @@ func (g *Geziyor) Do(req *Request, callback func(g *Geziyor, r *Response)) {
// Do sends an HTTP request // Do sends an HTTP request
func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) { func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
g.acquireSem(req)
defer g.releaseSem(req)
defer g.wg.Done() defer g.wg.Done()
defer recoverMiddleware() defer recoverMiddleware()
@ -205,12 +201,6 @@ func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
} }
func (g *Geziyor) doRequestClient(req *Request) (*Response, error) { func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
g.acquireSem(req)
defer g.releaseSem(req)
g.delay()
log.Println("Fetching: ", req.URL.String())
// Do request // Do request
resp, err := g.Client.Do(req.Request) resp, err := g.Client.Do(req.Request)
@ -251,18 +241,13 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
} }
func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) { func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
g.acquireSem(req)
defer g.releaseSem(req)
g.delay()
ctx, cancel := chromedp.NewContext(context.Background())
defer cancel()
var body string var body string
var reqID network.RequestID var reqID network.RequestID
var res *network.Response var res *network.Response
ctx, cancel := chromedp.NewContext(context.Background())
defer cancel()
if err := chromedp.Run(ctx, if err := chromedp.Run(ctx,
network.Enable(), network.Enable(),
network.SetExtraHTTPHeaders(network.Headers(internal.ConvertHeaderToMap(req.Header))), network.SetExtraHTTPHeaders(network.Headers(internal.ConvertHeaderToMap(req.Header))),
@ -339,13 +324,3 @@ func (g *Geziyor) releaseSem(req *Request) {
<-g.semHosts.hostSems[req.Host] <-g.semHosts.hostSems[req.Host]
} }
} }
// delay pauses the calling goroutine before a request is sent.
//
// With Opt.RequestDelayRandomize unset it sleeps for exactly Opt.RequestDelay.
// With it set, it sleeps for a duration drawn uniformly from
// [0.5*RequestDelay, 1.5*RequestDelay).
//
// A zero or negative RequestDelay never randomizes: rand.Int63n panics on a
// non-positive bound, and time.Sleep already returns immediately for such
// durations.
func (g *Geziyor) delay() {
	d := g.Opt.RequestDelay
	if g.Opt.RequestDelayRandomize && d > 0 {
		// Int63n keeps the arithmetic in 64 bits, so delays longer than about
		// two seconds (in nanoseconds) do not overflow int on 32-bit platforms.
		d = d/2 + time.Duration(rand.Int63n(int64(d)))
	}
	time.Sleep(d)
}

View File

@ -5,7 +5,10 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/internal" "github.com/geziyor/geziyor/internal"
"log" "log"
"math/rand"
"os"
"runtime/debug" "runtime/debug"
"time"
) )
// RequestMiddleware is called before a request is made. // RequestMiddleware is called before a request is made.
@ -15,6 +18,11 @@ type RequestMiddleware func(g *Geziyor, r *Request)
// ResponseMiddleware is called after a response is received. // ResponseMiddleware is called after a response is received.
type ResponseMiddleware func(g *Geziyor, r *Response) type ResponseMiddleware func(g *Geziyor, r *Response)
// init sets package-wide defaults when the package is loaded: it seeds the
// global math/rand source from the current time, so pseudo-random values
// differ between runs, and routes the standard logger to stdout.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
	log.SetOutput(os.Stdout)
}
// recoverMiddleware recovers from panics so that scraping does not crash. // recoverMiddleware recovers from panics so that scraping does not crash.
// Logs error and stack trace // Logs error and stack trace
func recoverMiddleware() { func recoverMiddleware() {
@ -51,6 +59,22 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) {
r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent) r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
} }
// delayMiddleware is a request middleware that pauses the calling goroutine
// before the request proceeds. The request r itself is not inspected.
//
// With g.Opt.RequestDelayRandomize unset it sleeps for exactly
// g.Opt.RequestDelay. With it set, it sleeps for a duration drawn uniformly
// from [0.5*RequestDelay, 1.5*RequestDelay).
//
// A zero or negative RequestDelay never randomizes: rand.Int63n panics on a
// non-positive bound, and time.Sleep already returns immediately for such
// durations.
func delayMiddleware(g *Geziyor, r *Request) {
	d := g.Opt.RequestDelay
	if g.Opt.RequestDelayRandomize && d > 0 {
		// Int63n keeps the arithmetic in 64 bits, so delays longer than about
		// two seconds (in nanoseconds) do not overflow int on 32-bit platforms.
		d = d/2 + time.Duration(rand.Int63n(int64(d)))
	}
	time.Sleep(d)
}
// logMiddleware is a request middleware that writes one line to the standard
// logger naming the URL of the request. g is unused.
func logMiddleware(g *Geziyor, r *Request) {
	target := r.URL.String()
	log.Println("Fetching: ", target)
}
// parseHTMLMiddleware parses response if response is HTML // parseHTMLMiddleware parses response if response is HTML
func parseHTMLMiddleware(g *Geziyor, r *Response) { func parseHTMLMiddleware(g *Geziyor, r *Response) {
if !g.Opt.ParseHTMLDisabled && r.isHTML() { if !g.Opt.ParseHTMLDisabled && r.isHTML() {