Delays and logs refactored as middlewares.
This commit is contained in:
parent
514fe2e8d2
commit
f88b88986c
39
geziyor.go
39
geziyor.go
@ -11,13 +11,10 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
"math/rand"
|
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/cookiejar"
|
"net/http/cookiejar"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Exporter interface is for extracting data to external resources
|
// Exporter interface is for extracting data to external resources
|
||||||
@ -42,11 +39,6 @@ type Geziyor struct {
|
|||||||
responseMiddlewares []ResponseMiddleware
|
responseMiddlewares []ResponseMiddleware
|
||||||
}
|
}
|
||||||
|
|
||||||
func init() {
|
|
||||||
log.SetOutput(os.Stdout)
|
|
||||||
rand.Seed(time.Now().UnixNano())
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewGeziyor creates new Geziyor with default values.
|
// NewGeziyor creates new Geziyor with default values.
|
||||||
// If options provided, options
|
// If options provided, options
|
||||||
func NewGeziyor(opt *Options) *Geziyor {
|
func NewGeziyor(opt *Options) *Geziyor {
|
||||||
@ -58,6 +50,8 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
allowedDomainsMiddleware,
|
allowedDomainsMiddleware,
|
||||||
duplicateRequestsMiddleware,
|
duplicateRequestsMiddleware,
|
||||||
defaultHeadersMiddleware,
|
defaultHeadersMiddleware,
|
||||||
|
delayMiddleware,
|
||||||
|
logMiddleware,
|
||||||
},
|
},
|
||||||
responseMiddlewares: []ResponseMiddleware{
|
responseMiddlewares: []ResponseMiddleware{
|
||||||
parseHTMLMiddleware,
|
parseHTMLMiddleware,
|
||||||
@ -168,6 +162,8 @@ func (g *Geziyor) Do(req *Request, callback func(g *Geziyor, r *Response)) {
|
|||||||
|
|
||||||
// do sends an HTTP request
|
// do sends an HTTP request
|
||||||
func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
|
func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
|
||||||
|
g.acquireSem(req)
|
||||||
|
defer g.releaseSem(req)
|
||||||
defer g.wg.Done()
|
defer g.wg.Done()
|
||||||
defer recoverMiddleware()
|
defer recoverMiddleware()
|
||||||
|
|
||||||
@ -205,12 +201,6 @@ func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
|
func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
|
||||||
g.acquireSem(req)
|
|
||||||
defer g.releaseSem(req)
|
|
||||||
|
|
||||||
g.delay()
|
|
||||||
|
|
||||||
log.Println("Fetching: ", req.URL.String())
|
|
||||||
|
|
||||||
// Do request
|
// Do request
|
||||||
resp, err := g.Client.Do(req.Request)
|
resp, err := g.Client.Do(req.Request)
|
||||||
@ -251,18 +241,13 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
|
func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
|
||||||
g.acquireSem(req)
|
|
||||||
defer g.releaseSem(req)
|
|
||||||
|
|
||||||
g.delay()
|
|
||||||
|
|
||||||
ctx, cancel := chromedp.NewContext(context.Background())
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
var body string
|
var body string
|
||||||
var reqID network.RequestID
|
var reqID network.RequestID
|
||||||
var res *network.Response
|
var res *network.Response
|
||||||
|
|
||||||
|
ctx, cancel := chromedp.NewContext(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
if err := chromedp.Run(ctx,
|
if err := chromedp.Run(ctx,
|
||||||
network.Enable(),
|
network.Enable(),
|
||||||
network.SetExtraHTTPHeaders(network.Headers(internal.ConvertHeaderToMap(req.Header))),
|
network.SetExtraHTTPHeaders(network.Headers(internal.ConvertHeaderToMap(req.Header))),
|
||||||
@ -339,13 +324,3 @@ func (g *Geziyor) releaseSem(req *Request) {
|
|||||||
<-g.semHosts.hostSems[req.Host]
|
<-g.semHosts.hostSems[req.Host]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (g *Geziyor) delay() {
|
|
||||||
if g.Opt.RequestDelayRandomize {
|
|
||||||
min := float64(g.Opt.RequestDelay) * 0.5
|
|
||||||
max := float64(g.Opt.RequestDelay) * 1.5
|
|
||||||
time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
|
|
||||||
} else {
|
|
||||||
time.Sleep(g.Opt.RequestDelay)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -5,7 +5,10 @@ import (
|
|||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"github.com/geziyor/geziyor/internal"
|
"github.com/geziyor/geziyor/internal"
|
||||||
"log"
|
"log"
|
||||||
|
"math/rand"
|
||||||
|
"os"
|
||||||
"runtime/debug"
|
"runtime/debug"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// RequestMiddleware is called before a request is made.
|
// RequestMiddleware is called before a request is made.
|
||||||
@ -15,6 +18,11 @@ type RequestMiddleware func(g *Geziyor, r *Request)
|
|||||||
// ResponseMiddleware is called after a response has been received.
|
// ResponseMiddleware is called after a response has been received.
|
||||||
type ResponseMiddleware func(g *Geziyor, r *Response)
|
type ResponseMiddleware func(g *Geziyor, r *Response)
|
||||||
|
|
||||||
|
// init applies package-wide defaults: math/rand is seeded from the current
// time in nanoseconds, and the standard logger is pointed at stdout.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)

	log.SetOutput(os.Stdout)
}
|
||||||
|
|
||||||
// recoverMiddleware recovers scraping being crashed.
|
// recoverMiddleware recovers scraping being crashed.
|
||||||
// Logs error and stack trace
|
// Logs error and stack trace
|
||||||
func recoverMiddleware() {
|
func recoverMiddleware() {
|
||||||
@ -51,6 +59,22 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) {
|
|||||||
r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
|
r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// delayMiddleware delays requests
|
||||||
|
func delayMiddleware(g *Geziyor, r *Request) {
|
||||||
|
if g.Opt.RequestDelayRandomize {
|
||||||
|
min := float64(g.Opt.RequestDelay) * 0.5
|
||||||
|
max := float64(g.Opt.RequestDelay) * 1.5
|
||||||
|
time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
|
||||||
|
} else {
|
||||||
|
time.Sleep(g.Opt.RequestDelay)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// logMiddleware logs requests
|
||||||
|
func logMiddleware(g *Geziyor, r *Request) {
|
||||||
|
log.Println("Fetching: ", r.URL.String())
|
||||||
|
}
|
||||||
|
|
||||||
// parseHTMLMiddleware parses response if response is HTML
|
// parseHTMLMiddleware parses response if response is HTML
|
||||||
func parseHTMLMiddleware(g *Geziyor, r *Response) {
|
func parseHTMLMiddleware(g *Geziyor, r *Response) {
|
||||||
if !g.Opt.ParseHTMLDisabled && r.isHTML() {
|
if !g.Opt.ParseHTMLDisabled && r.isHTML() {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user