# Patch summary (commentary only; text before the first "diff --git" header is
# not part of the patch itself).
#
# NOTE: the line breaks of this patch appear to have been collapsed into
# spaces (hunk headers and +/- markers sit inline), so it will not apply as-is.
#
# geziyor.go
#   * Drops the "math/rand", "os" and "time" imports together with init()
#     and the Geziyor.delay() method.
#   * NewGeziyor: appends delayMiddleware and logMiddleware after
#     defaultHeadersMiddleware in the middleware list that precedes
#     responseMiddlewares.
#   * do(): now calls g.acquireSem(req) and defers g.releaseSem(req) itself;
#     the same pair is removed from doRequestClient and doRequestChrome, along
#     with their g.delay() calls and the "Fetching: " log line.
#     NOTE(review): the semaphore is therefore held until do() returns, not
#     just for the duration of the doRequest* call - presumably that now
#     includes the callback; confirm this is intended.
#   * doRequestChrome: chromedp.NewContext/cancel is moved below the local
#     var declarations; the statements themselves are unchanged.
#
# middleware.go
#   * Gains the "math/rand", "os" and "time" imports and the init() that sets
#     the log output to os.Stdout and seeds math/rand.
#   * Adds delayMiddleware (body identical to the removed delay()) and
#     logMiddleware (logs "Fetching: " plus the request URL).
#     NOTE(review): with RequestDelayRandomize set and RequestDelay == 0,
#     int(max-min) is 0 and rand.Intn panics for n <= 0; this is carried over
#     unchanged from delay().
diff --git a/geziyor.go b/geziyor.go index 4275b9c..c86d1e0 100644 --- a/geziyor.go +++ b/geziyor.go @@ -11,13 +11,10 @@ import ( "io" "io/ioutil" "log" - "math/rand" "net/http" "net/http/cookiejar" "net/url" - "os" "sync" - "time" ) // Exporter interface is for extracting data to external resources @@ -42,11 +39,6 @@ type Geziyor struct { responseMiddlewares []ResponseMiddleware } -func init() { - log.SetOutput(os.Stdout) - rand.Seed(time.Now().UnixNano()) -} - // NewGeziyor creates new Geziyor with default values. // If options provided, options func NewGeziyor(opt *Options) *Geziyor { @@ -58,6 +50,8 @@ func NewGeziyor(opt *Options) *Geziyor { allowedDomainsMiddleware, duplicateRequestsMiddleware, defaultHeadersMiddleware, + delayMiddleware, + logMiddleware, }, responseMiddlewares: []ResponseMiddleware{ parseHTMLMiddleware, @@ -168,6 +162,8 @@ func (g *Geziyor) Do(req *Request, callback func(g *Geziyor, r *Response)) { // Do sends an HTTP request func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) { + g.acquireSem(req) + defer g.releaseSem(req) defer g.wg.Done() defer recoverMiddleware() @@ -205,12 +201,6 @@ func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) { } func (g *Geziyor) doRequestClient(req *Request) (*Response, error) { - g.acquireSem(req) - defer g.releaseSem(req) - - g.delay() - - log.Println("Fetching: ", req.URL.String()) // Do request resp, err := g.Client.Do(req.Request) @@ -251,18 +241,13 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) { } func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) { - g.acquireSem(req) - defer g.releaseSem(req) - - g.delay() - - ctx, cancel := chromedp.NewContext(context.Background()) - defer cancel() - var body string var reqID network.RequestID var res *network.Response + ctx, cancel := chromedp.NewContext(context.Background()) + defer cancel() + if err := chromedp.Run(ctx, network.Enable(), 
network.SetExtraHTTPHeaders(network.Headers(internal.ConvertHeaderToMap(req.Header))), @@ -339,13 +324,3 @@ func (g *Geziyor) releaseSem(req *Request) { <-g.semHosts.hostSems[req.Host] } } - -func (g *Geziyor) delay() { - if g.Opt.RequestDelayRandomize { - min := float64(g.Opt.RequestDelay) * 0.5 - max := float64(g.Opt.RequestDelay) * 1.5 - time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min))) - } else { - time.Sleep(g.Opt.RequestDelay) - } -} diff --git a/middleware.go b/middleware.go index 3dcbf59..cd7cd32 100644 --- a/middleware.go +++ b/middleware.go @@ -5,7 +5,10 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/geziyor/geziyor/internal" "log" + "math/rand" + "os" "runtime/debug" + "time" ) // RequestMiddleware called before requests made. @@ -15,6 +18,11 @@ type RequestMiddleware func(g *Geziyor, r *Request) // ResponseMiddleware called after request response receive type ResponseMiddleware func(g *Geziyor, r *Response) +func init() { + log.SetOutput(os.Stdout) + rand.Seed(time.Now().UnixNano()) +} + // recoverMiddleware recovers scraping being crashed. // Logs error and stack trace func recoverMiddleware() { @@ -51,6 +59,22 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) { r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent) } +// delayMiddleware delays requests +func delayMiddleware(g *Geziyor, r *Request) { + if g.Opt.RequestDelayRandomize { + min := float64(g.Opt.RequestDelay) * 0.5 + max := float64(g.Opt.RequestDelay) * 1.5 + time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min))) + } else { + time.Sleep(g.Opt.RequestDelay) + } +} + +// logMiddleware logs requests +func logMiddleware(g *Geziyor, r *Request) { + log.Println("Fetching: ", r.URL.String()) +} + // parseHTMLMiddleware parses response if response is HTML func parseHTMLMiddleware(g *Geziyor, r *Response) { if !g.Opt.ParseHTMLDisabled && r.isHTML() {