Request delays support added

This commit is contained in:
Musab Gültekin 2019-06-09 14:24:53 +03:00
parent 2263108838
commit b973c1c064
4 changed files with 32 additions and 2 deletions

View File

@ -8,7 +8,8 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
- 1.000+ Requests/Sec
- Caching (Memory/Disk)
- Automatic Data Exporting
- Limit Concurrency Global/Per Domain
- Limit Concurrency (Global/Per Domain)
- Request Delays (Constant/Randomized)
- Automatic response decoding to UTF-8

View File

@ -7,6 +7,7 @@ import (
"golang.org/x/net/html/charset"
"io/ioutil"
"log"
"math/rand"
"net/http"
"net/url"
"os"
@ -30,6 +31,7 @@ type Geziyor struct {
// init prepares package-wide state: it seeds the shared math/rand source from
// the current wall-clock time and points the standard logger at stdout.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
	log.SetOutput(os.Stdout)
}
// NewGeziyor creates new Geziyor with default values.
@ -110,6 +112,15 @@ func (g *Geziyor) Do(req *http.Request) {
// Acquire Semaphore
g.acquireSem(req)
// Request Delay
// Pause before the request is issued. With RequestDelayRandomize the pause is
// drawn uniformly from [0.5*RequestDelay, 1.5*RequestDelay). A non-positive
// RequestDelay means no pause at all; it must never reach the random draw,
// because rand.Int63n panics when its bound is not positive.
if delay := g.opt.RequestDelay; delay > 0 {
	if g.opt.RequestDelayRandomize {
		// The interval is exactly RequestDelay wide, so draw an offset in
		// [0, delay) and add it to the lower bound. Staying in int64 keeps
		// delays above ~2.1s from overflowing int on 32-bit platforms.
		delay = delay/2 + time.Duration(rand.Int63n(int64(delay)))
	}
	time.Sleep(delay)
}
// Log
log.Println("Fetching: ", req.URL.String())

View File

@ -5,7 +5,9 @@ import (
"github.com/PuerkitoBio/goquery"
"github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor"
"math/rand"
"testing"
"time"
)
func TestGeziyor_Simple(t *testing.T) {
@ -69,3 +71,12 @@ func TestGeziyor_Concurrent_Requests(t *testing.T) {
})
gez.Start()
}
// TestRandomDelay verifies that the randomized-delay formula always yields a
// duration inside [0.5*delay, 1.5*delay). It draws many samples so that an
// out-of-range result fails the test instead of merely being printed.
func TestRandomDelay(t *testing.T) {
	rand.Seed(time.Now().UnixNano())
	delay := time.Millisecond * 1000
	lower := float64(delay) * 0.5
	upper := float64(delay) * 1.5

	var randomDelay int
	for i := 0; i < 1000; i++ {
		randomDelay = rand.Intn(int(upper-lower)) + int(lower)
		if randomDelay < int(lower) || randomDelay >= int(upper) {
			t.Fatalf("random delay %v outside [%v, %v)",
				time.Duration(randomDelay), time.Duration(lower), time.Duration(upper))
		}
	}
	// Print the last sample, as before, for manual inspection.
	fmt.Println(time.Duration(randomDelay))
}

View File

@ -10,10 +10,13 @@ type Options struct {
// AllowedDomains is domains that are allowed to make requests
// If empty, any domain is allowed
AllowedDomains []string
// StartURLs are the URLs the first requests are made to (concurrently).
StartURLs []string
// ParseFunc is callback of StartURLs response.
ParseFunc func(response *Response)
// Timeout is global request timeout
Timeout time.Duration
@ -24,10 +27,14 @@ type Options struct {
// Concurrent requests limit
ConcurrentRequests int
// Concurrent requests per domain limit
ConcurrentRequestsPerDomain int
// User Agent
UserAgent string
// RequestDelay is the pause applied before each request is made
RequestDelay time.Duration
// RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
RequestDelayRandomize bool
}