diff --git a/README.md b/README.md index 9535f56..cdb0870 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,8 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl - 1.000+ Requests/Sec - Caching (Memory/Disk) - Automatic Data Exporting -- Limit Concurrency Global/Per Domain +- Limit Concurrency (Global/Per Domain) +- Request Delays (Constant/Randomized) - Automatic response decoding to UTF-8 diff --git a/geziyor.go b/geziyor.go index 5815a8b..8088fad 100644 --- a/geziyor.go +++ b/geziyor.go @@ -7,6 +7,7 @@ import ( "golang.org/x/net/html/charset" "io/ioutil" "log" + "math/rand" "net/http" "net/url" "os" @@ -30,6 +31,7 @@ type Geziyor struct { func init() { log.SetOutput(os.Stdout) + rand.Seed(time.Now().UnixNano()) } // NewGeziyor creates new Geziyor with default values. @@ -110,6 +112,15 @@ func (g *Geziyor) Do(req *http.Request) { // Acquire Semaphore g.acquireSem(req) + // Request Delay + if g.opt.RequestDelayRandomize { + min := float64(g.opt.RequestDelay) * 0.5 + max := float64(g.opt.RequestDelay) * 1.5 + time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min))) + } else { + time.Sleep(g.opt.RequestDelay) + } + // Log log.Println("Fetching: ", req.URL.String()) diff --git a/geziyor_test.go b/geziyor_test.go index 16c5117..3fe1f93 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -5,7 +5,9 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/fpfeng/httpcache" "github.com/geziyor/geziyor" + "math/rand" "testing" + "time" ) func TestGeziyor_Simple(t *testing.T) { @@ -69,3 +71,12 @@ func TestGeziyor_Concurrent_Requests(t *testing.T) { }) gez.Start() } + +func TestRandomDelay(t *testing.T) { + rand.Seed(time.Now().UnixNano()) + delay := time.Millisecond * 1000 + min := float64(delay) * 0.5 + max := float64(delay) * 1.5 + randomDelay := rand.Intn(int(max-min)) + int(min) + fmt.Println(time.Duration(randomDelay)) +} diff --git a/options.go b/options.go index ecb6623..b51445d 100644 --- a/options.go +++ b/options.go @@ -10,10 
+10,13 @@ type Options struct { // AllowedDomains is domains that are allowed to make requests // If empty, any domain is allowed AllowedDomains []string + // First requests will be made to this URL array. (Concurrently) StartURLs []string + // ParseFunc is callback of StartURLs response. ParseFunc func(response *Response) + // Timeout is global request timeout Timeout time.Duration @@ -24,10 +27,14 @@ type Options struct { // Concurrent requests limit ConcurrentRequests int - // Concurrent requests per domain limit ConcurrentRequestsPerDomain int // User Agent UserAgent string + + // RequestDelay is the duration to wait before each request RequestDelay time.Duration + // RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay RequestDelayRandomize bool }