Request delays support added
This commit is contained in:
parent
2263108838
commit
b973c1c064
@ -8,7 +8,8 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
|
||||
- 1.000+ Requests/Sec
|
||||
- Caching (Memory/Disk)
|
||||
- Automatic Data Exporting
|
||||
- Limit Concurrency Global/Per Domain
|
||||
- Limit Concurrency (Global/Per Domain)
|
||||
- Request Delays (Constant/Randomized)
|
||||
- Automatic response decoding to UTF-8
|
||||
|
||||
|
||||
|
11
geziyor.go
11
geziyor.go
@ -7,6 +7,7 @@ import (
|
||||
"golang.org/x/net/html/charset"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
@ -30,6 +31,7 @@ type Geziyor struct {
|
||||
|
||||
// init applies package-wide defaults: it seeds the shared math/rand source
// from the current time and sends the standard logger's output to stdout.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)

	log.SetOutput(os.Stdout)
}
|
||||
|
||||
// NewGeziyor creates new Geziyor with default values.
|
||||
@ -110,6 +112,15 @@ func (g *Geziyor) Do(req *http.Request) {
|
||||
// Acquire Semaphore
|
||||
g.acquireSem(req)
|
||||
|
||||
// Request Delay
|
||||
if g.opt.RequestDelayRandomize {
|
||||
min := float64(g.opt.RequestDelay) * 0.5
|
||||
max := float64(g.opt.RequestDelay) * 1.5
|
||||
time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
|
||||
} else {
|
||||
time.Sleep(g.opt.RequestDelay)
|
||||
}
|
||||
|
||||
// Log
|
||||
log.Println("Fetching: ", req.URL.String())
|
||||
|
||||
|
@ -5,7 +5,9 @@ import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/fpfeng/httpcache"
|
||||
"github.com/geziyor/geziyor"
|
||||
"math/rand"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestGeziyor_Simple(t *testing.T) {
|
||||
@ -69,3 +71,12 @@ func TestGeziyor_Concurrent_Requests(t *testing.T) {
|
||||
})
|
||||
gez.Start()
|
||||
}
|
||||
|
||||
// TestRandomDelay checks that a randomized delay derived from a base delay
// always lies in the half-open interval [0.5*delay, 1.5*delay).
func TestRandomDelay(t *testing.T) {
	rand.Seed(time.Now().UnixNano())

	delay := time.Millisecond * 1000
	lower := float64(delay) * 0.5
	upper := float64(delay) * 1.5

	var sample time.Duration
	// Draw repeatedly so an out-of-range result is unlikely to slip through.
	for i := 0; i < 1000; i++ {
		sample = time.Duration(rand.Intn(int(upper-lower)) + int(lower))
		if sample < time.Duration(lower) || sample >= time.Duration(upper) {
			t.Fatalf("random delay %v outside [%v, %v)", sample, time.Duration(lower), time.Duration(upper))
		}
	}
	fmt.Println(sample)
}
|
||||
|
@ -10,10 +10,13 @@ type Options struct {
|
||||
// AllowedDomains is the list of domains that requests are allowed to be made to
|
||||
// If empty, any domain is allowed
|
||||
AllowedDomains []string
|
||||
|
||||
// StartURLs are the URLs the first requests will be made to (concurrently).
|
||||
StartURLs []string
|
||||
|
||||
// ParseFunc is callback of StartURLs response.
|
||||
ParseFunc func(response *Response)
|
||||
|
||||
// Timeout is global request timeout
|
||||
Timeout time.Duration
|
||||
|
||||
@ -24,10 +27,14 @@ type Options struct {
|
||||
|
||||
// Concurrent requests limit
|
||||
ConcurrentRequests int
|
||||
|
||||
// Concurrent requests per domain limit
|
||||
ConcurrentRequestsPerDomain int
|
||||
|
||||
// User Agent
|
||||
UserAgent string
|
||||
|
||||
// Request delays
|
||||
RequestDelay time.Duration
|
||||
// RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
|
||||
RequestDelayRandomize bool
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user