Request delays support added
This commit is contained in:
parent
2263108838
commit
b973c1c064
@ -8,7 +8,8 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
|
|||||||
- 1.000+ Requests/Sec
|
- 1.000+ Requests/Sec
|
||||||
- Caching (Memory/Disk)
|
- Caching (Memory/Disk)
|
||||||
- Automatic Data Exporting
|
- Automatic Data Exporting
|
||||||
- Limit Concurrency Global/Per Domain
|
- Limit Concurrency (Global/Per Domain)
|
||||||
|
- Request Delays (Constant/Randomized)
|
||||||
- Automatic response decoding to UTF-8
|
- Automatic response decoding to UTF-8
|
||||||
|
|
||||||
|
|
||||||
|
11
geziyor.go
11
geziyor.go
@ -7,6 +7,7 @@ import (
|
|||||||
"golang.org/x/net/html/charset"
|
"golang.org/x/net/html/charset"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
|
"math/rand"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
@ -30,6 +31,7 @@ type Geziyor struct {
|
|||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
log.SetOutput(os.Stdout)
|
log.SetOutput(os.Stdout)
|
||||||
|
rand.Seed(time.Now().UnixNano())
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewGeziyor creates new Geziyor with default values.
|
// NewGeziyor creates new Geziyor with default values.
|
||||||
@ -110,6 +112,15 @@ func (g *Geziyor) Do(req *http.Request) {
|
|||||||
// Acquire Semaphore
|
// Acquire Semaphore
|
||||||
g.acquireSem(req)
|
g.acquireSem(req)
|
||||||
|
|
||||||
|
// Request Delay
|
||||||
|
if g.opt.RequestDelayRandomize {
|
||||||
|
min := float64(g.opt.RequestDelay) * 0.5
|
||||||
|
max := float64(g.opt.RequestDelay) * 1.5
|
||||||
|
time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
|
||||||
|
} else {
|
||||||
|
time.Sleep(g.opt.RequestDelay)
|
||||||
|
}
|
||||||
|
|
||||||
// Log
|
// Log
|
||||||
log.Println("Fetching: ", req.URL.String())
|
log.Println("Fetching: ", req.URL.String())
|
||||||
|
|
||||||
|
@ -5,7 +5,9 @@ import (
|
|||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
"github.com/geziyor/geziyor"
|
"github.com/geziyor/geziyor"
|
||||||
|
"math/rand"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestGeziyor_Simple(t *testing.T) {
|
func TestGeziyor_Simple(t *testing.T) {
|
||||||
@ -69,3 +71,12 @@ func TestGeziyor_Concurrent_Requests(t *testing.T) {
|
|||||||
})
|
})
|
||||||
gez.Start()
|
gez.Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRandomDelay(t *testing.T) {
|
||||||
|
rand.Seed(time.Now().UnixNano())
|
||||||
|
delay := time.Millisecond * 1000
|
||||||
|
min := float64(delay) * 0.5
|
||||||
|
max := float64(delay) * 1.5
|
||||||
|
randomDelay := rand.Intn(int(max-min)) + int(min)
|
||||||
|
fmt.Println(time.Duration(randomDelay))
|
||||||
|
}
|
||||||
|
@ -10,10 +10,13 @@ type Options struct {
|
|||||||
// AllowedDomains is the list of domains that are allowed to make requests
|
// AllowedDomains is the list of domains that are allowed to make requests
|
||||||
// If empty, any domain is allowed
|
// If empty, any domain is allowed
|
||||||
AllowedDomains []string
|
AllowedDomains []string
|
||||||
|
|
||||||
// First requests will be made to this URL array. (Concurrently)
|
// First requests will be made to this URL array. (Concurrently)
|
||||||
StartURLs []string
|
StartURLs []string
|
||||||
|
|
||||||
// ParseFunc is callback of StartURLs response.
|
// ParseFunc is callback of StartURLs response.
|
||||||
ParseFunc func(response *Response)
|
ParseFunc func(response *Response)
|
||||||
|
|
||||||
// Timeout is global request timeout
|
// Timeout is global request timeout
|
||||||
Timeout time.Duration
|
Timeout time.Duration
|
||||||
|
|
||||||
@ -24,10 +27,14 @@ type Options struct {
|
|||||||
|
|
||||||
// Concurrent requests limit
|
// Concurrent requests limit
|
||||||
ConcurrentRequests int
|
ConcurrentRequests int
|
||||||
|
|
||||||
// Concurrent requests per domain limit
|
// Concurrent requests per domain limit
|
||||||
ConcurrentRequestsPerDomain int
|
ConcurrentRequestsPerDomain int
|
||||||
|
|
||||||
// User Agent
|
// User Agent
|
||||||
UserAgent string
|
UserAgent string
|
||||||
|
|
||||||
|
// RequestDelay is the delay to wait before making each request
|
||||||
|
RequestDelay time.Duration
|
||||||
|
// RequestDelayRandomize uses random interval between 0.5 * RequestDelay and 1.5 * RequestDelay
|
||||||
|
RequestDelayRandomize bool
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user