commit
019fe62883
22
README.md
22
README.md
@ -15,6 +15,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
|
|||||||
- Request Delays (Constant/Randomized)
|
- Request Delays (Constant/Randomized)
|
||||||
- Cookies, Middlewares, robots.txt
|
- Cookies, Middlewares, robots.txt
|
||||||
- Automatic response decoding to UTF-8
|
- Automatic response decoding to UTF-8
|
||||||
|
- Proxy management (Single, Round-Robin, Custom)
|
||||||
|
|
||||||
See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
|
See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
|
||||||
|
|
||||||
@ -167,6 +168,27 @@ geziyor.NewGeziyor(&geziyor.Options{
|
|||||||
}).Start()
|
}).Start()
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Proxy - Use proxy per request
|
||||||
|
If you want to send your requests through a single proxy, you can simply set these environment variables:
|
||||||
|
`HTTP_PROXY`
|
||||||
|
`HTTPS_PROXY`
|
||||||
|
And geziyor will use those proxies.
|
||||||
|
|
||||||
|
Alternatively, you can rotate through several proxies, using the next one in order for each request, by setting the `ProxyFunc` option to `client.RoundRobinProxy`,
|
||||||
|
or to any custom proxy selection function. See `client/proxy.go` for an example of how to implement one.
|
||||||
|
|
||||||
|
Proxies can be HTTP, HTTPS and SOCKS5.
|
||||||
|
|
||||||
|
Note: If you use the `http` scheme for a proxy, it will be used for http requests and not for https requests.
|
||||||
|
|
||||||
|
```go
|
||||||
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
|
StartURLs: []string{"http://httpbin.org/anything"},
|
||||||
|
ParseFunc: parseFunc,
|
||||||
|
ProxyFunc: client.RoundRobinProxy("http://some-http-proxy.com", "https://some-https-proxy.com", "socks5://some-socks5-proxy.com"),
|
||||||
|
}).Start()
|
||||||
|
```
|
||||||
|
|
||||||
## Benchmark
|
## Benchmark
|
||||||
|
|
||||||
**8748 requests per second** on *Macbook Pro 15" 2016*
|
**8748 requests per second** on *Macbook Pro 15" 2016*
|
||||||
|
@ -38,6 +38,7 @@ type Options struct {
|
|||||||
RetryHTTPCodes []int
|
RetryHTTPCodes []int
|
||||||
RemoteAllocatorURL string
|
RemoteAllocatorURL string
|
||||||
AllocatorOptions []chromedp.ExecAllocatorOption
|
AllocatorOptions []chromedp.ExecAllocatorOption
|
||||||
|
ProxyFunc func(*http.Request) (*url.URL, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Default values for client
|
// Default values for client
|
||||||
@ -53,9 +54,15 @@ var (
|
|||||||
|
|
||||||
// NewClient creates http.Client with modified values for typical web scraper
|
// NewClient creates http.Client with modified values for typical web scraper
|
||||||
func NewClient(opt *Options) *Client {
|
func NewClient(opt *Options) *Client {
|
||||||
|
// Default proxy function is http.ProxyFromEnvironment; a non-nil opt.ProxyFunc overrides it
|
||||||
|
var proxyFunction = http.ProxyFromEnvironment
|
||||||
|
if opt.ProxyFunc != nil {
|
||||||
|
proxyFunction = opt.ProxyFunc
|
||||||
|
}
|
||||||
|
|
||||||
httpClient := &http.Client{
|
httpClient := &http.Client{
|
||||||
Transport: &http.Transport{
|
Transport: &http.Transport{
|
||||||
Proxy: http.ProxyFromEnvironment,
|
Proxy: proxyFunction,
|
||||||
DialContext: (&net.Dialer{
|
DialContext: (&net.Dialer{
|
||||||
Timeout: 30 * time.Second,
|
Timeout: 30 * time.Second,
|
||||||
KeepAlive: 30 * time.Second,
|
KeepAlive: 30 * time.Second,
|
||||||
|
47
client/proxy.go
Normal file
47
client/proxy.go
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
package client
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"github.com/geziyor/geziyor/internal"
|
||||||
|
"net/http"
|
||||||
|
"net/url"
|
||||||
|
"sync/atomic"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ProxyURLKey is the context key under which GetProxy stores the selected
// proxy URL (as a string) on the outgoing request.
//
// NOTE(review): this is an untyped constant, so the key is stored as a plain
// int. Context keys of built-in types can collide with keys set by other
// packages; consider a dedicated key type in a coordinated follow-up.
const ProxyURLKey = 0

// roundRobinProxy hands out a fixed list of proxy URLs in rotation.
type roundRobinProxy struct {
	// proxyURLs is the list of proxies to rotate through.
	proxyURLs []*url.URL
	// index counts the selections made so far; GetProxy advances it atomically.
	index uint32
}

// GetProxy returns the next proxy URL in the rotation and records it, as a
// string, in the request's context under ProxyURLKey.
//
// It has the signature expected by http.Transport.Proxy. When the rotation
// is empty it returns (nil, nil), which the transport treats as "no proxy".
func (r *roundRobinProxy) GetProxy(pr *http.Request) (*url.URL, error) {
	// An empty rotation would make the modulo below panic with a division
	// by zero, so report "no proxy" instead.
	if len(r.proxyURLs) == 0 {
		return nil, nil
	}

	// AddUint32 returns the incremented value; subtract one so the first
	// call selects element 0.
	index := atomic.AddUint32(&r.index, 1) - 1
	u := r.proxyURLs[index%uint32(len(r.proxyURLs))]

	// Set proxy url to context. The request is overwritten in place so the
	// caller's *http.Request observes the new context.
	ctx := context.WithValue(pr.Context(), ProxyURLKey, u.String())
	*pr = *pr.WithContext(ctx)
	return u, nil
}
|
||||||
|
|
||||||
|
// RoundRobinProxy creates a proxy switcher function which rotates
|
||||||
|
// ProxyURLs on every request.
|
||||||
|
// The proxy type is determined by the URL scheme. "http", "https"
|
||||||
|
// and "socks5" are supported. If the scheme is empty,
|
||||||
|
// "http" is assumed.
|
||||||
|
func RoundRobinProxy(proxyURLs ...string) func(*http.Request) (*url.URL, error) {
|
||||||
|
if len(proxyURLs) < 1 {
|
||||||
|
return http.ProxyFromEnvironment
|
||||||
|
}
|
||||||
|
parsedProxyURLs := make([]*url.URL, len(proxyURLs))
|
||||||
|
for i, u := range proxyURLs {
|
||||||
|
parsedURL, err := url.Parse(u)
|
||||||
|
if err != nil {
|
||||||
|
internal.Logger.Printf("proxy url parse: %v", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
parsedProxyURLs[i] = parsedURL
|
||||||
|
}
|
||||||
|
return (&roundRobinProxy{parsedProxyURLs, 0}).GetProxy
|
||||||
|
}
|
@ -79,6 +79,7 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
RetryHTTPCodes: opt.RetryHTTPCodes,
|
RetryHTTPCodes: opt.RetryHTTPCodes,
|
||||||
RemoteAllocatorURL: opt.BrowserEndpoint,
|
RemoteAllocatorURL: opt.BrowserEndpoint,
|
||||||
AllocatorOptions: chromedp.DefaultExecAllocatorOptions[:],
|
AllocatorOptions: chromedp.DefaultExecAllocatorOptions[:],
|
||||||
|
ProxyFunc: opt.ProxyFunc,
|
||||||
})
|
})
|
||||||
if opt.Cache != nil {
|
if opt.Cache != nil {
|
||||||
geziyor.Client.Transport = &cache.Transport{
|
geziyor.Client.Transport = &cache.Transport{
|
||||||
|
@ -4,6 +4,7 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
"github.com/elazarl/goproxy"
|
||||||
"github.com/fortytw2/leaktest"
|
"github.com/fortytw2/leaktest"
|
||||||
"github.com/geziyor/geziyor"
|
"github.com/geziyor/geziyor"
|
||||||
"github.com/geziyor/geziyor/cache"
|
"github.com/geziyor/geziyor/cache"
|
||||||
@ -264,6 +265,32 @@ func TestPassMetadata(t *testing.T) {
|
|||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestProxy(t *testing.T) {
|
||||||
|
// Setup fake proxy server
|
||||||
|
testHeaderKey := "Geziyor"
|
||||||
|
testHeaderVal := "value"
|
||||||
|
proxy := goproxy.NewProxyHttpServer()
|
||||||
|
proxy.OnRequest().DoFunc(func(r *http.Request, ctx *goproxy.ProxyCtx) (*http.Request, *http.Response) {
|
||||||
|
r.Header.Set(testHeaderKey, testHeaderVal)
|
||||||
|
return r, nil
|
||||||
|
})
|
||||||
|
ts := httptest.NewServer(proxy)
|
||||||
|
defer ts.Close()
|
||||||
|
|
||||||
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
|
StartURLs: []string{"http://httpbin.org/anything"},
|
||||||
|
ProxyFunc: client.RoundRobinProxy(ts.URL),
|
||||||
|
RobotsTxtDisabled: true,
|
||||||
|
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
||||||
|
var data map[string]interface{}
|
||||||
|
err := json.Unmarshal(r.Body, &data)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
// Check header set
|
||||||
|
assert.Equal(t, testHeaderVal, data["headers"].(map[string]interface{})[testHeaderKey])
|
||||||
|
},
|
||||||
|
}).Start()
|
||||||
|
}
|
||||||
|
|
||||||
// Make sure to increase open file descriptor limits before running
|
// Make sure to increase open file descriptor limits before running
|
||||||
func BenchmarkRequests(b *testing.B) {
|
func BenchmarkRequests(b *testing.B) {
|
||||||
|
|
||||||
|
1
go.mod
1
go.mod
@ -7,6 +7,7 @@ require (
|
|||||||
github.com/VividCortex/gohistogram v1.0.0 // indirect
|
github.com/VividCortex/gohistogram v1.0.0 // indirect
|
||||||
github.com/chromedp/cdproto v0.0.0-20210713064928-7d28b402946a
|
github.com/chromedp/cdproto v0.0.0-20210713064928-7d28b402946a
|
||||||
github.com/chromedp/chromedp v0.7.4
|
github.com/chromedp/chromedp v0.7.4
|
||||||
|
github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4 // indirect
|
||||||
github.com/fortytw2/leaktest v1.3.0
|
github.com/fortytw2/leaktest v1.3.0
|
||||||
github.com/go-kit/kit v0.8.0
|
github.com/go-kit/kit v0.8.0
|
||||||
github.com/golang/snappy v0.0.3 // indirect
|
github.com/golang/snappy v0.0.3 // indirect
|
||||||
|
4
go.sum
4
go.sum
@ -18,6 +18,9 @@ github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moA
|
|||||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4 h1:lS3P5Nw3oPO05Lk2gFiYUOL3QPaH+fRoI1wFOc4G1UY=
|
||||||
|
github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4/go.mod h1:Ro8st/ElPeALwNFlcTpWmkr6IoMFfkjXAvTHpevnDsM=
|
||||||
|
github.com/elazarl/goproxy/ext v0.0.0-20190711103511-473e67f1d7d2/go.mod h1:gNh8nYJoAm43RfaxurUnxr+N1PwuFV3ZMl/efxlIlY8=
|
||||||
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
|
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
|
||||||
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
|
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
|
||||||
github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
|
github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
|
||||||
@ -79,6 +82,7 @@ github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8
|
|||||||
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
|
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
|
||||||
github.com/prometheus/procfs v0.0.2 h1:6LJUbpNm42llc4HRCuvApCSWB/WfhuNo9K98Q9sNGfs=
|
github.com/prometheus/procfs v0.0.2 h1:6LJUbpNm42llc4HRCuvApCSWB/WfhuNo9K98Q9sNGfs=
|
||||||
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
|
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
|
||||||
|
github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc=
|
||||||
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
|
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
|
||||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||||
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||||
|
@ -6,6 +6,8 @@ import (
|
|||||||
"github.com/geziyor/geziyor/export"
|
"github.com/geziyor/geziyor/export"
|
||||||
"github.com/geziyor/geziyor/metrics"
|
"github.com/geziyor/geziyor/metrics"
|
||||||
"github.com/geziyor/geziyor/middleware"
|
"github.com/geziyor/geziyor/middleware"
|
||||||
|
"net/http"
|
||||||
|
"net/url"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -69,6 +71,9 @@ type Options struct {
|
|||||||
// If true, HTML parsing is disabled to improve performance.
|
// If true, HTML parsing is disabled to improve performance.
|
||||||
ParseHTMLDisabled bool
|
ParseHTMLDisabled bool
|
||||||
|
|
||||||
|
// ProxyFunc selects the proxy to use for each request
|
||||||
|
ProxyFunc func(*http.Request) (*url.URL, error)
|
||||||
|
|
||||||
// Request delays
|
// Request delays
|
||||||
RequestDelay time.Duration
|
RequestDelay time.Duration
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user