Proxy support

This commit is contained in:
Musab Gültekin 2021-09-24 16:15:20 +03:00
parent 242b025c9a
commit 97ecb7f118
8 changed files with 115 additions and 1 deletions

View File

@ -15,6 +15,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
- Request Delays (Constant/Randomized)
- Cookies, Middlewares, robots.txt
- Automatic response decoding to UTF-8
- Proxy management (Single, Round-Robin, Custom)
See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
@ -167,6 +168,27 @@ geziyor.NewGeziyor(&geziyor.Options{
}).Start()
```
### Proxy - Use proxy per request
If you want to send your requests through a single proxy, you can just set these environment variables:
`HTTP_PROXY`
`HTTPS_PROXY`
And geziyor will use those proxies.
You can also rotate through several proxies in order, one per request, by setting the `ProxyFunc` option to `client.RoundRobinProxy`,
or to any custom proxy selection function that you want. See `client/proxy.go` for an example of how to implement such a custom proxy selection function.
Proxies can be HTTP, HTTPS and SOCKS5.
Note: If you use the `http` scheme for a proxy, it will be used for http requests and not for https requests.
```go
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://httpbin.org/anything"},
ParseFunc: parseFunc,
ProxyFunc: client.RoundRobinProxy("http://some-http-proxy.com", "https://some-https-proxy.com", "socks5://some-socks5-proxy.com"),
}).Start()
```
## Benchmark
**8748 requests per second** on *Macbook Pro 15" 2016*

View File

@ -38,6 +38,7 @@ type Options struct {
RetryHTTPCodes []int
RemoteAllocatorURL string
AllocatorOptions []chromedp.ExecAllocatorOption
ProxyFunc func(*http.Request) (*url.URL, error)
}
// Default values for client
@ -53,9 +54,15 @@ var (
// NewClient creates http.Client with modified values for typical web scraper
func NewClient(opt *Options) *Client {
// Default proxy function is http.ProxyFromEnvironment; a non-nil opt.ProxyFunc overrides it
var proxyFunction = http.ProxyFromEnvironment
if opt.ProxyFunc != nil {
proxyFunction = opt.ProxyFunc
}
httpClient := &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
Proxy: proxyFunction,
DialContext: (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,

47
client/proxy.go Normal file
View File

@ -0,0 +1,47 @@
package client
import (
"context"
"github.com/geziyor/geziyor/internal"
"net/http"
"net/url"
"sync/atomic"
)
// ProxyURLKey is the request-context key under which roundRobinProxy.GetProxy
// stores the URL of the proxy it selected, as a string.
// NOTE(review): the key is the untyped constant 0, so it is stored as a plain
// int and could collide with any other code that uses int(0) as a context key;
// a dedicated key type would avoid that — confirm before changing, since
// callers may already look the value up with this constant.
const ProxyURLKey = 0
// roundRobinProxy hands out proxy URLs from a fixed list in rotation.
type roundRobinProxy struct {
	// proxyURLs is the list of proxies to rotate through.
	proxyURLs []*url.URL
	// index counts GetProxy calls; it is advanced with sync/atomic and reduced
	// modulo len(proxyURLs) to pick the next proxy.
	index uint32
}
// GetProxy returns the next proxy URL in rotation for the given request.
//
// Each call atomically claims one slot of the rotation counter, so the
// selection itself does not need a lock. The chosen proxy URL is also recorded
// (as a string) in the request's context under ProxyURLKey; the request is
// overwritten in place with the context-carrying copy so the value is visible
// to whoever holds the same *http.Request. The returned error is always nil.
func (r *roundRobinProxy) GetProxy(pr *http.Request) (*url.URL, error) {
	// AddUint32 yields the incremented counter; step back by one to get the
	// slot claimed by this call.
	slot := atomic.AddUint32(&r.index, 1) - 1
	total := uint32(len(r.proxyURLs))
	chosen := r.proxyURLs[slot%total]

	// Attach the selected proxy URL to the request context.
	withProxy := context.WithValue(pr.Context(), ProxyURLKey, chosen.String())
	*pr = *pr.WithContext(withProxy)

	return chosen, nil
}
// RoundRobinProxy creates a proxy switcher function which rotates
// proxyURLs on every request.
//
// The proxy type is determined by the URL scheme. "http", "https"
// and "socks5" are supported. If the scheme is empty (for example
// "127.0.0.1:8080" or "proxy.example.com:3128"), "http" is assumed.
//
// With no arguments it returns http.ProxyFromEnvironment. If any proxy URL
// cannot be parsed, the error is logged and nil is returned; a nil ProxyFunc
// makes NewClient fall back to http.ProxyFromEnvironment.
func RoundRobinProxy(proxyURLs ...string) func(*http.Request) (*url.URL, error) {
	if len(proxyURLs) < 1 {
		return http.ProxyFromEnvironment
	}
	parsedProxyURLs := make([]*url.URL, len(proxyURLs))
	for i, u := range proxyURLs {
		parsedURL, err := url.Parse(u)
		if err != nil || parsedURL.Scheme == "" || parsedURL.Host == "" {
			// A bare "host:port" either fails to parse (IP literal) or is read
			// as scheme "host" with an opaque part and no host. Retry with an
			// explicit "http://" prefix so the documented default applies; if
			// that does not yield a usable host, keep the original result.
			if withScheme, retryErr := url.Parse("http://" + u); retryErr == nil && withScheme.Host != "" {
				parsedURL, err = withScheme, nil
			}
		}
		if err != nil {
			internal.Logger.Printf("proxy url parse: %v", err)
			return nil
		}
		parsedProxyURLs[i] = parsedURL
	}
	return (&roundRobinProxy{parsedProxyURLs, 0}).GetProxy
}

View File

@ -79,6 +79,7 @@ func NewGeziyor(opt *Options) *Geziyor {
RetryHTTPCodes: opt.RetryHTTPCodes,
RemoteAllocatorURL: opt.BrowserEndpoint,
AllocatorOptions: chromedp.DefaultExecAllocatorOptions[:],
ProxyFunc: opt.ProxyFunc,
})
if opt.Cache != nil {
geziyor.Client.Transport = &cache.Transport{

View File

@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/elazarl/goproxy"
"github.com/fortytw2/leaktest"
"github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/cache"
@ -264,6 +265,32 @@ func TestPassMetadata(t *testing.T) {
}).Start()
}
// TestProxy checks that requests are sent through the proxy selected by
// client.RoundRobinProxy. A local goproxy server adds a marker header to every
// request it forwards, and the test expects the JSON response of the start URL
// to contain a "headers" object that echoes that marker back.
func TestProxy(t *testing.T) {
	// Setup fake proxy server
	testHeaderKey := "Geziyor"
	testHeaderVal := "value"
	proxy := goproxy.NewProxyHttpServer()
	proxy.OnRequest().DoFunc(func(r *http.Request, ctx *goproxy.ProxyCtx) (*http.Request, *http.Response) {
		r.Header.Set(testHeaderKey, testHeaderVal)
		return r, nil
	})
	ts := httptest.NewServer(proxy)
	defer ts.Close()

	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs:         []string{"http://httpbin.org/anything"},
		ProxyFunc:         client.RoundRobinProxy(ts.URL),
		RobotsTxtDisabled: true,
		ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
			var data map[string]interface{}
			// Stop on a bad body: data would be nil and the lookups below
			// would be meaningless.
			if !assert.NoError(t, json.Unmarshal(r.Body, &data)) {
				return
			}

			// Use the comma-ok form so an unexpected response shape is
			// reported as a test failure instead of panicking.
			headers, ok := data["headers"].(map[string]interface{})
			if !assert.True(t, ok, "response has no headers object") {
				return
			}

			// Check header set
			assert.Equal(t, testHeaderVal, headers[testHeaderKey])
		},
	}).Start()
}
// Make sure to increase open file descriptor limits before running
func BenchmarkRequests(b *testing.B) {

1
go.mod
View File

@ -7,6 +7,7 @@ require (
github.com/VividCortex/gohistogram v1.0.0 // indirect
github.com/chromedp/cdproto v0.0.0-20210713064928-7d28b402946a
github.com/chromedp/chromedp v0.7.4
github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4 // indirect
github.com/fortytw2/leaktest v1.3.0
github.com/go-kit/kit v0.8.0
github.com/golang/snappy v0.0.3 // indirect

4
go.sum
View File

@ -18,6 +18,9 @@ github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moA
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4 h1:lS3P5Nw3oPO05Lk2gFiYUOL3QPaH+fRoI1wFOc4G1UY=
github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4/go.mod h1:Ro8st/ElPeALwNFlcTpWmkr6IoMFfkjXAvTHpevnDsM=
github.com/elazarl/goproxy/ext v0.0.0-20190711103511-473e67f1d7d2/go.mod h1:gNh8nYJoAm43RfaxurUnxr+N1PwuFV3ZMl/efxlIlY8=
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
@ -79,6 +82,7 @@ github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.2 h1:6LJUbpNm42llc4HRCuvApCSWB/WfhuNo9K98Q9sNGfs=
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=

View File

@ -6,6 +6,8 @@ import (
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/metrics"
"github.com/geziyor/geziyor/middleware"
"net/http"
"net/url"
"time"
)
@ -69,6 +71,9 @@ type Options struct {
// If true, HTML parsing is disabled to improve performance.
ParseHTMLDisabled bool
// ProxyFunc returns the proxy to use for each request. If nil, http.ProxyFromEnvironment is used.
ProxyFunc func(*http.Request) (*url.URL, error)
// Request delays
RequestDelay time.Duration