From 97ecb7f118a299ef0021c7c3aed4e36bee52c8bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Fri, 24 Sep 2021 16:15:20 +0300 Subject: [PATCH] Proxy support --- README.md | 22 ++++++++++++++++++++++ client/client.go | 9 ++++++++- client/proxy.go | 47 +++++++++++++++++++++++++++++++++++++++++++++++ geziyor.go | 1 + geziyor_test.go | 27 +++++++++++++++++++++++++++ go.mod | 1 + go.sum | 4 ++++ options.go | 5 +++++ 8 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 client/proxy.go diff --git a/README.md b/README.md index 037b672..d7e0f65 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use - Request Delays (Constant/Randomized) - Cookies, Middlewares, robots.txt - Automatic response decoding to UTF-8 +- Proxy management (Single, Round-Robin, Custom) See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings. @@ -167,6 +168,27 @@ geziyor.NewGeziyor(&geziyor.Options{ }).Start() ``` +### Proxy - Use proxy per request +If you want to use a proxy for your requests and you have only one proxy, you can just set these environment variables: +`HTTP_PROXY` +`HTTPS_PROXY` +Geziyor will then use those proxies. + +You can also use proxies in order, one per request, by setting the `ProxyFunc` option to `client.RoundRobinProxy`, +or to any custom proxy selection function that you want. See `client/proxy.go` for how to implement such a custom proxy selection function. + +Proxies can be HTTP, HTTPS or SOCKS5. + +Note: If you use the `http` scheme for a proxy, it'll be used for http requests and not for https requests. 
+ +```go +geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{"http://httpbin.org/anything"}, + ParseFunc: parseFunc, + ProxyFunc: client.RoundRobinProxy("http://some-http-proxy.com", "https://some-https-proxy.com", "socks5://some-socks5-proxy.com"), +}).Start() +``` + ## Benchmark **8748 request per seconds** on *Macbook Pro 15" 2016* diff --git a/client/client.go b/client/client.go index f3b61bb..8b566d8 100644 --- a/client/client.go +++ b/client/client.go @@ -38,6 +38,7 @@ type Options struct { RetryHTTPCodes []int RemoteAllocatorURL string AllocatorOptions []chromedp.ExecAllocatorOption + ProxyFunc func(*http.Request) (*url.URL, error) } // Default values for client @@ -53,9 +54,15 @@ var ( // NewClient creates http.Client with modified values for typical web scraper func NewClient(opt *Options) *Client { + // Default proxy function is http.ProxyFromEnvironment + var proxyFunction = http.ProxyFromEnvironment + if opt.ProxyFunc != nil { + proxyFunction = opt.ProxyFunc + } + httpClient := &http.Client{ Transport: &http.Transport{ - Proxy: http.ProxyFromEnvironment, + Proxy: proxyFunction, DialContext: (&net.Dialer{ Timeout: 30 * time.Second, KeepAlive: 30 * time.Second, diff --git a/client/proxy.go b/client/proxy.go new file mode 100644 index 0000000..0a97323 --- /dev/null +++ b/client/proxy.go @@ -0,0 +1,47 @@ +package client + +import ( + "context" + "github.com/geziyor/geziyor/internal" + "net/http" + "net/url" + "sync/atomic" +) + +const ProxyURLKey = 0 + +type roundRobinProxy struct { + proxyURLs []*url.URL + index uint32 +} + +func (r *roundRobinProxy) GetProxy(pr *http.Request) (*url.URL, error) { + index := atomic.AddUint32(&r.index, 1) - 1 + u := r.proxyURLs[index%uint32(len(r.proxyURLs))] + + // Set proxy url to context + ctx := context.WithValue(pr.Context(), ProxyURLKey, u.String()) + *pr = *pr.WithContext(ctx) + return u, nil +} + +// RoundRobinProxy creates a proxy switcher function which rotates +// ProxyURLs on every request. 
+// The proxy type is determined by the URL scheme. "http", "https" +// and "socks5" are supported. If the scheme is empty, +// "http" is assumed. +func RoundRobinProxy(proxyURLs ...string) func(*http.Request) (*url.URL, error) { + if len(proxyURLs) < 1 { + return http.ProxyFromEnvironment + } + parsedProxyURLs := make([]*url.URL, len(proxyURLs)) + for i, u := range proxyURLs { + parsedURL, err := url.Parse(u) + if err != nil { + internal.Logger.Printf("proxy url parse: %v", err) + return nil + } + parsedProxyURLs[i] = parsedURL + } + return (&roundRobinProxy{parsedProxyURLs, 0}).GetProxy +} diff --git a/geziyor.go b/geziyor.go index 2386984..5e3b639 100644 --- a/geziyor.go +++ b/geziyor.go @@ -79,6 +79,7 @@ func NewGeziyor(opt *Options) *Geziyor { RetryHTTPCodes: opt.RetryHTTPCodes, RemoteAllocatorURL: opt.BrowserEndpoint, AllocatorOptions: chromedp.DefaultExecAllocatorOptions[:], + ProxyFunc: opt.ProxyFunc, }) if opt.Cache != nil { geziyor.Client.Transport = &cache.Transport{ diff --git a/geziyor_test.go b/geziyor_test.go index 5d552b8..d15fa38 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "github.com/PuerkitoBio/goquery" + "github.com/elazarl/goproxy" "github.com/fortytw2/leaktest" "github.com/geziyor/geziyor" "github.com/geziyor/geziyor/cache" @@ -264,6 +265,32 @@ func TestPassMetadata(t *testing.T) { }).Start() } +func TestProxy(t *testing.T) { + // Setup fake proxy server + testHeaderKey := "Geziyor" + testHeaderVal := "value" + proxy := goproxy.NewProxyHttpServer() + proxy.OnRequest().DoFunc(func(r *http.Request, ctx *goproxy.ProxyCtx) (*http.Request, *http.Response) { + r.Header.Set(testHeaderKey, testHeaderVal) + return r, nil + }) + ts := httptest.NewServer(proxy) + defer ts.Close() + + geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{"http://httpbin.org/anything"}, + ProxyFunc: client.RoundRobinProxy(ts.URL), + RobotsTxtDisabled: true, + ParseFunc: func(g *geziyor.Geziyor, r *client.Response) 
{ + var data map[string]interface{} + err := json.Unmarshal(r.Body, &data) + assert.NoError(t, err) + // Check header set + assert.Equal(t, testHeaderVal, data["headers"].(map[string]interface{})[testHeaderKey]) + }, + }).Start() +} + // Make sure to increase open file descriptor limits before running func BenchmarkRequests(b *testing.B) { diff --git a/go.mod b/go.mod index 0499f20..2c8f105 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/VividCortex/gohistogram v1.0.0 // indirect github.com/chromedp/cdproto v0.0.0-20210713064928-7d28b402946a github.com/chromedp/chromedp v0.7.4 + github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4 // indirect github.com/fortytw2/leaktest v1.3.0 github.com/go-kit/kit v0.8.0 github.com/golang/snappy v0.0.3 // indirect diff --git a/go.sum b/go.sum index db4bdf9..605a8a1 100644 --- a/go.sum +++ b/go.sum @@ -18,6 +18,9 @@ github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moA github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4 h1:lS3P5Nw3oPO05Lk2gFiYUOL3QPaH+fRoI1wFOc4G1UY= +github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4/go.mod h1:Ro8st/ElPeALwNFlcTpWmkr6IoMFfkjXAvTHpevnDsM= +github.com/elazarl/goproxy/ext v0.0.0-20190711103511-473e67f1d7d2/go.mod h1:gNh8nYJoAm43RfaxurUnxr+N1PwuFV3ZMl/efxlIlY8= github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= @@ -79,6 +82,7 @@ github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8 github.com/prometheus/procfs 
v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2 h1:6LJUbpNm42llc4HRCuvApCSWB/WfhuNo9K98Q9sNGfs= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= +github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= diff --git a/options.go b/options.go index 92d8d36..0a8ec81 100644 --- a/options.go +++ b/options.go @@ -6,6 +6,8 @@ import ( "github.com/geziyor/geziyor/export" "github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/middleware" + "net/http" + "net/url" "time" ) @@ -69,6 +71,9 @@ type Options struct { // If true, HTML parsing is disabled to improve performance. ParseHTMLDisabled bool + // ProxyFunc selects the proxy for each request; if nil, http.ProxyFromEnvironment is used + ProxyFunc func(*http.Request) (*url.URL, error) + // Request delays RequestDelay time.Duration