diff --git a/README.md b/README.md index 9b4a6ce..47a8fea 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,8 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use [![Code Coverage](https://img.shields.io/codecov/c/github/geziyor/geziyor/master.svg)](https://codecov.io/github/geziyor/geziyor?branch=master) ## Features +- **JS Rendering** - 5.000+ Requests/Sec -- JS Rendering - Caching (Memory/Disk/LevelDB) - Automatic Data Exporting (JSON, CSV, or custom) - Metrics (Prometheus, Expvar, or custom) @@ -19,7 +19,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings. ## Status -The project is in **development phase**. Thus, we highly recommend you to use Geziyor with go modules. +The project is in **beta** phase. Thus, we highly recommend you to use Geziyor with go modules. ## Usage @@ -61,7 +61,7 @@ If you want to make JS rendered requests, make sure you have Chrome installed. If you want to make concurrent requests over 256, you need to increase limits. Read [this](https://wilsonmar.github.io/maximum-limits/) for more. -### Making Requests +### Making Normal Requests Initial requests start with ```StartURLs []string``` field in ```Options```. Geziyor makes concurrent requests to those URLs. @@ -84,7 +84,6 @@ You can make requests using ```Geziyor``` [methods](https://godoc.org/github.com geziyor.NewGeziyor(&geziyor.Options{ StartRequestsFunc: func(g *geziyor.Geziyor) { g.Get("https://httpbin.org/anything", g.Opt.ParseFunc) - g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc) g.Head("https://httpbin.org/anything", g.Opt.ParseFunc) }, ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { @@ -93,6 +92,23 @@ geziyor.NewGeziyor(&geziyor.Options{ }).Start() ``` +### Making JS Rendered Requests + +JS Rendered requests can be made using ```GetRendered``` method. 
+By default, Geziyor uses the local Chrome application's CLI to start a Chrome browser. Set the ```BrowserEndpoint``` option to use a different Chrome instance, such as "ws://localhost:3000". + +```go +geziyor.NewGeziyor(&geziyor.Options{ + StartRequestsFunc: func(g *geziyor.Geziyor) { + g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc) + }, + ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { + fmt.Println(string(r.Body)) + }, + //BrowserEndpoint: "ws://localhost:3000", +}).Start() +``` + ### Extracting Data We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document). diff --git a/client/client.go b/client/client.go index 1d75c68..ac2d91d 100644 --- a/client/client.go +++ b/client/client.go @@ -31,6 +31,7 @@ type Client struct { charsetDetectDisabled bool retryTimes int retryHTTPCodes []int + remoteAllocatorURL string } const ( @@ -44,7 +45,7 @@ var ( ) // NewClient creates http.Client with modified values for typical web scraper -func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int) *Client { +func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int, remoteAllocatorURL string) *Client { httpClient := &http.Client{ Transport: &http.Transport{ Proxy: http.ProxyFromEnvironment, @@ -68,6 +69,7 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re charsetDetectDisabled: charsetDetectDisabled, retryTimes: retryTimes, retryHTTPCodes: retryHTTPCodes, + remoteAllocatorURL: remoteAllocatorURL, } return &client @@ -156,12 +158,16 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) { var body string var res *network.Response - ctx, cancel := chromedp.NewContext(context.Background()) + ctx := context.Background() + if c.remoteAllocatorURL != "" { + ctx, _ = chromedp.NewRemoteAllocator(ctx, c.remoteAllocatorURL) + } + ctx, cancel := 
chromedp.NewContext(ctx) defer cancel() if err := chromedp.Run(ctx, network.Enable(), - network.SetExtraHTTPHeaders(network.Headers(ConvertHeaderToMap(req.Header))), + network.SetExtraHTTPHeaders(ConvertHeaderToMap(req.Header)), chromedp.ActionFunc(func(ctx context.Context) error { var reqID network.RequestID chromedp.ListenTarget(ctx, func(ev interface{}) { diff --git a/client/client_test.go b/client/client_test.go index db26895..0c1fb40 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -101,7 +101,7 @@ func TestCharsetFromHeaders(t *testing.T) { defer ts.Close() req, _ := NewRequest("GET", ts.URL, nil) - res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req) + res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) if string(res.Body) != "Gültekin" { t.Fatal(string(res.Body)) @@ -116,7 +116,7 @@ func TestCharsetFromBody(t *testing.T) { defer ts.Close() req, _ := NewRequest("GET", ts.URL, nil) - res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req) + res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) if string(res.Body) != "Gültekin" { t.Fatal(string(res.Body)) @@ -132,7 +132,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) { req, _ := NewRequest("GET", ts.URL, nil) req.Encoding = "windows-1254" - res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req) + res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) if string(res.Body) != "Gültekin" { t.Fatal(string(res.Body)) @@ -141,7 +141,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) { func TestRetry(t *testing.T) { req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil) - res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req) + res, err := 
NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req) assert.Nil(t, res) assert.Error(t, err) } diff --git a/geziyor.go b/geziyor.go index 01f68cb..ed2a540 100644 --- a/geziyor.go +++ b/geziyor.go @@ -67,7 +67,7 @@ func NewGeziyor(opt *Options) *Geziyor { } // Client - geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes) + geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes, opt.BrowserEndpoint) if opt.Cache != nil { geziyor.Client.Transport = &cache.Transport{ Policy: opt.CachePolicy, diff --git a/geziyor_test.go b/geziyor_test.go index 180aaba..d2d7a62 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -116,6 +116,20 @@ func TestGetRendered(t *testing.T) { }).Start() } +// Run chrome headless instance to test this +//func TestGetRenderedRemoteAllocator(t *testing.T) { +// geziyor.NewGeziyor(&geziyor.Options{ +// StartRequestsFunc: func(g *geziyor.Geziyor) { +// g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc) +// }, +// ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { +// fmt.Println(string(r.Body)) +// fmt.Println(r.Request.URL.String(), r.Header) +// }, +// BrowserEndpoint: "ws://localhost:3000", +// }).Start() +//} + func TestHEADRequest(t *testing.T) { geziyor.NewGeziyor(&geziyor.Options{ StartRequestsFunc: func(g *geziyor.Geziyor) { @@ -206,7 +220,7 @@ func BenchmarkRequests(b *testing.B) { ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { fmt.Fprint(w, "Hello, client") })) - ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes).Transport + ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes, "").Transport defer ts.Close() // As we don't benchmark creating a server, reset timer. 
diff --git a/options.go b/options.go index 84c1513..55979b6 100644 --- a/options.go +++ b/options.go @@ -15,6 +15,11 @@ type Options struct { // If empty, any domain is allowed AllowedDomains []string + // Chrome headless browser WS endpoint. + // If you want to run your own Chrome browser runner, provide its endpoint here. + // For example: ws://localhost:3000 + BrowserEndpoint string + // Cache storage backends. // - Memory // - Disk