Remote endpoint support added for js rendered requests. Geziyor is beta now.

This commit is contained in:
Musab Gültekin 2019-08-05 15:14:47 +03:00
parent c117d71fef
commit 0e5230eac8
6 changed files with 54 additions and 13 deletions

View File

@ -6,8 +6,8 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
[![Code Coverage](https://img.shields.io/codecov/c/github/geziyor/geziyor/master.svg)](https://codecov.io/github/geziyor/geziyor?branch=master) [![Code Coverage](https://img.shields.io/codecov/c/github/geziyor/geziyor/master.svg)](https://codecov.io/github/geziyor/geziyor?branch=master)
## Features ## Features
- **JS Rendering**
- 5.000+ Requests/Sec - 5.000+ Requests/Sec
- JS Rendering
- Caching (Memory/Disk/LevelDB) - Caching (Memory/Disk/LevelDB)
- Automatic Data Exporting (JSON, CSV, or custom) - Automatic Data Exporting (JSON, CSV, or custom)
- Metrics (Prometheus, Expvar, or custom) - Metrics (Prometheus, Expvar, or custom)
@ -19,7 +19,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings. See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
## Status ## Status
The project is in **development phase**. Thus, we highly recommend you to use Geziyor with go modules. The project is in **beta** phase. Thus, we highly recommend you to use Geziyor with go modules.
## Usage ## Usage
@ -61,7 +61,7 @@ If you want to make JS rendered requests, make sure you have Chrome installed.
If you want to make concurrent requests over 256, you need to increase limits. If you want to make concurrent requests over 256, you need to increase limits.
Read [this](https://wilsonmar.github.io/maximum-limits/) for more. Read [this](https://wilsonmar.github.io/maximum-limits/) for more.
### Making Requests ### Making Normal Requests
Initial requests start with ```StartURLs []string``` field in ```Options```. Initial requests start with ```StartURLs []string``` field in ```Options```.
Geziyor makes concurrent requests to those URLs. Geziyor makes concurrent requests to those URLs.
@ -84,7 +84,6 @@ You can make requests using ```Geziyor``` [methods](https://godoc.org/github.com
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) { StartRequestsFunc: func(g *geziyor.Geziyor) {
g.Get("https://httpbin.org/anything", g.Opt.ParseFunc) g.Get("https://httpbin.org/anything", g.Opt.ParseFunc)
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
g.Head("https://httpbin.org/anything", g.Opt.ParseFunc) g.Head("https://httpbin.org/anything", g.Opt.ParseFunc)
}, },
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
@ -93,6 +92,23 @@ geziyor.NewGeziyor(&geziyor.Options{
}).Start() }).Start()
``` ```
### Making JS Rendered Requests
JS rendered requests can be made using the ```GetRendered``` method.
By default, geziyor uses the local Chrome application CLI to start a Chrome browser. Set the ```BrowserEndpoint``` option to use a different Chrome instance, such as "ws://localhost:3000".
```go
geziyor.NewGeziyor(&geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) {
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
},
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
fmt.Println(string(r.Body))
},
//BrowserEndpoint: "ws://localhost:3000",
}).Start()
```
### Extracting Data ### Extracting Data
We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document). We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).

View File

@ -31,6 +31,7 @@ type Client struct {
charsetDetectDisabled bool charsetDetectDisabled bool
retryTimes int retryTimes int
retryHTTPCodes []int retryHTTPCodes []int
remoteAllocatorURL string
} }
const ( const (
@ -44,7 +45,7 @@ var (
) )
// NewClient creates http.Client with modified values for typical web scraper // NewClient creates http.Client with modified values for typical web scraper
func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int) *Client { func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int, remoteAllocatorURL string) *Client {
httpClient := &http.Client{ httpClient := &http.Client{
Transport: &http.Transport{ Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment, Proxy: http.ProxyFromEnvironment,
@ -68,6 +69,7 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re
charsetDetectDisabled: charsetDetectDisabled, charsetDetectDisabled: charsetDetectDisabled,
retryTimes: retryTimes, retryTimes: retryTimes,
retryHTTPCodes: retryHTTPCodes, retryHTTPCodes: retryHTTPCodes,
remoteAllocatorURL: remoteAllocatorURL,
} }
return &client return &client
@ -156,12 +158,16 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
var body string var body string
var res *network.Response var res *network.Response
ctx, cancel := chromedp.NewContext(context.Background()) ctx := context.Background()
if c.remoteAllocatorURL != "" {
ctx, _ = chromedp.NewRemoteAllocator(ctx, c.remoteAllocatorURL)
}
ctx, cancel := chromedp.NewContext(ctx)
defer cancel() defer cancel()
if err := chromedp.Run(ctx, if err := chromedp.Run(ctx,
network.Enable(), network.Enable(),
network.SetExtraHTTPHeaders(network.Headers(ConvertHeaderToMap(req.Header))), network.SetExtraHTTPHeaders(ConvertHeaderToMap(req.Header)),
chromedp.ActionFunc(func(ctx context.Context) error { chromedp.ActionFunc(func(ctx context.Context) error {
var reqID network.RequestID var reqID network.RequestID
chromedp.ListenTarget(ctx, func(ev interface{}) { chromedp.ListenTarget(ctx, func(ev interface{}) {

View File

@ -101,7 +101,7 @@ func TestCharsetFromHeaders(t *testing.T) {
defer ts.Close() defer ts.Close()
req, _ := NewRequest("GET", ts.URL, nil) req, _ := NewRequest("GET", ts.URL, nil)
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req) res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
if string(res.Body) != "Gültekin" { if string(res.Body) != "Gültekin" {
t.Fatal(string(res.Body)) t.Fatal(string(res.Body))
@ -116,7 +116,7 @@ func TestCharsetFromBody(t *testing.T) {
defer ts.Close() defer ts.Close()
req, _ := NewRequest("GET", ts.URL, nil) req, _ := NewRequest("GET", ts.URL, nil)
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req) res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
if string(res.Body) != "Gültekin" { if string(res.Body) != "Gültekin" {
t.Fatal(string(res.Body)) t.Fatal(string(res.Body))
@ -132,7 +132,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) {
req, _ := NewRequest("GET", ts.URL, nil) req, _ := NewRequest("GET", ts.URL, nil)
req.Encoding = "windows-1254" req.Encoding = "windows-1254"
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req) res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
if string(res.Body) != "Gültekin" { if string(res.Body) != "Gültekin" {
t.Fatal(string(res.Body)) t.Fatal(string(res.Body))
@ -141,7 +141,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) {
func TestRetry(t *testing.T) { func TestRetry(t *testing.T) {
req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil) req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil)
res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req) res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
assert.Nil(t, res) assert.Nil(t, res)
assert.Error(t, err) assert.Error(t, err)
} }

View File

@ -67,7 +67,7 @@ func NewGeziyor(opt *Options) *Geziyor {
} }
// Client // Client
geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes) geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes, opt.BrowserEndpoint)
if opt.Cache != nil { if opt.Cache != nil {
geziyor.Client.Transport = &cache.Transport{ geziyor.Client.Transport = &cache.Transport{
Policy: opt.CachePolicy, Policy: opt.CachePolicy,

View File

@ -116,6 +116,20 @@ func TestGetRendered(t *testing.T) {
}).Start() }).Start()
} }
// Run a headless Chrome instance to test this.
//func TestGetRenderedRemoteAllocator(t *testing.T) {
// geziyor.NewGeziyor(&geziyor.Options{
// StartRequestsFunc: func(g *geziyor.Geziyor) {
// g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
// },
// ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
// fmt.Println(string(r.Body))
// fmt.Println(r.Request.URL.String(), r.Header)
// },
// BrowserEndpoint: "ws://localhost:3000",
// }).Start()
//}
func TestHEADRequest(t *testing.T) { func TestHEADRequest(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) { StartRequestsFunc: func(g *geziyor.Geziyor) {
@ -206,7 +220,7 @@ func BenchmarkRequests(b *testing.B) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, "Hello, client") fmt.Fprint(w, "Hello, client")
})) }))
ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes).Transport ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes, "").Transport
defer ts.Close() defer ts.Close()
// As we don't benchmark creating a server, reset timer. // As we don't benchmark creating a server, reset timer.

View File

@ -15,6 +15,11 @@ type Options struct {
// If empty, any domain is allowed // If empty, any domain is allowed
AllowedDomains []string AllowedDomains []string
// Chrome headless browser WS endpoint.
// If you want to run your own Chrome browser runner, provide its endpoint here.
// For example: ws://localhost:3000
BrowserEndpoint string
// Cache storage backends. // Cache storage backends.
// - Memory // - Memory
// - Disk // - Disk