Remote endpoint support added for js rendered requests. Geziyor is beta now.
This commit is contained in:
parent
c117d71fef
commit
0e5230eac8
24
README.md
24
README.md
@ -6,8 +6,8 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
|
|||||||
[](https://codecov.io/github/geziyor/geziyor?branch=master)
|
[](https://codecov.io/github/geziyor/geziyor?branch=master)
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
- **JS Rendering**
|
||||||
- 5.000+ Requests/Sec
|
- 5.000+ Requests/Sec
|
||||||
- JS Rendering
|
|
||||||
- Caching (Memory/Disk/LevelDB)
|
- Caching (Memory/Disk/LevelDB)
|
||||||
- Automatic Data Exporting (JSON, CSV, or custom)
|
- Automatic Data Exporting (JSON, CSV, or custom)
|
||||||
- Metrics (Prometheus, Expvar, or custom)
|
- Metrics (Prometheus, Expvar, or custom)
|
||||||
@ -19,7 +19,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
|
|||||||
See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
|
See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings.
|
||||||
|
|
||||||
## Status
|
## Status
|
||||||
The project is in **development phase**. Thus, we highly recommend you to use Geziyor with go modules.
|
The project is in **beta** phase. Thus, we highly recommend you to use Geziyor with go modules.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
@ -61,7 +61,7 @@ If you want to make JS rendered requests, make sure you have Chrome installed.
|
|||||||
If you want to make concurrent requests over 256, you need to increase limits.
|
If you want to make concurrent requests over 256, you need to increase limits.
|
||||||
Read [this](https://wilsonmar.github.io/maximum-limits/) for more.
|
Read [this](https://wilsonmar.github.io/maximum-limits/) for more.
|
||||||
|
|
||||||
### Making Requests
|
### Making Normal Requests
|
||||||
|
|
||||||
Initial requests start with ```StartURLs []string``` field in ```Options```.
|
Initial requests start with ```StartURLs []string``` field in ```Options```.
|
||||||
Geziyor makes concurrent requests to those URLs.
|
Geziyor makes concurrent requests to those URLs.
|
||||||
@ -84,7 +84,6 @@ You can make requests using ```Geziyor``` [methods](https://godoc.org/github.com
|
|||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartRequestsFunc: func(g *geziyor.Geziyor) {
|
StartRequestsFunc: func(g *geziyor.Geziyor) {
|
||||||
g.Get("https://httpbin.org/anything", g.Opt.ParseFunc)
|
g.Get("https://httpbin.org/anything", g.Opt.ParseFunc)
|
||||||
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
|
|
||||||
g.Head("https://httpbin.org/anything", g.Opt.ParseFunc)
|
g.Head("https://httpbin.org/anything", g.Opt.ParseFunc)
|
||||||
},
|
},
|
||||||
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
||||||
@ -93,6 +92,23 @@ geziyor.NewGeziyor(&geziyor.Options{
|
|||||||
}).Start()
|
}).Start()
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Making JS Rendered Requests
|
||||||
|
|
||||||
|
JS Rendered requests can be made using ```GetRendered``` method.
|
||||||
|
By default, geziyor uses the local Chrome application CLI to start a Chrome browser. Set the ```BrowserEndpoint``` option to use a different Chrome instance, such as "ws://localhost:3000".
|
||||||
|
|
||||||
|
```go
|
||||||
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
|
StartRequestsFunc: func(g *geziyor.Geziyor) {
|
||||||
|
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
|
||||||
|
},
|
||||||
|
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
||||||
|
fmt.Println(string(r.Body))
|
||||||
|
},
|
||||||
|
//BrowserEndpoint: "ws://localhost:3000",
|
||||||
|
}).Start()
|
||||||
|
```
|
||||||
|
|
||||||
### Extracting Data
|
### Extracting Data
|
||||||
|
|
||||||
We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).
|
We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document).
|
||||||
|
@ -31,6 +31,7 @@ type Client struct {
|
|||||||
charsetDetectDisabled bool
|
charsetDetectDisabled bool
|
||||||
retryTimes int
|
retryTimes int
|
||||||
retryHTTPCodes []int
|
retryHTTPCodes []int
|
||||||
|
remoteAllocatorURL string
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -44,7 +45,7 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// NewClient creates http.Client with modified values for typical web scraper
|
// NewClient creates http.Client with modified values for typical web scraper
|
||||||
func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int) *Client {
|
func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, retryHTTPCodes []int, remoteAllocatorURL string) *Client {
|
||||||
httpClient := &http.Client{
|
httpClient := &http.Client{
|
||||||
Transport: &http.Transport{
|
Transport: &http.Transport{
|
||||||
Proxy: http.ProxyFromEnvironment,
|
Proxy: http.ProxyFromEnvironment,
|
||||||
@ -68,6 +69,7 @@ func NewClient(maxBodySize int64, charsetDetectDisabled bool, retryTimes int, re
|
|||||||
charsetDetectDisabled: charsetDetectDisabled,
|
charsetDetectDisabled: charsetDetectDisabled,
|
||||||
retryTimes: retryTimes,
|
retryTimes: retryTimes,
|
||||||
retryHTTPCodes: retryHTTPCodes,
|
retryHTTPCodes: retryHTTPCodes,
|
||||||
|
remoteAllocatorURL: remoteAllocatorURL,
|
||||||
}
|
}
|
||||||
|
|
||||||
return &client
|
return &client
|
||||||
@ -156,12 +158,16 @@ func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
|
|||||||
var body string
|
var body string
|
||||||
var res *network.Response
|
var res *network.Response
|
||||||
|
|
||||||
ctx, cancel := chromedp.NewContext(context.Background())
|
ctx := context.Background()
|
||||||
|
if c.remoteAllocatorURL != "" {
|
||||||
|
ctx, _ = chromedp.NewRemoteAllocator(ctx, c.remoteAllocatorURL)
|
||||||
|
}
|
||||||
|
ctx, cancel := chromedp.NewContext(ctx)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
if err := chromedp.Run(ctx,
|
if err := chromedp.Run(ctx,
|
||||||
network.Enable(),
|
network.Enable(),
|
||||||
network.SetExtraHTTPHeaders(network.Headers(ConvertHeaderToMap(req.Header))),
|
network.SetExtraHTTPHeaders(ConvertHeaderToMap(req.Header)),
|
||||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||||
var reqID network.RequestID
|
var reqID network.RequestID
|
||||||
chromedp.ListenTarget(ctx, func(ev interface{}) {
|
chromedp.ListenTarget(ctx, func(ev interface{}) {
|
||||||
|
@ -101,7 +101,7 @@ func TestCharsetFromHeaders(t *testing.T) {
|
|||||||
defer ts.Close()
|
defer ts.Close()
|
||||||
|
|
||||||
req, _ := NewRequest("GET", ts.URL, nil)
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req)
|
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
|
||||||
|
|
||||||
if string(res.Body) != "Gültekin" {
|
if string(res.Body) != "Gültekin" {
|
||||||
t.Fatal(string(res.Body))
|
t.Fatal(string(res.Body))
|
||||||
@ -116,7 +116,7 @@ func TestCharsetFromBody(t *testing.T) {
|
|||||||
defer ts.Close()
|
defer ts.Close()
|
||||||
|
|
||||||
req, _ := NewRequest("GET", ts.URL, nil)
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req)
|
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
|
||||||
|
|
||||||
if string(res.Body) != "Gültekin" {
|
if string(res.Body) != "Gültekin" {
|
||||||
t.Fatal(string(res.Body))
|
t.Fatal(string(res.Body))
|
||||||
@ -132,7 +132,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) {
|
|||||||
|
|
||||||
req, _ := NewRequest("GET", ts.URL, nil)
|
req, _ := NewRequest("GET", ts.URL, nil)
|
||||||
req.Encoding = "windows-1254"
|
req.Encoding = "windows-1254"
|
||||||
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req)
|
res, _ := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
|
||||||
|
|
||||||
if string(res.Body) != "Gültekin" {
|
if string(res.Body) != "Gültekin" {
|
||||||
t.Fatal(string(res.Body))
|
t.Fatal(string(res.Body))
|
||||||
@ -141,7 +141,7 @@ func TestCharsetProvidedWithRequest(t *testing.T) {
|
|||||||
|
|
||||||
func TestRetry(t *testing.T) {
|
func TestRetry(t *testing.T) {
|
||||||
req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil)
|
req, _ := NewRequest("GET", "https://httpbin.org/status/500", nil)
|
||||||
res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes).DoRequest(req)
|
res, err := NewClient(DefaultMaxBody, false, DefaultRetryTimes, DefaultRetryHTTPCodes, "").DoRequest(req)
|
||||||
assert.Nil(t, res)
|
assert.Nil(t, res)
|
||||||
assert.Error(t, err)
|
assert.Error(t, err)
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,7 @@ func NewGeziyor(opt *Options) *Geziyor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Client
|
// Client
|
||||||
geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes)
|
geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes, opt.BrowserEndpoint)
|
||||||
if opt.Cache != nil {
|
if opt.Cache != nil {
|
||||||
geziyor.Client.Transport = &cache.Transport{
|
geziyor.Client.Transport = &cache.Transport{
|
||||||
Policy: opt.CachePolicy,
|
Policy: opt.CachePolicy,
|
||||||
|
@ -116,6 +116,20 @@ func TestGetRendered(t *testing.T) {
|
|||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Run chrome headless instance to test this
|
||||||
|
//func TestGetRenderedRemoteAllocator(t *testing.T) {
|
||||||
|
// geziyor.NewGeziyor(&geziyor.Options{
|
||||||
|
// StartRequestsFunc: func(g *geziyor.Geziyor) {
|
||||||
|
// g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
|
||||||
|
// },
|
||||||
|
// ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
||||||
|
// fmt.Println(string(r.Body))
|
||||||
|
// fmt.Println(r.Request.URL.String(), r.Header)
|
||||||
|
// },
|
||||||
|
// BrowserEndpoint: "ws://localhost:3000",
|
||||||
|
// }).Start()
|
||||||
|
//}
|
||||||
|
|
||||||
func TestHEADRequest(t *testing.T) {
|
func TestHEADRequest(t *testing.T) {
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartRequestsFunc: func(g *geziyor.Geziyor) {
|
StartRequestsFunc: func(g *geziyor.Geziyor) {
|
||||||
@ -206,7 +220,7 @@ func BenchmarkRequests(b *testing.B) {
|
|||||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
fmt.Fprint(w, "Hello, client")
|
fmt.Fprint(w, "Hello, client")
|
||||||
}))
|
}))
|
||||||
ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes).Transport
|
ts.Client().Transport = client.NewClient(client.DefaultMaxBody, false, client.DefaultRetryTimes, client.DefaultRetryHTTPCodes, "").Transport
|
||||||
defer ts.Close()
|
defer ts.Close()
|
||||||
|
|
||||||
// As we don't benchmark creating a server, reset timer.
|
// As we don't benchmark creating a server, reset timer.
|
||||||
|
@ -15,6 +15,11 @@ type Options struct {
|
|||||||
// If empty, any domain is allowed
|
// If empty, any domain is allowed
|
||||||
AllowedDomains []string
|
AllowedDomains []string
|
||||||
|
|
||||||
|
// Chrome headless browser WS endpoint.
|
||||||
|
// If you want to run your own Chrome browser runner, provide its endpoint here
|
||||||
|
// For example: ws://localhost:3000
|
||||||
|
BrowserEndpoint string
|
||||||
|
|
||||||
// Cache storage backends.
|
// Cache storage backends.
|
||||||
// - Memory
|
// - Memory
|
||||||
// - Disk
|
// - Disk
|
||||||
|
Loading…
x
Reference in New Issue
Block a user