JS Rendered requests with Chrome support added
This commit is contained in:
parent
76a687e193
commit
1a7d480b36
@ -6,6 +6,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
|
||||
|
||||
## Features
|
||||
- 1.000+ Requests/Sec
|
||||
- JS Rendering
|
||||
- Caching (Memory/Disk)
|
||||
- Automatic Data Exporting (JSON, CSV, or custom)
|
||||
- Limit Concurrency (Global/Per Domain)
|
||||
|
76
geziyor.go
76
geziyor.go
@ -2,7 +2,10 @@ package geziyor
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/chromedp/cdproto/dom"
|
||||
"github.com/chromedp/chromedp"
|
||||
"github.com/fpfeng/httpcache"
|
||||
"golang.org/x/net/html/charset"
|
||||
"io"
|
||||
@ -107,6 +110,17 @@ func (g *Geziyor) Get(url string, callback func(resp *Response)) {
|
||||
g.Do(&Request{Request: req}, callback)
|
||||
}
|
||||
|
||||
// GetRendered issues GET request using headless browser
|
||||
// Opens up a new Chrome instance, makes request, waits for 1 second to render HTML DOM and closed.
|
||||
func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) {
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
log.Printf("Request creating error %v\n", err)
|
||||
return
|
||||
}
|
||||
g.Do(&Request{Request: req, rendered: true}, callback)
|
||||
}
|
||||
|
||||
// Head issues a HEAD to the specified URL
|
||||
func (g *Geziyor) Head(url string, callback func(resp *Response)) {
|
||||
req, err := http.NewRequest("HEAD", url, nil)
|
||||
@ -131,14 +145,14 @@ func (g *Geziyor) Do(req *Request, callback func(resp *Response)) {
|
||||
return
|
||||
}
|
||||
|
||||
// Modify Request
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
req.Header.Set("Accept-Charset", "utf-8")
|
||||
req.Header.Set("Accept-Language", "en")
|
||||
req.Header.Set("User-Agent", g.Opt.UserAgent)
|
||||
|
||||
// Do request and read response
|
||||
response, err := g.doRequest(req)
|
||||
// Do request normal or chrome and read response
|
||||
var response *Response
|
||||
var err error
|
||||
if !req.rendered {
|
||||
response, err = g.doRequest(req)
|
||||
} else {
|
||||
response, err = g.doRequestChrome(req)
|
||||
}
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
@ -177,6 +191,12 @@ func (g *Geziyor) doRequest(req *Request) (*Response, error) {
|
||||
|
||||
g.delay()
|
||||
|
||||
// Modify Request
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
req.Header.Set("Accept-Charset", "utf-8")
|
||||
req.Header.Set("Accept-Language", "en")
|
||||
req.Header.Set("User-Agent", g.Opt.UserAgent)
|
||||
|
||||
log.Println("Fetching: ", req.URL.String())
|
||||
|
||||
// Do request
|
||||
@ -218,6 +238,44 @@ func (g *Geziyor) doRequest(req *Request) (*Response, error) {
|
||||
return &response, nil
|
||||
}
|
||||
|
||||
func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
|
||||
g.acquireSem(req)
|
||||
defer g.releaseSem(req)
|
||||
|
||||
g.delay()
|
||||
|
||||
ctx, cancel := chromedp.NewContext(context.Background())
|
||||
defer cancel()
|
||||
|
||||
var res string
|
||||
|
||||
if err := chromedp.Run(ctx,
|
||||
chromedp.Navigate(req.URL.String()),
|
||||
chromedp.Sleep(1*time.Second),
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
node, err := dom.GetDocument().Do(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
res, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
|
||||
return err
|
||||
}),
|
||||
); err != nil {
|
||||
log.Printf("Request getting rendered error: %v\n", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
response := &Response{
|
||||
//Response: resp,
|
||||
Body: []byte(res),
|
||||
//Meta: request.Meta,
|
||||
Geziyor: g,
|
||||
Exports: make(chan interface{}),
|
||||
}
|
||||
|
||||
return response, nil
|
||||
}
|
||||
|
||||
func (g *Geziyor) acquireSem(req *Request) {
|
||||
if g.Opt.ConcurrentRequests != 0 {
|
||||
g.semGlobal <- struct{}{}
|
||||
@ -255,11 +313,13 @@ func (g *Geziyor) checkURL(parsedURL *url.URL) bool {
|
||||
}
|
||||
|
||||
// Check for duplicate requests
|
||||
if !g.Opt.URLRevisitEnabled {
|
||||
if contains(g.visitedURLS, rawURL) {
|
||||
//log.Printf("URL already visited %s\n", rawURL)
|
||||
return false
|
||||
}
|
||||
g.visitedURLS = append(g.visitedURLS, rawURL)
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
@ -138,3 +138,15 @@ func parseAlmaany(r *geziyor.Response) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetRendered(t *testing.T) {
|
||||
geziyor.NewGeziyor(geziyor.Options{
|
||||
StartRequestsFunc: func(g *geziyor.Geziyor) {
|
||||
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
|
||||
},
|
||||
ParseFunc: func(r *geziyor.Response) {
|
||||
fmt.Println(string(r.Body))
|
||||
},
|
||||
//URLRevisitEnabled: true,
|
||||
}).Start()
|
||||
}
|
||||
|
2
go.mod
2
go.mod
@ -4,6 +4,8 @@ go 1.12
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.5.0
|
||||
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9
|
||||
github.com/chromedp/chromedp v0.3.0
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a
|
||||
golang.org/x/text v0.3.2 // indirect
|
||||
|
16
go.sum
16
go.sum
@ -2,11 +2,27 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP
|
||||
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
|
||||
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
|
||||
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 h1:ARnDd2vEk91rLNra8yk1hF40H8z+1HrD6juNpe7FsI0=
|
||||
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9/go.mod h1:xquOK9dIGFlLaIGI4c6IyfLI/Gz0LiYYuJtzhsUODgI=
|
||||
github.com/chromedp/chromedp v0.3.0 h1:7/pwrXFRq6/ym3sxCykm90DMoyw6VKXY48DgGRgUURA=
|
||||
github.com/chromedp/chromedp v0.3.0/go.mod h1:EktsZcC2iycVrRhC9fDmshBpCK9lNnZYi6x2q9uE7zI=
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
|
||||
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0=
|
||||
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo=
|
||||
github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8=
|
||||
github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
|
||||
github.com/gobwas/ws v1.0.0 h1:1WdyfgUcImUfVBvYbsW2krIsnko+1QU2t45soaF8v1M=
|
||||
github.com/gobwas/ws v1.0.0/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
|
||||
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls=
|
||||
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW/cCGiF3nGh5v+9Gd3VCgBQbod/GlMaQ=
|
||||
github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983 h1:wL11wNW7dhKIcRCHSm4sHKPWz0tt4mwBsVodG7+Xyqg=
|
||||
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
|
||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
|
@ -52,4 +52,7 @@ type Options struct {
|
||||
|
||||
// Disables charset detection
|
||||
CharsetDetectDisabled bool
|
||||
|
||||
// Revisiting same URLs is disabled by default
|
||||
URLRevisitEnabled bool
|
||||
}
|
||||
|
@ -8,4 +8,6 @@ import (
|
||||
// Request wraps the standard library's *http.Request with crawler-specific data.
type Request struct {
	// Request is the underlying HTTP request; its fields and methods are
	// promoted onto this type through embedding.
	*http.Request

	// Meta holds arbitrary values keyed by string that travel with the request.
	Meta map[string]interface{}

	// rendered reports whether the request is fetched through the headless
	// browser (doRequestChrome) instead of the plain HTTP client (doRequest).
	rendered bool
}
|
||||
|
@ -31,6 +31,9 @@ func (r *Response) JoinURL(relativeURL string) string {
|
||||
}
|
||||
|
||||
func (r *Response) isHTML() bool {
|
||||
if r.Response == nil {
|
||||
return len(r.Body) != 0
|
||||
}
|
||||
contentType := r.Header.Get("Content-Type")
|
||||
for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} {
|
||||
if strings.Contains(contentType, htmlContentType) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user