JS Rendered requests with Chrome support added
This commit is contained in:
parent
76a687e193
commit
1a7d480b36
@ -6,6 +6,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
|
|||||||
|
|
||||||
## Features
|
## Features
|
||||||
- 1.000+ Requests/Sec
|
- 1.000+ Requests/Sec
|
||||||
|
- JS Rendering
|
||||||
- Caching (Memory/Disk)
|
- Caching (Memory/Disk)
|
||||||
- Automatic Data Exporting (JSON, CSV, or custom)
|
- Automatic Data Exporting (JSON, CSV, or custom)
|
||||||
- Limit Concurrency (Global/Per Domain)
|
- Limit Concurrency (Global/Per Domain)
|
||||||
|
84
geziyor.go
84
geziyor.go
@ -2,7 +2,10 @@ package geziyor
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"context"
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
"github.com/chromedp/cdproto/dom"
|
||||||
|
"github.com/chromedp/chromedp"
|
||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
"golang.org/x/net/html/charset"
|
"golang.org/x/net/html/charset"
|
||||||
"io"
|
"io"
|
||||||
@ -107,6 +110,17 @@ func (g *Geziyor) Get(url string, callback func(resp *Response)) {
|
|||||||
g.Do(&Request{Request: req}, callback)
|
g.Do(&Request{Request: req}, callback)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetRendered issues a GET request to the specified URL using a headless browser.
|
||||||
|
// It opens a new Chrome instance, makes the request, waits 1 second for the HTML DOM to render, and then closes the instance.
|
||||||
|
func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) {
|
||||||
|
req, err := http.NewRequest("GET", url, nil)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Request creating error %v\n", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
g.Do(&Request{Request: req, rendered: true}, callback)
|
||||||
|
}
|
||||||
|
|
||||||
// Head issues a HEAD to the specified URL
|
// Head issues a HEAD to the specified URL
|
||||||
func (g *Geziyor) Head(url string, callback func(resp *Response)) {
|
func (g *Geziyor) Head(url string, callback func(resp *Response)) {
|
||||||
req, err := http.NewRequest("HEAD", url, nil)
|
req, err := http.NewRequest("HEAD", url, nil)
|
||||||
@ -131,14 +145,14 @@ func (g *Geziyor) Do(req *Request, callback func(resp *Response)) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Modify Request
|
// Do request normal or chrome and read response
|
||||||
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
var response *Response
|
||||||
req.Header.Set("Accept-Charset", "utf-8")
|
var err error
|
||||||
req.Header.Set("Accept-Language", "en")
|
if !req.rendered {
|
||||||
req.Header.Set("User-Agent", g.Opt.UserAgent)
|
response, err = g.doRequest(req)
|
||||||
|
} else {
|
||||||
// Do request and read response
|
response, err = g.doRequestChrome(req)
|
||||||
response, err := g.doRequest(req)
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -177,6 +191,12 @@ func (g *Geziyor) doRequest(req *Request) (*Response, error) {
|
|||||||
|
|
||||||
g.delay()
|
g.delay()
|
||||||
|
|
||||||
|
// Modify Request
|
||||||
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||||
|
req.Header.Set("Accept-Charset", "utf-8")
|
||||||
|
req.Header.Set("Accept-Language", "en")
|
||||||
|
req.Header.Set("User-Agent", g.Opt.UserAgent)
|
||||||
|
|
||||||
log.Println("Fetching: ", req.URL.String())
|
log.Println("Fetching: ", req.URL.String())
|
||||||
|
|
||||||
// Do request
|
// Do request
|
||||||
@ -218,6 +238,44 @@ func (g *Geziyor) doRequest(req *Request) (*Response, error) {
|
|||||||
return &response, nil
|
return &response, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
|
||||||
|
g.acquireSem(req)
|
||||||
|
defer g.releaseSem(req)
|
||||||
|
|
||||||
|
g.delay()
|
||||||
|
|
||||||
|
ctx, cancel := chromedp.NewContext(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
var res string
|
||||||
|
|
||||||
|
if err := chromedp.Run(ctx,
|
||||||
|
chromedp.Navigate(req.URL.String()),
|
||||||
|
chromedp.Sleep(1*time.Second),
|
||||||
|
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||||
|
node, err := dom.GetDocument().Do(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
res, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
|
||||||
|
return err
|
||||||
|
}),
|
||||||
|
); err != nil {
|
||||||
|
log.Printf("Request getting rendered error: %v\n", err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
response := &Response{
|
||||||
|
//Response: resp,
|
||||||
|
Body: []byte(res),
|
||||||
|
//Meta: request.Meta,
|
||||||
|
Geziyor: g,
|
||||||
|
Exports: make(chan interface{}),
|
||||||
|
}
|
||||||
|
|
||||||
|
return response, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (g *Geziyor) acquireSem(req *Request) {
|
func (g *Geziyor) acquireSem(req *Request) {
|
||||||
if g.Opt.ConcurrentRequests != 0 {
|
if g.Opt.ConcurrentRequests != 0 {
|
||||||
g.semGlobal <- struct{}{}
|
g.semGlobal <- struct{}{}
|
||||||
@ -255,11 +313,13 @@ func (g *Geziyor) checkURL(parsedURL *url.URL) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check for duplicate requests
|
// Check for duplicate requests
|
||||||
if contains(g.visitedURLS, rawURL) {
|
if !g.Opt.URLRevisitEnabled {
|
||||||
//log.Printf("URL already visited %s\n", rawURL)
|
if contains(g.visitedURLS, rawURL) {
|
||||||
return false
|
//log.Printf("URL already visited %s\n", rawURL)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
g.visitedURLS = append(g.visitedURLS, rawURL)
|
||||||
}
|
}
|
||||||
g.visitedURLS = append(g.visitedURLS, rawURL)
|
|
||||||
|
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
@ -138,3 +138,15 @@ func parseAlmaany(r *geziyor.Response) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGetRendered(t *testing.T) {
|
||||||
|
geziyor.NewGeziyor(geziyor.Options{
|
||||||
|
StartRequestsFunc: func(g *geziyor.Geziyor) {
|
||||||
|
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
|
||||||
|
},
|
||||||
|
ParseFunc: func(r *geziyor.Response) {
|
||||||
|
fmt.Println(string(r.Body))
|
||||||
|
},
|
||||||
|
//URLRevisitEnabled: true,
|
||||||
|
}).Start()
|
||||||
|
}
|
||||||
|
2
go.mod
2
go.mod
@ -4,6 +4,8 @@ go 1.12
|
|||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/PuerkitoBio/goquery v1.5.0
|
github.com/PuerkitoBio/goquery v1.5.0
|
||||||
|
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9
|
||||||
|
github.com/chromedp/chromedp v0.3.0
|
||||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
||||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a
|
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a
|
||||||
golang.org/x/text v0.3.2 // indirect
|
golang.org/x/text v0.3.2 // indirect
|
||||||
|
16
go.sum
16
go.sum
@ -2,11 +2,27 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP
|
|||||||
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
|
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
|
||||||
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
|
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
|
||||||
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||||
|
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 h1:ARnDd2vEk91rLNra8yk1hF40H8z+1HrD6juNpe7FsI0=
|
||||||
|
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9/go.mod h1:xquOK9dIGFlLaIGI4c6IyfLI/Gz0LiYYuJtzhsUODgI=
|
||||||
|
github.com/chromedp/chromedp v0.3.0 h1:7/pwrXFRq6/ym3sxCykm90DMoyw6VKXY48DgGRgUURA=
|
||||||
|
github.com/chromedp/chromedp v0.3.0/go.mod h1:EktsZcC2iycVrRhC9fDmshBpCK9lNnZYi6x2q9uE7zI=
|
||||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
|
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
|
||||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
|
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
|
||||||
|
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0=
|
||||||
|
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo=
|
||||||
|
github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8=
|
||||||
|
github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
|
||||||
|
github.com/gobwas/ws v1.0.0 h1:1WdyfgUcImUfVBvYbsW2krIsnko+1QU2t45soaF8v1M=
|
||||||
|
github.com/gobwas/ws v1.0.0/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
|
||||||
|
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls=
|
||||||
|
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW/cCGiF3nGh5v+9Gd3VCgBQbod/GlMaQ=
|
||||||
|
github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||||
|
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983 h1:wL11wNW7dhKIcRCHSm4sHKPWz0tt4mwBsVodG7+Xyqg=
|
||||||
|
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
|
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
|
||||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||||
|
golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
|
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
|
||||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||||
|
@ -52,4 +52,7 @@ type Options struct {
|
|||||||
|
|
||||||
// Charset Detection disable
|
// Charset Detection disable
|
||||||
CharsetDetectDisabled bool
|
CharsetDetectDisabled bool
|
||||||
|
|
||||||
|
// Revisiting same URLs is disabled by default
|
||||||
|
URLRevisitEnabled bool
|
||||||
}
|
}
|
||||||
|
@ -8,4 +8,6 @@ import (
|
|||||||
type Request struct {
|
type Request struct {
|
||||||
*http.Request
|
*http.Request
|
||||||
Meta map[string]interface{}
|
Meta map[string]interface{}
|
||||||
|
|
||||||
|
rendered bool
|
||||||
}
|
}
|
||||||
|
@ -31,6 +31,9 @@ func (r *Response) JoinURL(relativeURL string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (r *Response) isHTML() bool {
|
func (r *Response) isHTML() bool {
|
||||||
|
if r.Response == nil {
|
||||||
|
return len(r.Body) != 0
|
||||||
|
}
|
||||||
contentType := r.Header.Get("Content-Type")
|
contentType := r.Header.Get("Content-Type")
|
||||||
for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} {
|
for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} {
|
||||||
if strings.Contains(contentType, htmlContentType) {
|
if strings.Contains(contentType, htmlContentType) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user