JS Rendered requests with Chrome support added

This commit is contained in:
Musab Gültekin 2019-06-13 22:08:45 +03:00
parent 76a687e193
commit 1a7d480b36
8 changed files with 111 additions and 12 deletions

View File

@ -6,6 +6,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
## Features ## Features
- 1.000+ Requests/Sec - 1.000+ Requests/Sec
- JS Rendering
- Caching (Memory/Disk) - Caching (Memory/Disk)
- Automatic Data Exporting (JSON, CSV, or custom) - Automatic Data Exporting (JSON, CSV, or custom)
- Limit Concurrency (Global/Per Domain) - Limit Concurrency (Global/Per Domain)

View File

@ -2,7 +2,10 @@ package geziyor
import ( import (
"bytes" "bytes"
"context"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/chromedp/cdproto/dom"
"github.com/chromedp/chromedp"
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"golang.org/x/net/html/charset" "golang.org/x/net/html/charset"
"io" "io"
@ -107,6 +110,17 @@ func (g *Geziyor) Get(url string, callback func(resp *Response)) {
g.Do(&Request{Request: req}, callback) g.Do(&Request{Request: req}, callback)
} }
// GetRendered issues a GET request for url that is fetched through a headless
// browser instead of the plain HTTP client.
//
// The request is flagged as rendered, so Do hands it to the Chrome-backed
// fetch path, which navigates to the URL, waits one second for the page to
// render, and uses the resulting document HTML as the response body.
// If the request cannot be built, the error is logged and callback is never
// invoked.
func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) {
	httpReq, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("Request creating error %v\n", err)
		return
	}

	renderedReq := &Request{
		Request:  httpReq,
		rendered: true,
	}
	g.Do(renderedReq, callback)
}
// Head issues a HEAD to the specified URL // Head issues a HEAD to the specified URL
func (g *Geziyor) Head(url string, callback func(resp *Response)) { func (g *Geziyor) Head(url string, callback func(resp *Response)) {
req, err := http.NewRequest("HEAD", url, nil) req, err := http.NewRequest("HEAD", url, nil)
@ -131,14 +145,14 @@ func (g *Geziyor) Do(req *Request, callback func(resp *Response)) {
return return
} }
// Modify Request // Do request normal or chrome and read response
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") var response *Response
req.Header.Set("Accept-Charset", "utf-8") var err error
req.Header.Set("Accept-Language", "en") if !req.rendered {
req.Header.Set("User-Agent", g.Opt.UserAgent) response, err = g.doRequest(req)
} else {
// Do request and read response response, err = g.doRequestChrome(req)
response, err := g.doRequest(req) }
if err != nil { if err != nil {
return return
} }
@ -177,6 +191,12 @@ func (g *Geziyor) doRequest(req *Request) (*Response, error) {
g.delay() g.delay()
// Modify Request
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Charset", "utf-8")
req.Header.Set("Accept-Language", "en")
req.Header.Set("User-Agent", g.Opt.UserAgent)
log.Println("Fetching: ", req.URL.String()) log.Println("Fetching: ", req.URL.String())
// Do request // Do request
@ -218,6 +238,44 @@ func (g *Geziyor) doRequest(req *Request) (*Response, error) {
return &response, nil return &response, nil
} }
// doRequestChrome fetches req's URL through chromedp and returns the rendered
// document HTML as the response body.
//
// It holds the concurrency semaphore(s) taken by acquireSem for the whole
// fetch and applies g.delay() before starting. A fresh chromedp context is
// created per call and cancelled on return.
//
// The returned Response has no underlying *http.Response (the embedded
// Response field stays nil), because the page is loaded by the browser rather
// than by net/http; only Body, Meta, Geziyor and Exports are populated.
// On failure the error is logged and returned with a nil Response.
func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
	g.acquireSem(req)
	defer g.releaseSem(req)

	g.delay()

	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	var res string
	if err := chromedp.Run(ctx,
		chromedp.Navigate(req.URL.String()),
		// Fixed one-second pause to give page scripts time to build the DOM.
		chromedp.Sleep(1*time.Second),
		chromedp.ActionFunc(func(ctx context.Context) error {
			// Serialize the whole document starting from its root node.
			node, err := dom.GetDocument().Do(ctx)
			if err != nil {
				return err
			}
			res, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
			return err
		}),
	); err != nil {
		log.Printf("Request getting rendered error: %v\n", err)
		return nil, err
	}

	response := &Response{
		// Response is intentionally left nil: there is no *http.Response
		// for a page loaded through the browser.
		Body: []byte(res),
		// Carry the request's metadata over so callbacks can read it on the
		// rendered path as well.
		Meta:    req.Meta,
		Geziyor: g,
		Exports: make(chan interface{}),
	}
	return response, nil
}
func (g *Geziyor) acquireSem(req *Request) { func (g *Geziyor) acquireSem(req *Request) {
if g.Opt.ConcurrentRequests != 0 { if g.Opt.ConcurrentRequests != 0 {
g.semGlobal <- struct{}{} g.semGlobal <- struct{}{}
@ -255,11 +313,13 @@ func (g *Geziyor) checkURL(parsedURL *url.URL) bool {
} }
// Check for duplicate requests // Check for duplicate requests
if contains(g.visitedURLS, rawURL) { if !g.Opt.URLRevisitEnabled {
//log.Printf("URL already visited %s\n", rawURL) if contains(g.visitedURLS, rawURL) {
return false //log.Printf("URL already visited %s\n", rawURL)
return false
}
g.visitedURLS = append(g.visitedURLS, rawURL)
} }
g.visitedURLS = append(g.visitedURLS, rawURL)
return true return true
} }

View File

@ -138,3 +138,15 @@ func parseAlmaany(r *geziyor.Response) {
} }
} }
} }
// TestGetRendered runs a crawler whose only start request is a rendered GET
// to httpbin and prints the body handed to the parse callback.
// It makes no assertions; it only exercises the rendered request path.
func TestGetRendered(t *testing.T) {
	opts := geziyor.Options{
		StartRequestsFunc: func(g *geziyor.Geziyor) {
			g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
		},
		ParseFunc: func(r *geziyor.Response) {
			fmt.Println(string(r.Body))
		},
		//URLRevisitEnabled: true,
	}

	crawler := geziyor.NewGeziyor(opts)
	crawler.Start()
}

2
go.mod
View File

@ -4,6 +4,8 @@ go 1.12
require ( require (
github.com/PuerkitoBio/goquery v1.5.0 github.com/PuerkitoBio/goquery v1.5.0
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9
github.com/chromedp/chromedp v0.3.0
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a golang.org/x/net v0.0.0-20181114220301-adae6a3d119a
golang.org/x/text v0.3.2 // indirect golang.org/x/text v0.3.2 // indirect

16
go.sum
View File

@ -2,11 +2,27 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 h1:ARnDd2vEk91rLNra8yk1hF40H8z+1HrD6juNpe7FsI0=
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9/go.mod h1:xquOK9dIGFlLaIGI4c6IyfLI/Gz0LiYYuJtzhsUODgI=
github.com/chromedp/chromedp v0.3.0 h1:7/pwrXFRq6/ym3sxCykm90DMoyw6VKXY48DgGRgUURA=
github.com/chromedp/chromedp v0.3.0/go.mod h1:EktsZcC2iycVrRhC9fDmshBpCK9lNnZYi6x2q9uE7zI=
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0=
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo=
github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8=
github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.0.0 h1:1WdyfgUcImUfVBvYbsW2krIsnko+1QU2t45soaF8v1M=
github.com/gobwas/ws v1.0.0/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls=
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW/cCGiF3nGh5v+9Gd3VCgBQbod/GlMaQ=
github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983 h1:wL11wNW7dhKIcRCHSm4sHKPWz0tt4mwBsVodG7+Xyqg=
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

View File

@ -52,4 +52,7 @@ type Options struct {
// Charset Detection disable // Charset Detection disable
CharsetDetectDisabled bool CharsetDetectDisabled bool
// Revisiting same URLs is disabled by default
URLRevisitEnabled bool
} }

View File

@ -8,4 +8,6 @@ import (
type Request struct { type Request struct {
*http.Request *http.Request
Meta map[string]interface{} Meta map[string]interface{}
rendered bool
} }

View File

@ -31,6 +31,9 @@ func (r *Response) JoinURL(relativeURL string) string {
} }
func (r *Response) isHTML() bool { func (r *Response) isHTML() bool {
if r.Response == nil {
return len(r.Body) != 0
}
contentType := r.Header.Get("Content-Type") contentType := r.Header.Get("Content-Type")
for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} { for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} {
if strings.Contains(contentType, htmlContentType) { if strings.Contains(contentType, htmlContentType) {