From 1a7d480b368a21f7056b1965c74ccb8a35942ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Thu, 13 Jun 2019 22:08:45 +0300 Subject: [PATCH] JS Rendered requests with Chrome support added --- README.md | 1 + geziyor.go | 84 ++++++++++++++++++++++++++++++++++++++++++------- geziyor_test.go | 12 +++++++ go.mod | 2 ++ go.sum | 16 ++++++++++ options.go | 3 ++ request.go | 2 ++ response.go | 3 ++ 8 files changed, 111 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b268ddd..3e8b777 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl ## Features - 1.000+ Requests/Sec +- JS Rendering - Caching (Memory/Disk) - Automatic Data Exporting (JSON, CSV, or custom) - Limit Concurrency (Global/Per Domain) diff --git a/geziyor.go b/geziyor.go index fafb558..76d0e6f 100644 --- a/geziyor.go +++ b/geziyor.go @@ -2,7 +2,10 @@ package geziyor import ( "bytes" + "context" "github.com/PuerkitoBio/goquery" + "github.com/chromedp/cdproto/dom" + "github.com/chromedp/chromedp" "github.com/fpfeng/httpcache" "golang.org/x/net/html/charset" "io" @@ -107,6 +110,17 @@ func (g *Geziyor) Get(url string, callback func(resp *Response)) { g.Do(&Request{Request: req}, callback) } +// GetRendered issues GET request using headless browser +// Opens up a new Chrome instance, makes request, waits for 1 second to render HTML DOM and closes it. 
+func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) { + req, err := http.NewRequest("GET", url, nil) + if err != nil { + log.Printf("Request creating error %v\n", err) + return + } + g.Do(&Request{Request: req, rendered: true}, callback) +} + // Head issues a HEAD to the specified URL func (g *Geziyor) Head(url string, callback func(resp *Response)) { req, err := http.NewRequest("HEAD", url, nil) @@ -131,14 +145,14 @@ func (g *Geziyor) Do(req *Request, callback func(resp *Response)) { return } - // Modify Request - req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") - req.Header.Set("Accept-Charset", "utf-8") - req.Header.Set("Accept-Language", "en") - req.Header.Set("User-Agent", g.Opt.UserAgent) - - // Do request and read response - response, err := g.doRequest(req) + // Do request normal or chrome and read response + var response *Response + var err error + if !req.rendered { + response, err = g.doRequest(req) + } else { + response, err = g.doRequestChrome(req) + } if err != nil { return } @@ -177,6 +191,12 @@ func (g *Geziyor) doRequest(req *Request) (*Response, error) { g.delay() + // Modify Request + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + req.Header.Set("Accept-Charset", "utf-8") + req.Header.Set("Accept-Language", "en") + req.Header.Set("User-Agent", g.Opt.UserAgent) + log.Println("Fetching: ", req.URL.String()) // Do request @@ -218,6 +238,44 @@ func (g *Geziyor) doRequest(req *Request) (*Response, error) { return &response, nil } +func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) { + g.acquireSem(req) + defer g.releaseSem(req) + + g.delay() + + ctx, cancel := chromedp.NewContext(context.Background()) + defer cancel() + + var res string + + if err := chromedp.Run(ctx, + chromedp.Navigate(req.URL.String()), + chromedp.Sleep(1*time.Second), + chromedp.ActionFunc(func(ctx context.Context) error { + node, err := 
dom.GetDocument().Do(ctx) + if err != nil { + return err + } + res, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx) + return err + }), + ); err != nil { + log.Printf("Request getting rendered error: %v\n", err) + return nil, err + } + + response := &Response{ + //Response: resp, + Body: []byte(res), + //Meta: request.Meta, + Geziyor: g, + Exports: make(chan interface{}), + } + + return response, nil +} + func (g *Geziyor) acquireSem(req *Request) { if g.Opt.ConcurrentRequests != 0 { g.semGlobal <- struct{}{} @@ -255,11 +313,13 @@ func (g *Geziyor) checkURL(parsedURL *url.URL) bool { } // Check for duplicate requests - if contains(g.visitedURLS, rawURL) { - //log.Printf("URL already visited %s\n", rawURL) - return false + if !g.Opt.URLRevisitEnabled { + if contains(g.visitedURLS, rawURL) { + //log.Printf("URL already visited %s\n", rawURL) + return false + } + g.visitedURLS = append(g.visitedURLS, rawURL) } - g.visitedURLS = append(g.visitedURLS, rawURL) return true } diff --git a/geziyor_test.go b/geziyor_test.go index acb576a..44f17fd 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -138,3 +138,15 @@ func parseAlmaany(r *geziyor.Response) { } } } + +func TestGetRendered(t *testing.T) { + geziyor.NewGeziyor(geziyor.Options{ + StartRequestsFunc: func(g *geziyor.Geziyor) { + g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc) + }, + ParseFunc: func(r *geziyor.Response) { + fmt.Println(string(r.Body)) + }, + //URLRevisitEnabled: true, + }).Start() +} diff --git a/go.mod b/go.mod index 7f5b3ba..fcde546 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,8 @@ go 1.12 require ( github.com/PuerkitoBio/goquery v1.5.0 + github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 + github.com/chromedp/chromedp v0.3.0 github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 golang.org/x/net v0.0.0-20181114220301-adae6a3d119a golang.org/x/text v0.3.2 // indirect diff --git a/go.sum b/go.sum index 6c455f9..10868b5 100644 --- a/go.sum +++ b/go.sum @@ 
-2,11 +2,27 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 h1:ARnDd2vEk91rLNra8yk1hF40H8z+1HrD6juNpe7FsI0= +github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9/go.mod h1:xquOK9dIGFlLaIGI4c6IyfLI/Gz0LiYYuJtzhsUODgI= +github.com/chromedp/chromedp v0.3.0 h1:7/pwrXFRq6/ym3sxCykm90DMoyw6VKXY48DgGRgUURA= +github.com/chromedp/chromedp v0.3.0/go.mod h1:EktsZcC2iycVrRhC9fDmshBpCK9lNnZYi6x2q9uE7zI= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ= github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8= +github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0= +github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo= +github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8= +github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= +github.com/gobwas/ws v1.0.0 h1:1WdyfgUcImUfVBvYbsW2krIsnko+1QU2t45soaF8v1M= +github.com/gobwas/ws v1.0.0/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= +github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls= +github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW/cCGiF3nGh5v+9Gd3VCgBQbod/GlMaQ= +github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983 
h1:wL11wNW7dhKIcRCHSm4sHKPWz0tt4mwBsVodG7+Xyqg= +github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/options.go b/options.go index 95e411f..7aa5523 100644 --- a/options.go +++ b/options.go @@ -52,4 +52,7 @@ type Options struct { // Charset Detection disable CharsetDetectDisabled bool + + // Revisiting same URLs is disabled by default + URLRevisitEnabled bool } diff --git a/request.go b/request.go index 192326b..b9c72dc 100644 --- a/request.go +++ b/request.go @@ -8,4 +8,6 @@ import ( type Request struct { *http.Request Meta map[string]interface{} + + rendered bool } diff --git a/response.go b/response.go index a7d4c11..c95eb01 100644 --- a/response.go +++ b/response.go @@ -31,6 +31,9 @@ func (r *Response) JoinURL(relativeURL string) string { } func (r *Response) isHTML() bool { + if r.Response == nil { + return len(r.Body) != 0 + } contentType := r.Header.Get("Content-Type") for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} { if strings.Contains(contentType, htmlContentType) {