Making Requests and reading responses refactored to client package.
This commit is contained in:
parent
0eac5f5f40
commit
ec4551a8a0
117
client/client.go
117
client/client.go
@ -1,7 +1,14 @@
|
||||
package client
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"context"
|
||||
"github.com/chromedp/cdproto/dom"
|
||||
"github.com/chromedp/cdproto/network"
|
||||
"github.com/chromedp/chromedp"
|
||||
"github.com/pkg/errors"
|
||||
"golang.org/x/net/html/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
@ -39,6 +46,114 @@ func NewClient() *Client {
|
||||
return &Client{Client: client}
|
||||
}
|
||||
|
||||
// DoRequest selects appropriate request handler, client or Chrome
|
||||
func (c *Client) DoRequest(req *Request, maxBodySize int64, charsetDetectDisabled bool) (*Response, error) {
|
||||
if !req.Rendered {
|
||||
return c.DoRequestClient(req, maxBodySize, charsetDetectDisabled)
|
||||
} else {
|
||||
return c.DoRequestChrome(req)
|
||||
}
|
||||
}
|
||||
|
||||
// DoRequestClient is a simple wrapper to read response according to options.
|
||||
func (c *Client) DoRequestClient(req *Request, maxBodySize int64, charsetDetectDisabled bool) (*Response, error) {
|
||||
// Do request
|
||||
resp, err := c.Do(req.Request)
|
||||
if resp != nil {
|
||||
defer resp.Body.Close()
|
||||
}
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Response error")
|
||||
}
|
||||
|
||||
// Limit response body reading
|
||||
bodyReader := io.LimitReader(resp.Body, maxBodySize)
|
||||
|
||||
// Start reading body and determine encoding
|
||||
if !charsetDetectDisabled && resp.Request.Method != "HEAD" {
|
||||
bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Determine encoding error")
|
||||
}
|
||||
}
|
||||
|
||||
body, err := ioutil.ReadAll(bodyReader)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Reading body error")
|
||||
}
|
||||
|
||||
response := Response{
|
||||
Response: resp,
|
||||
Body: body,
|
||||
Meta: req.Meta,
|
||||
Request: req,
|
||||
}
|
||||
|
||||
return &response, nil
|
||||
}
|
||||
|
||||
// DoRequestChrome opens up a new chrome instance and makes request
|
||||
func (c *Client) DoRequestChrome(req *Request) (*Response, error) {
|
||||
var body string
|
||||
var reqID network.RequestID
|
||||
var res *network.Response
|
||||
|
||||
ctx, cancel := chromedp.NewContext(context.Background())
|
||||
defer cancel()
|
||||
|
||||
if err := chromedp.Run(ctx,
|
||||
network.Enable(),
|
||||
network.SetExtraHTTPHeaders(network.Headers(ConvertHeaderToMap(req.Header))),
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
chromedp.ListenTarget(ctx, func(ev interface{}) {
|
||||
switch ev.(type) {
|
||||
case *network.EventRequestWillBeSent:
|
||||
reqEvent := ev.(*network.EventRequestWillBeSent)
|
||||
if _, exists := reqEvent.Request.Headers["Referer"]; !exists {
|
||||
reqID = reqEvent.RequestID
|
||||
}
|
||||
//if reqEvent := ev.(*network.EventRequestWillBeSent); reqEvent.Request.URL == req.URL.String() {
|
||||
// reqID = reqEvent.RequestID
|
||||
//}
|
||||
case *network.EventResponseReceived:
|
||||
if resEvent := ev.(*network.EventResponseReceived); resEvent.RequestID == reqID {
|
||||
res = resEvent.Response
|
||||
}
|
||||
}
|
||||
})
|
||||
return nil
|
||||
}),
|
||||
chromedp.Navigate(req.URL.String()),
|
||||
chromedp.WaitReady(":root"),
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
node, err := dom.GetDocument().Do(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
body, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
|
||||
return err
|
||||
}),
|
||||
); err != nil {
|
||||
return nil, errors.Wrap(err, "Request getting rendered error")
|
||||
}
|
||||
|
||||
// Set new URL in case of redirection
|
||||
req.URL, _ = url.Parse(res.URL)
|
||||
|
||||
response := Response{
|
||||
Response: &http.Response{
|
||||
Request: req.Request,
|
||||
StatusCode: int(res.Status),
|
||||
Header: ConvertMapToHeader(res.Headers),
|
||||
},
|
||||
Body: []byte(body),
|
||||
Meta: req.Meta,
|
||||
Request: req,
|
||||
}
|
||||
|
||||
return &response, nil
|
||||
}
|
||||
|
||||
// SetCookies handles the receipt of the cookies in a reply for the given URL
|
||||
func (c *Client) SetCookies(URL string, cookies []*http.Cookie) error {
|
||||
if c.Jar == nil {
|
||||
|
121
geziyor.go
121
geziyor.go
@ -1,23 +1,15 @@
|
||||
package geziyor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/chromedp/cdproto/dom"
|
||||
"github.com/chromedp/cdproto/network"
|
||||
"github.com/chromedp/chromedp"
|
||||
"github.com/fpfeng/httpcache"
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"github.com/geziyor/geziyor/metrics"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"golang.org/x/net/html/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/http/cookiejar"
|
||||
"net/url"
|
||||
"sync"
|
||||
)
|
||||
|
||||
@ -214,131 +206,26 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
|
||||
}
|
||||
}
|
||||
|
||||
// Do request normal or Chrome and read response
|
||||
var response *client.Response
|
||||
var err error
|
||||
if !req.Rendered {
|
||||
response, err = g.doRequestClient(req)
|
||||
} else {
|
||||
response, err = g.doRequestChrome(req)
|
||||
}
|
||||
res, err := g.Client.DoRequest(req, g.Opt.MaxBodySize, g.Opt.CharsetDetectDisabled)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
for _, middlewareFunc := range g.responseMiddlewares {
|
||||
middlewareFunc(g, response)
|
||||
middlewareFunc(g, res)
|
||||
}
|
||||
|
||||
// Callbacks
|
||||
if callback != nil {
|
||||
callback(g, response)
|
||||
callback(g, res)
|
||||
} else {
|
||||
if g.Opt.ParseFunc != nil {
|
||||
g.Opt.ParseFunc(g, response)
|
||||
g.Opt.ParseFunc(g, res)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (g *Geziyor) doRequestClient(req *client.Request) (*client.Response, error) {
|
||||
|
||||
// Do request
|
||||
resp, err := g.Client.Do(req.Request)
|
||||
if resp != nil {
|
||||
defer resp.Body.Close()
|
||||
}
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Response error")
|
||||
}
|
||||
|
||||
// Limit response body reading
|
||||
bodyReader := io.LimitReader(resp.Body, g.Opt.MaxBodySize)
|
||||
|
||||
// Start reading body and determine encoding
|
||||
if !g.Opt.CharsetDetectDisabled && resp.Request.Method != "HEAD" {
|
||||
bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Determine encoding error")
|
||||
}
|
||||
}
|
||||
|
||||
body, err := ioutil.ReadAll(bodyReader)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Reading body error")
|
||||
}
|
||||
|
||||
response := client.Response{
|
||||
Response: resp,
|
||||
Body: body,
|
||||
Meta: req.Meta,
|
||||
Request: req,
|
||||
}
|
||||
|
||||
return &response, nil
|
||||
}
|
||||
|
||||
func (g *Geziyor) doRequestChrome(req *client.Request) (*client.Response, error) {
|
||||
var body string
|
||||
var reqID network.RequestID
|
||||
var res *network.Response
|
||||
|
||||
ctx, cancel := chromedp.NewContext(context.Background())
|
||||
defer cancel()
|
||||
|
||||
if err := chromedp.Run(ctx,
|
||||
network.Enable(),
|
||||
network.SetExtraHTTPHeaders(network.Headers(client.ConvertHeaderToMap(req.Header))),
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
chromedp.ListenTarget(ctx, func(ev interface{}) {
|
||||
switch ev.(type) {
|
||||
case *network.EventRequestWillBeSent:
|
||||
reqEvent := ev.(*network.EventRequestWillBeSent)
|
||||
if _, exists := reqEvent.Request.Headers["Referer"]; !exists {
|
||||
reqID = reqEvent.RequestID
|
||||
}
|
||||
//if reqEvent := ev.(*network.EventRequestWillBeSent); reqEvent.Request.URL == req.URL.String() {
|
||||
// reqID = reqEvent.RequestID
|
||||
//}
|
||||
case *network.EventResponseReceived:
|
||||
if resEvent := ev.(*network.EventResponseReceived); resEvent.RequestID == reqID {
|
||||
res = resEvent.Response
|
||||
}
|
||||
}
|
||||
})
|
||||
return nil
|
||||
}),
|
||||
chromedp.Navigate(req.URL.String()),
|
||||
chromedp.WaitReady(":root"),
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
node, err := dom.GetDocument().Do(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
body, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
|
||||
return err
|
||||
}),
|
||||
); err != nil {
|
||||
return nil, errors.Wrap(err, "Request getting rendered error")
|
||||
}
|
||||
|
||||
// Set new URL in case of redirection
|
||||
req.URL, _ = url.Parse(res.URL)
|
||||
|
||||
response := client.Response{
|
||||
Response: &http.Response{
|
||||
Request: req.Request,
|
||||
StatusCode: int(res.Status),
|
||||
Header: client.ConvertMapToHeader(res.Headers),
|
||||
},
|
||||
Body: []byte(body),
|
||||
Meta: req.Meta,
|
||||
Request: req,
|
||||
}
|
||||
|
||||
return &response, nil
|
||||
}
|
||||
|
||||
func (g *Geziyor) acquireSem(req *client.Request) {
|
||||
if g.Opt.ConcurrentRequests != 0 {
|
||||
g.semGlobal <- struct{}{}
|
||||
|
@ -68,6 +68,9 @@ func quotesParse(g *geziyor.Geziyor, r *client.Response) {
|
||||
}
|
||||
|
||||
func TestAllLinks(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping test in short mode.")
|
||||
}
|
||||
defer leaktest.Check(t)()
|
||||
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
|
Loading…
x
Reference in New Issue
Block a user