Response middlewares support implemented.

This commit is contained in:
Musab Gültekin 2019-06-16 18:29:07 +03:00
parent 80383ebd6f
commit e50fa3b1dc
6 changed files with 38 additions and 12 deletions

View File

@ -64,6 +64,10 @@ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for
go get github.com/geziyor/geziyor
**NOTE**: macOS limits the maximum number of open file descriptors.
If you want to make more than 256 concurrent requests, you need to increase that limit.
Read [this](https://wilsonmar.github.io/maximum-limits/) for more.
### Making Requests
Initial requests start with the ```StartURLs []string``` field in ```Options```.

View File

@ -33,19 +33,16 @@ func (e *CSVExporter) Export(exports chan interface{}) {
// Detect type and extract CSV values
val := reflect.ValueOf(res)
switch val.Kind() {
case reflect.Slice:
for i := 0; i < val.Len(); i++ {
values = append(values, fmt.Sprint(val.Index(i)))
}
//case reflect.Map:
// iter := val.MapRange()
// for iter.Next() {
// values = append(values, fmt.Sprint(iter.Value()))
// }
}
if err := writer.Write(values); err != nil {
log.Printf("CSV writing error on exporter: %v\n", err)
}

View File

@ -1,9 +1,7 @@
package geziyor
import (
"bytes"
"context"
"github.com/PuerkitoBio/goquery"
"github.com/chromedp/cdproto/dom"
"github.com/chromedp/chromedp"
"github.com/fpfeng/httpcache"
@ -39,6 +37,7 @@ type Geziyor struct {
}
visitedURLs sync.Map
requestMiddlewares []RequestMiddleware
responseMiddlewares []ResponseMiddleware
}
func init() {
@ -58,6 +57,9 @@ func NewGeziyor(opt Options) *Geziyor {
duplicateRequestsMiddleware,
defaultHeadersMiddleware,
},
responseMiddlewares: []ResponseMiddleware{
parseHTMLMiddleware,
},
}
if opt.UserAgent == "" {
@ -86,6 +88,7 @@ func NewGeziyor(opt Options) *Geziyor {
log.SetOutput(ioutil.Discard)
}
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)
return geziyor
}
@ -186,8 +189,8 @@ func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
return
}
if !g.Opt.ParseHTMLDisabled && response.isHTML() {
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body))
for _, middlewareFunc := range g.responseMiddlewares {
middlewareFunc(g, response)
}
// Callbacks
@ -240,6 +243,7 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
Response: resp,
Body: body,
Meta: req.Meta,
Request: req,
}
return &response, nil
@ -276,6 +280,7 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
//Response: resp,
Body: []byte(res),
Meta: req.Meta,
Request: req,
}
return &response, nil

View File

@ -1,11 +1,19 @@
package geziyor
import "github.com/geziyor/geziyor/internal"
import (
"bytes"
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/internal"
)
// RequestMiddleware is called before a request is made.
// Set request.Cancelled = true to cancel the request.
// Middlewares supplied via Options.RequestMiddlewares are appended after the
// built-in ones in NewGeziyor.
type RequestMiddleware func(g *Geziyor, r *Request)
// ResponseMiddleware is called after a response has been received.
// Middlewares supplied via Options.ResponseMiddlewares are appended after the
// built-in ones in NewGeziyor.
type ResponseMiddleware func(g *Geziyor, r *Response)
// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
func allowedDomainsMiddleware(g *Geziyor, r *Request) {
if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
@ -33,3 +41,11 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) {
r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en")
r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
}
// parseHTMLMiddleware is the built-in response middleware that parses an HTML
// response body into a goquery document and stores it in r.DocHTML.
//
// Parsing is skipped when Options.ParseHTMLDisabled is set or when the
// response does not identify itself as HTML (r.isHTML).
func parseHTMLMiddleware(g *Geziyor, r *Response) {
	if g.Opt.ParseHTMLDisabled || !r.isHTML() {
		return
	}
	// The parse error is deliberately discarded, as before: DocHTML is set to
	// whatever goquery returns, and callers are expected to check it.
	r.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
}

View File

@ -50,6 +50,9 @@ type Options struct {
// Called before requests are made, to manipulate requests
RequestMiddlewares []RequestMiddleware
// Called after a response is received
ResponseMiddlewares []ResponseMiddleware
// Max body reading size in bytes. Default: 1GB
MaxBodySize int64

View File

@ -14,6 +14,7 @@ type Response struct {
Body []byte
DocHTML *goquery.Document
Meta map[string]interface{}
Request *Request
}
// JoinURL joins base response URL and provided relative URL.