Response middlewares support implemented.

This commit is contained in:
Musab Gültekin 2019-06-16 18:29:07 +03:00
parent 80383ebd6f
commit e50fa3b1dc
6 changed files with 38 additions and 12 deletions

View File

@ -64,6 +64,10 @@ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for
go get github.com/geziyor/geziyor go get github.com/geziyor/geziyor
**NOTE**: macOS limits the maximum number of open file descriptors.
If you want to make more than 256 concurrent requests, you need to increase this limit.
Read [this](https://wilsonmar.github.io/maximum-limits/) for more.
### Making Requests ### Making Requests
Initial requests start with ```StartURLs []string``` field in ```Options```. Initial requests start with ```StartURLs []string``` field in ```Options```.

View File

@ -33,19 +33,16 @@ func (e *CSVExporter) Export(exports chan interface{}) {
// Detect type and extract CSV values // Detect type and extract CSV values
val := reflect.ValueOf(res) val := reflect.ValueOf(res)
switch val.Kind() { switch val.Kind() {
case reflect.Slice: case reflect.Slice:
for i := 0; i < val.Len(); i++ { for i := 0; i < val.Len(); i++ {
values = append(values, fmt.Sprint(val.Index(i))) values = append(values, fmt.Sprint(val.Index(i)))
} }
//case reflect.Map: //case reflect.Map:
// iter := val.MapRange() // iter := val.MapRange()
// for iter.Next() { // for iter.Next() {
// values = append(values, fmt.Sprint(iter.Value())) // values = append(values, fmt.Sprint(iter.Value()))
// } // }
} }
if err := writer.Write(values); err != nil { if err := writer.Write(values); err != nil {
log.Printf("CSV writing error on exporter: %v\n", err) log.Printf("CSV writing error on exporter: %v\n", err)
} }

View File

@ -1,9 +1,7 @@
package geziyor package geziyor
import ( import (
"bytes"
"context" "context"
"github.com/PuerkitoBio/goquery"
"github.com/chromedp/cdproto/dom" "github.com/chromedp/cdproto/dom"
"github.com/chromedp/chromedp" "github.com/chromedp/chromedp"
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
@ -39,6 +37,7 @@ type Geziyor struct {
} }
visitedURLs sync.Map visitedURLs sync.Map
requestMiddlewares []RequestMiddleware requestMiddlewares []RequestMiddleware
responseMiddlewares []ResponseMiddleware
} }
func init() { func init() {
@ -58,6 +57,9 @@ func NewGeziyor(opt Options) *Geziyor {
duplicateRequestsMiddleware, duplicateRequestsMiddleware,
defaultHeadersMiddleware, defaultHeadersMiddleware,
}, },
responseMiddlewares: []ResponseMiddleware{
parseHTMLMiddleware,
},
} }
if opt.UserAgent == "" { if opt.UserAgent == "" {
@ -86,6 +88,7 @@ func NewGeziyor(opt Options) *Geziyor {
log.SetOutput(ioutil.Discard) log.SetOutput(ioutil.Discard)
} }
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...) geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)
return geziyor return geziyor
} }
@ -186,8 +189,8 @@ func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
return return
} }
if !g.Opt.ParseHTMLDisabled && response.isHTML() { for _, middlewareFunc := range g.responseMiddlewares {
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body)) middlewareFunc(g, response)
} }
// Callbacks // Callbacks
@ -240,6 +243,7 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
Response: resp, Response: resp,
Body: body, Body: body,
Meta: req.Meta, Meta: req.Meta,
Request: req,
} }
return &response, nil return &response, nil
@ -276,6 +280,7 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
//Response: resp, //Response: resp,
Body: []byte(res), Body: []byte(res),
Meta: req.Meta, Meta: req.Meta,
Request: req,
} }
return &response, nil return &response, nil

View File

@ -1,11 +1,19 @@
package geziyor package geziyor
import "github.com/geziyor/geziyor/internal" import (
"bytes"
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/internal"
)
// RequestMiddleware called before requests made. // RequestMiddleware called before requests made.
// Set request.Cancelled = true to cancel request // Set request.Cancelled = true to cancel request
type RequestMiddleware func(g *Geziyor, r *Request) type RequestMiddleware func(g *Geziyor, r *Request)
// ResponseMiddleware is called after a response has been received.
// It is given the Geziyor instance and the response, and may inspect or
// modify the response in place.
type ResponseMiddleware func(g *Geziyor, r *Response)
// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains // allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
func allowedDomainsMiddleware(g *Geziyor, r *Request) { func allowedDomainsMiddleware(g *Geziyor, r *Request) {
if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) { if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
@ -33,3 +41,11 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) {
r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en") r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en")
r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent) r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
} }
// parseHTMLMiddleware is a response middleware that parses HTML responses
// into a goquery document.
//
// If HTML parsing has not been disabled through g.Opt.ParseHTMLDisabled and
// r.isHTML() reports true, r.Body is parsed and the result is stored in
// r.DocHTML. In every other case the response is left untouched.
func parseHTMLMiddleware(g *Geziyor, r *Response) {
	if g.Opt.ParseHTMLDisabled || !r.isHTML() {
		return
	}

	// Best effort: the parse error is deliberately discarded, so consumers
	// should be prepared for a nil DocHTML (presumably what goquery returns
	// when parsing fails; confirm against the goquery documentation).
	r.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
}

View File

@ -50,6 +50,9 @@ type Options struct {
// Called before requests made to manipulate requests // Called before requests made to manipulate requests
RequestMiddlewares []RequestMiddleware RequestMiddlewares []RequestMiddleware
// Called after response received
ResponseMiddlewares []ResponseMiddleware
// Max body reading size in bytes. Default: 1GB // Max body reading size in bytes. Default: 1GB
MaxBodySize int64 MaxBodySize int64

View File

@ -14,6 +14,7 @@ type Response struct {
Body []byte Body []byte
DocHTML *goquery.Document DocHTML *goquery.Document
Meta map[string]interface{} Meta map[string]interface{}
Request *Request
} }
// JoinURL joins base response URL and provided relative URL. // JoinURL joins base response URL and provided relative URL.