Response middlewares support implemented.
This commit is contained in:
parent
80383ebd6f
commit
e50fa3b1dc
@ -64,6 +64,10 @@ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for
|
||||
|
||||
go get github.com/geziyor/geziyor
|
||||
|
||||
**NOTE**: macOS limits the maximum number of open file descriptors.
|
||||
If you want to make more than 256 concurrent requests, you need to increase this limit.
|
||||
Read [this](https://wilsonmar.github.io/maximum-limits/) for more.
|
||||
|
||||
### Making Requests
|
||||
|
||||
Initial requests start with ```StartURLs []string``` field in ```Options```.
|
||||
|
@ -33,19 +33,16 @@ func (e *CSVExporter) Export(exports chan interface{}) {
|
||||
// Detect type and extract CSV values
|
||||
val := reflect.ValueOf(res)
|
||||
switch val.Kind() {
|
||||
|
||||
case reflect.Slice:
|
||||
for i := 0; i < val.Len(); i++ {
|
||||
values = append(values, fmt.Sprint(val.Index(i)))
|
||||
}
|
||||
|
||||
//case reflect.Map:
|
||||
// iter := val.MapRange()
|
||||
// for iter.Next() {
|
||||
// values = append(values, fmt.Sprint(iter.Value()))
|
||||
// }
|
||||
}
|
||||
|
||||
if err := writer.Write(values); err != nil {
|
||||
log.Printf("CSV writing error on exporter: %v\n", err)
|
||||
}
|
||||
|
21
geziyor.go
21
geziyor.go
@ -1,9 +1,7 @@
|
||||
package geziyor
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/chromedp/cdproto/dom"
|
||||
"github.com/chromedp/chromedp"
|
||||
"github.com/fpfeng/httpcache"
|
||||
@ -37,8 +35,9 @@ type Geziyor struct {
|
||||
sync.RWMutex
|
||||
hostSems map[string]chan struct{}
|
||||
}
|
||||
visitedURLs sync.Map
|
||||
requestMiddlewares []RequestMiddleware
|
||||
visitedURLs sync.Map
|
||||
requestMiddlewares []RequestMiddleware
|
||||
responseMiddlewares []ResponseMiddleware
|
||||
}
|
||||
|
||||
func init() {
|
||||
@ -58,6 +57,9 @@ func NewGeziyor(opt Options) *Geziyor {
|
||||
duplicateRequestsMiddleware,
|
||||
defaultHeadersMiddleware,
|
||||
},
|
||||
responseMiddlewares: []ResponseMiddleware{
|
||||
parseHTMLMiddleware,
|
||||
},
|
||||
}
|
||||
|
||||
if opt.UserAgent == "" {
|
||||
@ -86,6 +88,7 @@ func NewGeziyor(opt Options) *Geziyor {
|
||||
log.SetOutput(ioutil.Discard)
|
||||
}
|
||||
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
|
||||
geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)
|
||||
|
||||
return geziyor
|
||||
}
|
||||
@ -186,8 +189,8 @@ func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
|
||||
return
|
||||
}
|
||||
|
||||
if !g.Opt.ParseHTMLDisabled && response.isHTML() {
|
||||
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body))
|
||||
for _, middlewareFunc := range g.responseMiddlewares {
|
||||
middlewareFunc(g, response)
|
||||
}
|
||||
|
||||
// Callbacks
|
||||
@ -240,6 +243,7 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
|
||||
Response: resp,
|
||||
Body: body,
|
||||
Meta: req.Meta,
|
||||
Request: req,
|
||||
}
|
||||
|
||||
return &response, nil
|
||||
@ -274,8 +278,9 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
|
||||
|
||||
response := Response{
|
||||
//Response: resp,
|
||||
Body: []byte(res),
|
||||
Meta: req.Meta,
|
||||
Body: []byte(res),
|
||||
Meta: req.Meta,
|
||||
Request: req,
|
||||
}
|
||||
|
||||
return &response, nil
|
||||
|
@ -1,11 +1,19 @@
|
||||
package geziyor
|
||||
|
||||
import "github.com/geziyor/geziyor/internal"
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/geziyor/geziyor/internal"
|
||||
)
|
||||
|
||||
// RequestMiddleware is called before a request is made.
|
||||
// Set request.Cancelled = true to cancel request
|
||||
type RequestMiddleware func(g *Geziyor, r *Request)
|
||||
|
||||
// ResponseMiddleware is called after a response has been received.
|
||||
type ResponseMiddleware func(g *Geziyor, r *Response)
|
||||
|
||||
// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
|
||||
func allowedDomainsMiddleware(g *Geziyor, r *Request) {
|
||||
if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
|
||||
@ -33,3 +41,11 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) {
|
||||
r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en")
|
||||
r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
|
||||
}
|
||||
|
||||
// parseHTMLMiddleware parses response if response is HTML
|
||||
func parseHTMLMiddleware(g *Geziyor, r *Response) {
|
||||
fmt.Println(r.Request.depth)
|
||||
if !g.Opt.ParseHTMLDisabled && r.isHTML() {
|
||||
r.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
|
||||
}
|
||||
}
|
||||
|
@ -50,6 +50,9 @@ type Options struct {
|
||||
// Called before requests made to manipulate requests
|
||||
RequestMiddlewares []RequestMiddleware
|
||||
|
||||
// Called after response received
|
||||
ResponseMiddlewares []ResponseMiddleware
|
||||
|
||||
// Max body reading size in bytes. Default: 1GB
|
||||
MaxBodySize int64
|
||||
|
||||
|
@ -14,6 +14,7 @@ type Response struct {
|
||||
Body []byte
|
||||
DocHTML *goquery.Document
|
||||
Meta map[string]interface{}
|
||||
Request *Request
|
||||
}
|
||||
|
||||
// JoinURL joins base response URL and provided relative URL.
|
||||
|
Loading…
x
Reference in New Issue
Block a user