Response middlewares support implemented.
This commit is contained in:
parent
80383ebd6f
commit
e50fa3b1dc
@ -64,6 +64,10 @@ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for
|
|||||||
|
|
||||||
go get github.com/geziyor/geziyor
|
go get github.com/geziyor/geziyor
|
||||||
|
|
||||||
|
**NOTE**: macOS limits the maximum number of open file descriptors.
|
||||||
|
If you want to make more than 256 concurrent requests, you need to increase this limit.
|
||||||
|
Read [this](https://wilsonmar.github.io/maximum-limits/) for more.
|
||||||
|
|
||||||
### Making Requests
|
### Making Requests
|
||||||
|
|
||||||
Initial requests start with ```StartURLs []string``` field in ```Options```.
|
Initial requests start with ```StartURLs []string``` field in ```Options```.
|
||||||
|
@ -33,19 +33,16 @@ func (e *CSVExporter) Export(exports chan interface{}) {
|
|||||||
// Detect type and extract CSV values
|
// Detect type and extract CSV values
|
||||||
val := reflect.ValueOf(res)
|
val := reflect.ValueOf(res)
|
||||||
switch val.Kind() {
|
switch val.Kind() {
|
||||||
|
|
||||||
case reflect.Slice:
|
case reflect.Slice:
|
||||||
for i := 0; i < val.Len(); i++ {
|
for i := 0; i < val.Len(); i++ {
|
||||||
values = append(values, fmt.Sprint(val.Index(i)))
|
values = append(values, fmt.Sprint(val.Index(i)))
|
||||||
}
|
}
|
||||||
|
|
||||||
//case reflect.Map:
|
//case reflect.Map:
|
||||||
// iter := val.MapRange()
|
// iter := val.MapRange()
|
||||||
// for iter.Next() {
|
// for iter.Next() {
|
||||||
// values = append(values, fmt.Sprint(iter.Value()))
|
// values = append(values, fmt.Sprint(iter.Value()))
|
||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writer.Write(values); err != nil {
|
if err := writer.Write(values); err != nil {
|
||||||
log.Printf("CSV writing error on exporter: %v\n", err)
|
log.Printf("CSV writing error on exporter: %v\n", err)
|
||||||
}
|
}
|
||||||
|
21
geziyor.go
21
geziyor.go
@ -1,9 +1,7 @@
|
|||||||
package geziyor
|
package geziyor
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"context"
|
"context"
|
||||||
"github.com/PuerkitoBio/goquery"
|
|
||||||
"github.com/chromedp/cdproto/dom"
|
"github.com/chromedp/cdproto/dom"
|
||||||
"github.com/chromedp/chromedp"
|
"github.com/chromedp/chromedp"
|
||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
@ -37,8 +35,9 @@ type Geziyor struct {
|
|||||||
sync.RWMutex
|
sync.RWMutex
|
||||||
hostSems map[string]chan struct{}
|
hostSems map[string]chan struct{}
|
||||||
}
|
}
|
||||||
visitedURLs sync.Map
|
visitedURLs sync.Map
|
||||||
requestMiddlewares []RequestMiddleware
|
requestMiddlewares []RequestMiddleware
|
||||||
|
responseMiddlewares []ResponseMiddleware
|
||||||
}
|
}
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
@ -58,6 +57,9 @@ func NewGeziyor(opt Options) *Geziyor {
|
|||||||
duplicateRequestsMiddleware,
|
duplicateRequestsMiddleware,
|
||||||
defaultHeadersMiddleware,
|
defaultHeadersMiddleware,
|
||||||
},
|
},
|
||||||
|
responseMiddlewares: []ResponseMiddleware{
|
||||||
|
parseHTMLMiddleware,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
if opt.UserAgent == "" {
|
if opt.UserAgent == "" {
|
||||||
@ -86,6 +88,7 @@ func NewGeziyor(opt Options) *Geziyor {
|
|||||||
log.SetOutput(ioutil.Discard)
|
log.SetOutput(ioutil.Discard)
|
||||||
}
|
}
|
||||||
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
|
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
|
||||||
|
geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...)
|
||||||
|
|
||||||
return geziyor
|
return geziyor
|
||||||
}
|
}
|
||||||
@ -186,8 +189,8 @@ func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if !g.Opt.ParseHTMLDisabled && response.isHTML() {
|
for _, middlewareFunc := range g.responseMiddlewares {
|
||||||
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body))
|
middlewareFunc(g, response)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Callbacks
|
// Callbacks
|
||||||
@ -240,6 +243,7 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
|
|||||||
Response: resp,
|
Response: resp,
|
||||||
Body: body,
|
Body: body,
|
||||||
Meta: req.Meta,
|
Meta: req.Meta,
|
||||||
|
Request: req,
|
||||||
}
|
}
|
||||||
|
|
||||||
return &response, nil
|
return &response, nil
|
||||||
@ -274,8 +278,9 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
|
|||||||
|
|
||||||
response := Response{
|
response := Response{
|
||||||
//Response: resp,
|
//Response: resp,
|
||||||
Body: []byte(res),
|
Body: []byte(res),
|
||||||
Meta: req.Meta,
|
Meta: req.Meta,
|
||||||
|
Request: req,
|
||||||
}
|
}
|
||||||
|
|
||||||
return &response, nil
|
return &response, nil
|
||||||
|
@ -1,11 +1,19 @@
|
|||||||
package geziyor
|
package geziyor
|
||||||
|
|
||||||
import "github.com/geziyor/geziyor/internal"
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
"github.com/geziyor/geziyor/internal"
|
||||||
|
)
|
||||||
|
|
||||||
// RequestMiddleware called before requests made.
|
// RequestMiddleware called before requests made.
|
||||||
// Set request.Cancelled = true to cancel request
|
// Set request.Cancelled = true to cancel request
|
||||||
type RequestMiddleware func(g *Geziyor, r *Request)
|
type RequestMiddleware func(g *Geziyor, r *Request)
|
||||||
|
|
||||||
|
// ResponseMiddleware is called after a response has been received.
|
||||||
|
type ResponseMiddleware func(g *Geziyor, r *Response)
|
||||||
|
|
||||||
// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
|
// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
|
||||||
func allowedDomainsMiddleware(g *Geziyor, r *Request) {
|
func allowedDomainsMiddleware(g *Geziyor, r *Request) {
|
||||||
if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
|
if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
|
||||||
@ -33,3 +41,11 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) {
|
|||||||
r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en")
|
r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en")
|
||||||
r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
|
r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseHTMLMiddleware parses response if response is HTML
|
||||||
|
func parseHTMLMiddleware(g *Geziyor, r *Response) {
|
||||||
|
fmt.Println(r.Request.depth)
|
||||||
|
if !g.Opt.ParseHTMLDisabled && r.isHTML() {
|
||||||
|
r.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -50,6 +50,9 @@ type Options struct {
|
|||||||
// Called before requests made to manipulate requests
|
// Called before requests made to manipulate requests
|
||||||
RequestMiddlewares []RequestMiddleware
|
RequestMiddlewares []RequestMiddleware
|
||||||
|
|
||||||
|
// Called after a response is received
|
||||||
|
ResponseMiddlewares []ResponseMiddleware
|
||||||
|
|
||||||
// Max body reading size in bytes. Default: 1GB
|
// Max body reading size in bytes. Default: 1GB
|
||||||
MaxBodySize int64
|
MaxBodySize int64
|
||||||
|
|
||||||
|
@ -14,6 +14,7 @@ type Response struct {
|
|||||||
Body []byte
|
Body []byte
|
||||||
DocHTML *goquery.Document
|
DocHTML *goquery.Document
|
||||||
Meta map[string]interface{}
|
Meta map[string]interface{}
|
||||||
|
Request *Request
|
||||||
}
|
}
|
||||||
|
|
||||||
// JoinURL joins base response URL and provided relative URL.
|
// JoinURL joins base response URL and provided relative URL.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user