diff --git a/README.md b/README.md index 8043905..bd0c2e5 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,10 @@ See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for go get github.com/geziyor/geziyor +**NOTE**: macOS limits the maximum number of open file descriptors. +If you want to make more than 256 concurrent requests, you need to raise this limit. +Read [this guide](https://wilsonmar.github.io/maximum-limits/) for more information. + ### Making Requests Initial requests start with ```StartURLs []string``` field in ```Options```. diff --git a/exporter/csv.go b/exporter/csv.go index 3b37530..e1ccc1f 100644 --- a/exporter/csv.go +++ b/exporter/csv.go @@ -33,19 +33,16 @@ func (e *CSVExporter) Export(exports chan interface{}) { // Detect type and extract CSV values val := reflect.ValueOf(res) switch val.Kind() { - case reflect.Slice: for i := 0; i < val.Len(); i++ { values = append(values, fmt.Sprint(val.Index(i))) } - //case reflect.Map: // iter := val.MapRange() // for iter.Next() { // values = append(values, fmt.Sprint(iter.Value())) // } } - if err := writer.Write(values); err != nil { log.Printf("CSV writing error on exporter: %v\n", err) } diff --git a/geziyor.go b/geziyor.go index e44140c..c34526b 100644 --- a/geziyor.go +++ b/geziyor.go @@ -1,9 +1,7 @@ package geziyor import ( - "bytes" "context" - "github.com/PuerkitoBio/goquery" "github.com/chromedp/cdproto/dom" "github.com/chromedp/chromedp" "github.com/fpfeng/httpcache" @@ -37,8 +35,9 @@ type Geziyor struct { sync.RWMutex hostSems map[string]chan struct{} } - visitedURLs sync.Map - requestMiddlewares []RequestMiddleware + visitedURLs sync.Map + requestMiddlewares []RequestMiddleware + responseMiddlewares []ResponseMiddleware } func init() { @@ -58,6 +57,9 @@ func NewGeziyor(opt Options) *Geziyor { duplicateRequestsMiddleware, defaultHeadersMiddleware, }, + responseMiddlewares: []ResponseMiddleware{ + parseHTMLMiddleware, + }, } if opt.UserAgent == "" { @@ -86,6 +88,7 @@ func NewGeziyor(opt 
Options) *Geziyor { log.SetOutput(ioutil.Discard) } geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...) + geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, opt.ResponseMiddlewares...) return geziyor } @@ -186,8 +189,8 @@ func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) { return } - if !g.Opt.ParseHTMLDisabled && response.isHTML() { - response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body)) + for _, middlewareFunc := range g.responseMiddlewares { + middlewareFunc(g, response) } // Callbacks @@ -240,6 +243,7 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) { Response: resp, Body: body, Meta: req.Meta, + Request: req, } return &response, nil @@ -274,8 +278,9 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) { response := Response{ //Response: resp, - Body: []byte(res), - Meta: req.Meta, + Body: []byte(res), + Meta: req.Meta, + Request: req, } return &response, nil diff --git a/middleware.go b/middleware.go index 8452b4f..44b89bc 100644 --- a/middleware.go +++ b/middleware.go @@ -1,11 +1,18 @@ package geziyor -import "github.com/geziyor/geziyor/internal" +import ( + "bytes" + "github.com/PuerkitoBio/goquery" + "github.com/geziyor/geziyor/internal" +) // RequestMiddleware called before requests made. 
// Set request.Cancelled = true to cancel request type RequestMiddleware func(g *Geziyor, r *Request) +// ResponseMiddleware is called after a response is received. +type ResponseMiddleware func(g *Geziyor, r *Response) + // allowedDomainsMiddleware checks for request host if it exists in AllowedDomains func allowedDomainsMiddleware(g *Geziyor, r *Request) { if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) { @@ -33,3 +40,10 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) { r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en") r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent) } + +// parseHTMLMiddleware parses the response body into DocHTML when the response is HTML and HTML parsing is not disabled. +func parseHTMLMiddleware(g *Geziyor, r *Response) { + if !g.Opt.ParseHTMLDisabled && r.isHTML() { + r.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) + } +} diff --git a/options.go b/options.go index 7e05593..efb3fc8 100644 --- a/options.go +++ b/options.go @@ -50,6 +50,9 @@ type Options struct { // Called before requests made to manipulate requests RequestMiddlewares []RequestMiddleware + // Called after a response is received, to inspect or modify the response + ResponseMiddlewares []ResponseMiddleware + // Max body reading size in bytes. Default: 1GB MaxBodySize int64 diff --git a/response.go b/response.go index 2c970a9..1aa3ed3 100644 --- a/response.go +++ b/response.go @@ -14,6 +14,7 @@ type Response struct { Body []byte DocHTML *goquery.Document Meta map[string]interface{} + Request *Request } // JoinURL joins base response URL and provided relative URL.