geziyor/middleware.go
Musab Gültekin 936d157785 Revert "Try parsing HTML even if content-type is empty."
This reverts commit f384fc2c
2019-06-18 13:03:00 +03:00

50 lines
1.7 KiB
Go

package geziyor
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/internal"
)
// RequestMiddleware is a hook called before a request is made. It receives
// the Geziyor instance and the pending Request.
// Set r.Cancelled = true inside the middleware to cancel the request.
type RequestMiddleware func(g *Geziyor, r *Request)
// ResponseMiddleware is a hook called after the response to a request has
// been received. It receives the Geziyor instance and the Response.
type ResponseMiddleware func(g *Geziyor, r *Response)
// allowedDomainsMiddleware cancels the request when g.Opt.AllowedDomains is
// non-empty and internal.Contains reports that r.Host is not in it.
// An empty AllowedDomains list leaves every request untouched.
func allowedDomainsMiddleware(g *Geziyor, r *Request) {
	allowed := g.Opt.AllowedDomains
	if len(allowed) == 0 {
		// No restriction configured.
		return
	}
	if internal.Contains(allowed, r.Host) {
		return
	}
	r.Cancelled = true
}
// duplicateRequestsMiddleware cancels requests whose URL and method pair has
// been seen before. It does nothing when g.Opt.URLRevisitEnabled is set.
func duplicateRequestsMiddleware(g *Geziyor, r *Request) {
	if g.Opt.URLRevisitEnabled {
		return
	}

	// The key is the full URL string followed by the HTTP method, so the same
	// URL requested with a different method counts as a distinct visit.
	visitKey := r.Request.URL.String() + r.Request.Method

	// The second result of LoadOrStore is treated as "already visited".
	_, seen := g.visitedURLs.LoadOrStore(visitKey, struct{}{})
	if seen {
		r.Cancelled = true
	}
}
// defaultHeadersMiddleware passes each default request header through
// internal.SetDefaultHeader, in order: Accept, Accept-Charset,
// Accept-Language, and User-Agent (taken from g.Opt.UserAgent).
func defaultHeadersMiddleware(g *Geziyor, r *Request) {
	defaults := [...][2]string{
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
		{"Accept-Charset", "utf-8"},
		{"Accept-Language", "en"},
		{"User-Agent", g.Opt.UserAgent},
	}
	for _, kv := range defaults {
		r.Header = internal.SetDefaultHeader(r.Header, kv[0], kv[1])
	}
}
// parseHTMLMiddleware parses the response body into r.DocHTML when HTML
// parsing is not disabled (g.Opt.ParseHTMLDisabled) and r.isHTML() reports
// true. Otherwise the response is left unchanged.
func parseHTMLMiddleware(g *Geziyor, r *Response) {
	if g.Opt.ParseHTMLDisabled || !r.isHTML() {
		return
	}

	body := bytes.NewReader(r.Body)
	// The parse error is discarded; DocHTML receives whatever document value
	// the parser returned.
	doc, _ := goquery.NewDocumentFromReader(body)
	r.DocHTML = doc
}