From 936d157785b04158ca1e03074ba45a5d410db7db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Musab=20G=C3=BCltekin?=
Date: Tue, 18 Jun 2019 13:03:00 +0300
Subject: [PATCH] Revert "Try parsing HTML even if content-type is empty."

This reverts commit f384fc2c
---
Notes for reviewers (text in this section is ignored by `git am` and is not
part of the commit message):

* middleware.go: parseHTMLMiddleware builds r.DocHTML only when
  g.Opt.ParseHTMLDisabled is false AND r.isHTML() reports true. The error
  returned by goquery.NewDocumentFromReader is still discarded (`_`).

* response.go: adds the unexported method (*Response).isHTML and the
  "strings" import it needs.
    - If r.Response is nil, the result is `len(r.Body) != 0`.
    - Otherwise it reads the Content-Type header and returns true when the
      value contains "text/html", "application/xhtml+xml" or
      "application/vnd.wap.xhtml+xml" as a substring; false in every other
      case, which includes a missing or empty Content-Type.
    - NOTE(review): strings.Contains is case-sensitive, so a value such as
      "Text/HTML" would not match. Confirm that this is acceptable for the
      servers being crawled.

 middleware.go |  2 +-
 response.go   | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/middleware.go b/middleware.go
index ea502f9..432ed4b 100644
--- a/middleware.go
+++ b/middleware.go
@@ -43,7 +43,7 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) {
 
 // parseHTMLMiddleware parses response if response is HTML
 func parseHTMLMiddleware(g *Geziyor, r *Response) {
-	if !g.Opt.ParseHTMLDisabled {
+	if !g.Opt.ParseHTMLDisabled && r.isHTML() {
 		r.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
 	}
 }
diff --git a/response.go b/response.go
index e216d95..1aa3ed3 100644
--- a/response.go
+++ b/response.go
@@ -4,6 +4,7 @@ import (
 	"github.com/PuerkitoBio/goquery"
 	"net/http"
 	"net/url"
+	"strings"
 )
 
 // Response type wraps http.Response
@@ -26,3 +27,16 @@ func (r *Response) JoinURL(relativeURL string) string {
 	joinedURL := r.Response.Request.URL.ResolveReference(parsedRelativeURL)
 	return joinedURL.String()
 }
+
+func (r *Response) isHTML() bool {
+	if r.Response == nil {
+		return len(r.Body) != 0
+	}
+	contentType := r.Header.Get("Content-Type")
+	for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} {
+		if strings.Contains(contentType, htmlContentType) {
+			return true
+		}
+	}
+	return false
+}