Try parsing HTML even if content-type is empty.
This commit is contained in:
4
go.mod
4
go.mod
@ -4,8 +4,8 @@ go 1.12
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.5.0
|
||||
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9
|
||||
github.com/chromedp/chromedp v0.3.0
|
||||
github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54
|
||||
github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05
|
||||
github.com/fortytw2/leaktest v1.3.0
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
||||
golang.org/x/net v0.0.0-20190522155817-f3200d17e092
|
||||
|
17
go.sum
17
go.sum
@ -2,10 +2,10 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP
|
||||
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
|
||||
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
|
||||
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9 h1:ARnDd2vEk91rLNra8yk1hF40H8z+1HrD6juNpe7FsI0=
|
||||
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9/go.mod h1:xquOK9dIGFlLaIGI4c6IyfLI/Gz0LiYYuJtzhsUODgI=
|
||||
github.com/chromedp/chromedp v0.3.0 h1:7/pwrXFRq6/ym3sxCykm90DMoyw6VKXY48DgGRgUURA=
|
||||
github.com/chromedp/chromedp v0.3.0/go.mod h1:EktsZcC2iycVrRhC9fDmshBpCK9lNnZYi6x2q9uE7zI=
|
||||
github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54 h1:2NlKweNkC3yy6IRI11bPdH66tgSKxCxX6rKbNlg3aG4=
|
||||
github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54/go.mod h1:5NWqr1Ri5aJB5uSvUXfVpbBslleS+eMjspUWv2Lcaow=
|
||||
github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05 h1:5iy45UjpWvkgTcd7GrGQSPr7sifrp9nNweI/eAsMjGE=
|
||||
github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05/go.mod h1:MsTqWB2yT7cErDFnF1F3y0PN8i/a/qQj+0GXKLW/I3s=
|
||||
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
|
||||
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
|
||||
@ -14,11 +14,10 @@ github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah
|
||||
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo=
|
||||
github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8=
|
||||
github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
|
||||
github.com/gobwas/ws v1.0.0 h1:1WdyfgUcImUfVBvYbsW2krIsnko+1QU2t45soaF8v1M=
|
||||
github.com/gobwas/ws v1.0.0/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
|
||||
github.com/gobwas/ws v1.0.1 h1:iYpM3WoNpsexO6bqCN1MnvVRylnKg6278zivIZDRXUM=
|
||||
github.com/gobwas/ws v1.0.1/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
|
||||
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls=
|
||||
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW/cCGiF3nGh5v+9Gd3VCgBQbod/GlMaQ=
|
||||
github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983 h1:wL11wNW7dhKIcRCHSm4sHKPWz0tt4mwBsVodG7+Xyqg=
|
||||
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
@ -28,8 +27,8 @@ golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73r
|
||||
golang.org/x/net v0.0.0-20190522155817-f3200d17e092 h1:4QSRKanuywn15aTZvI/mIDEgPQpswuFndXpOj3rKEco=
|
||||
golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862 h1:rM0ROo5vb9AdYJi1110yjWGMej9ITfKddS89P3Fkhug=
|
||||
golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20190610081024-1e42afee0f76 h1:QSmW7Q3mFdAGjtAd0byXmFJ55inUydyZ4WQmiuItAIQ=
|
||||
golang.org/x/sys v0.0.0-20190610081024-1e42afee0f76/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
|
||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||
|
@ -43,7 +43,7 @@ func defaultHeadersMiddleware(g *Geziyor, r *Request) {
|
||||
|
||||
// parseHTMLMiddleware parses response if response is HTML
|
||||
func parseHTMLMiddleware(g *Geziyor, r *Response) {
|
||||
if !g.Opt.ParseHTMLDisabled && r.isHTML() {
|
||||
if !g.Opt.ParseHTMLDisabled {
|
||||
r.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
|
||||
}
|
||||
}
|
||||
|
14
response.go
14
response.go
@ -4,7 +4,6 @@ import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Response type wraps http.Response
|
||||
@ -27,16 +26,3 @@ func (r *Response) JoinURL(relativeURL string) string {
|
||||
joinedURL := r.Response.Request.URL.ResolveReference(parsedRelativeURL)
|
||||
return joinedURL.String()
|
||||
}
|
||||
|
||||
func (r *Response) isHTML() bool {
|
||||
if r.Response == nil {
|
||||
return len(r.Body) != 0
|
||||
}
|
||||
contentType := r.Header.Get("Content-Type")
|
||||
for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} {
|
||||
if strings.Contains(contentType, htmlContentType) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
Reference in New Issue
Block a user