Request callbacks added.

Recover from all panics and continue scraping.
Only parse HTML if response is HTML.
This commit is contained in:
Musab Gültekin
2019-06-09 21:13:30 +03:00
parent 7abc7a370d
commit ca2414c5c8
5 changed files with 48 additions and 18 deletions

View File

@ -4,14 +4,15 @@ import (
"github.com/PuerkitoBio/goquery"
"net/http"
"net/url"
"strings"
)
// Response wraps http.Response.
// It contains the parsed response data and Geziyor functions.
type Response struct {
*http.Response
Body []byte
Doc *goquery.Document
Body []byte
DocHTML *goquery.Document
Geziyor *Geziyor
Exports chan interface{}
@ -27,3 +28,13 @@ func (r *Response) JoinURL(relativeURL string) string {
joinedURL := r.Response.Request.URL.ResolveReference(parsedRelativeURL)
return joinedURL.String()
}
// isHTML reports whether the response's Content-Type header names an HTML
// media type: text/html, application/xhtml+xml or
// application/vnd.wap.xhtml+xml.
//
// Media type names are case-insensitive, so the header value is lowercased
// before matching. Substring matching is used so that trailing parameters
// such as "; charset=utf-8" do not prevent a match.
//
// It returns false when the receiver or its embedded *http.Response is nil,
// and when the Content-Type header is absent.
func (r *Response) isHTML() bool {
	// Guard against a missing underlying response; r.Header would otherwise
	// dereference a nil pointer and panic.
	if r == nil || r.Response == nil {
		return false
	}

	contentType := strings.ToLower(r.Header.Get("Content-Type"))
	for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} {
		if strings.Contains(contentType, htmlContentType) {
			return true
		}
	}
	return false
}