package middleware

import (
	"bytes"

	"github.com/PuerkitoBio/goquery"
	"github.com/antchfx/htmlquery"
	"golang.org/x/text/encoding/simplifiedchinese"
	"golang.org/x/text/transform"
	"softdown.com/shusou/geziyor/client"
	"softdown.com/shusou/geziyor/internal"
)

// Encoding labels that ProcessResponse compares against Request.Encoding.
const (
	// EncodingGBK marks a response body that must be decoded with the
	// GB18030 decoder before it is parsed.
	EncodingGBK = "gb2312"
	// EncodingUTF8 marks a response body that is parsed as-is.
	// It previously held the same value as EncodingGBK, which sent
	// requests tagged EncodingUTF8 through the GB18030 decoder.
	EncodingUTF8 = "utf-8"
)

// ParseHTML parses response if response is HTML.
//
// On success it fills both Response.HTMLDoc (goquery document) and
// Response.HTMLNode (htmlquery node tree, for XPath queries).
type ParseHTML struct {
	// ParseHTMLDisabled turns the middleware into a no-op when true.
	ParseHTMLDisabled bool
}

// ProcessResponse parses the body of r into r.HTMLDoc and r.HTMLNode.
//
// Nothing happens when parsing is disabled or r.IsHTML() reports false.
// When the request encoding is EncodingGBK the body is first decoded with
// the GB18030 decoder. Any decode or parse error is logged through
// internal.Logger and stops processing; fields assigned before the error
// keep their values.
func (p *ParseHTML) ProcessResponse(r *client.Response) {
	if p.ParseHTMLDisabled || !r.IsHTML() {
		return
	}

	body := r.Body
	if r.Request.Encoding == EncodingGBK {
		// Decode once into memory. A streaming transform.Reader can only be
		// consumed a single time, so sharing one between the two parsers
		// below would leave the second parser with an empty input.
		decoded, _, err := transform.Bytes(simplifiedchinese.GB18030.NewDecoder(), r.Body)
		if err != nil {
			internal.Logger.Println(err.Error())
			return
		}
		body = decoded
	}

	// Each parser gets its own reader positioned at the start of the body.
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		internal.Logger.Println(err.Error())
		return
	}
	r.HTMLDoc = doc

	// xpath
	node, err := htmlquery.Parse(bytes.NewReader(body))
	if err != nil {
		internal.Logger.Println(err.Error())
		return
	}
	r.HTMLNode = node
}