geziyor/middleware/parse_html.go
Administrator 88f37ecc2d 备份
2024-09-05 18:16:17 +08:00

60 lines
1.3 KiB
Go

package middleware
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/antchfx/htmlquery"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/internal"
)
// Encoding labels compared against a request's Encoding field by
// ParseHTML.ProcessResponse to decide whether the body must be transcoded
// before it is parsed.
const (
	// EncodingGBK selects GB18030 decoding of the response body
	// (GB18030 is a superset of GBK and GB2312).
	EncodingGBK = "gb2312"
	// EncodingUTF8 labels a body that needs no transcoding. It previously held
	// the same value as EncodingGBK, which made the two labels
	// indistinguishable and sent UTF-8 pages through the GB18030 decoder.
	EncodingUTF8 = "utf-8"
)
// ParseHTML is a response middleware that parses HTML response bodies.
// Its ProcessResponse method fills the response's HTMLDoc (goquery document)
// and HTMLNode (node tree for htmlquery/XPath) fields.
type ParseHTML struct {
// ParseHTMLDisabled turns the middleware off: when true, ProcessResponse
// leaves the response untouched.
ParseHTMLDisabled bool
}
// ProcessResponse parses the body of an HTML response and stores the result
// on r: r.HTMLDoc receives the goquery document and r.HTMLNode receives the
// node tree produced by htmlquery.Parse (used for XPath queries).
//
// Nothing is done when ParseHTMLDisabled is set or r.IsHTML() reports false.
// When the request's Encoding equals EncodingGBK, the body is first decoded
// from GB18030 to UTF-8; otherwise the raw bytes are parsed as-is.
//
// Any decoding or parsing error is logged through internal.Logger and the
// method returns early; fields assigned before the failure keep their values.
func (p *ParseHTML) ProcessResponse(r *client.Response) {
	if p.ParseHTMLDisabled || !r.IsHTML() {
		return
	}

	body := r.Body
	if r.Request.Encoding == EncodingGBK {
		// Decode once into a byte slice. The previous code handed a single
		// streaming reader to both parsers; the first parser drained it, so
		// the XPath tree was always built from an empty stream.
		decoded, _, err := transform.Bytes(simplifiedchinese.GB18030.NewDecoder(), r.Body)
		if err != nil {
			internal.Logger.Println(err.Error())
			return
		}
		body = decoded
	}

	// Each parser consumes its reader completely, so each gets a fresh one.
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		internal.Logger.Println(err.Error())
		return
	}
	r.HTMLDoc = doc

	// Node tree for XPath queries via htmlquery.
	node, err := htmlquery.Parse(bytes.NewReader(body))
	if err != nil {
		internal.Logger.Println(err.Error())
		return
	}
	r.HTMLNode = node
}