This commit is contained in:
Administrator
2024-09-05 18:16:17 +08:00
parent 688c516c9f
commit 88f37ecc2d
9 changed files with 170 additions and 26 deletions

View File

@ -3,10 +3,18 @@ package middleware
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/antchfx/htmlquery"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/internal"
)
// Encoding labels for a request's declared body encoding.
const (
	// EncodingGBK marks a response body as GBK/GB2312-encoded. ParseHTML
	// compares the request's Encoding against this value to decide whether
	// the body must be decoded before it is parsed.
	EncodingGBK = "gb2312"
	// EncodingUTF8 marks a response body as UTF-8, which needs no decoding.
	// It must differ from EncodingGBK; otherwise UTF-8 requests would be
	// run through the GBK decoder.
	EncodingUTF8 = "utf-8"
)
// ParseHTML parses response if response is HTML
type ParseHTML struct {
ParseHTMLDisabled bool
@ -14,11 +22,38 @@ type ParseHTML struct {
func (p *ParseHTML) ProcessResponse(r *client.Response) {
if !p.ParseHTMLDisabled && r.IsHTML() {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
if err != nil {
internal.Logger.Println(err.Error())
return
if r.Request.Encoding == EncodingGBK {
reader := transform.NewReader(bytes.NewReader(r.Body), simplifiedchinese.GB18030.NewDecoder())
doc, err := goquery.NewDocumentFromReader(reader)
if err != nil {
internal.Logger.Println(err.Error())
return
}
r.HTMLDoc = doc
// xpath
node, err := htmlquery.Parse(reader)
if err != nil {
internal.Logger.Println(err.Error())
return
}
r.HTMLNode = node
} else {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
if err != nil {
internal.Logger.Println(err.Error())
return
}
r.HTMLDoc = doc
// xpath
node, err := htmlquery.Parse(bytes.NewReader(r.Body))
if err != nil {
internal.Logger.Println(err.Error())
return
}
r.HTMLNode = node
}
r.HTMLDoc = doc
}
}