This commit is contained in:
Administrator 2024-09-05 18:16:17 +08:00
parent 688c516c9f
commit 88f37ecc2d
9 changed files with 170 additions and 26 deletions

View File

@ -2,6 +2,7 @@ package client
import ( import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
"net/http" "net/http"
"net/url" "net/url"
"strings" "strings"
@ -18,6 +19,9 @@ type Response struct {
// Goquery Document object. If response IsHTML, its non-nil. // Goquery Document object. If response IsHTML, its non-nil.
HTMLDoc *goquery.Document HTMLDoc *goquery.Document
// HTMLNode is the parsed HTML node tree for XPath queries. If response IsHTML, it's non-nil.
HTMLNode *html.Node
Request *Request Request *Request
} }
@ -33,6 +37,15 @@ func (r *Response) JoinURL(relativeURL string) string {
return joinedURL.String() return joinedURL.String()
} }
// JoinURL2 resolves relativeURL against the URL of the request that produced
// this response and returns the resulting URL as a string.
// It returns an empty string when relativeURL cannot be parsed.
func (r *Response) JoinURL2(relativeURL string) string {
	if resolved, err := r.Request.URL.Parse(relativeURL); err == nil {
		return resolved.String()
	}
	return ""
}
// IsHTML checks if response content is HTML by looking content-type header // IsHTML checks if response content is HTML by looking content-type header
func (r *Response) IsHTML() bool { func (r *Response) IsHTML() bool {
contentType := r.Header.Get("Content-Type") contentType := r.Header.Get("Content-Type")

View File

@ -5,27 +5,56 @@ import (
"softdown.com/shusou/geziyor" "softdown.com/shusou/geziyor"
"softdown.com/shusou/geziyor/client" "softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export" "softdown.com/shusou/geziyor/export"
"softdown.com/shusou/geziyor/filter"
"softdown.com/shusou/geziyor/middleware"
) )
func main() { func main() {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"https://dytt.dytt8.net/index.htm"}, StartURLs: []string{"https://dytt.dytt8.net/index.htm"},
Encoding: "gb2312", Encoding: middleware.EncodingGBK,
ParseFunc: quotesParse, ParseFunc: menuParse,
Exporters: []export.Exporter{&export.JSON{}}, Exporters: []export.Exporter{&export.MeiliSearch{}},
}).Start() }).Start()
} }
func quotesParse(g *geziyor.Geziyor, r *client.Response) { func menuParse(g *geziyor.Geziyor, r *client.Response) {
r.HTMLDoc.Find("div.co_content2 ul a").Each(func(i int, s *goquery.Selection) { r.HTMLDoc.Find("div#menu ul li").Each(func(i int, s *goquery.Selection) {
//fmt.Println(s.Html()) if url, exists := s.Find("a").Attr("href"); exists == true {
var url, _ = s.Attr("href") g.Get(r.JoinURL2(url), pageParse)
g.Exports <- map[string]interface{}{
"title": s.Text(),
"url": url,
} }
}) })
//if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok { }
// g.Get(r.JoinURL(href), quotesParse)
//} func pageParse(g *geziyor.Geziyor, r *client.Response) {
r.HTMLDoc.Find("div.co_content8 ul b").Each(func(i int, s *goquery.Selection) {
if url, exists := s.Find("a").Eq(1).Attr("href"); exists == true {
g.Get(r.JoinURL2(url), detailParse)
}
})
// 分页解析
r.HTMLDoc.Find("div.x a").Each(func(i int, selection *goquery.Selection) {
if href, exists := selection.Attr("href"); exists {
g.Get(r.JoinURL2(href), pageParse)
}
})
//htmlquery.Find(r.HTMLNode, "")
}
func detailParse(g *geziyor.Geziyor, r *client.Response) {
s := r.HTMLDoc.Find("body")
url := r.Request.URL.String()
title := s.Find("div.title_all h1").Text()
html, _ := s.Find("div#Zoom").Html()
text := s.Find("div#Zoom").Text()
image, _ := s.Find("div#Zoom").Find("img").Attr("src")
g.Exports <- map[string]interface{}{
"image": image,
"text": text,
"html": filter.FilterScriptTags(html),
"url": url,
"title": title,
}
} }

30
export/meilisearch.go Normal file
View File

@ -0,0 +1,30 @@
package export
import (
"fmt"
"github.com/meilisearch/meilisearch-go"
"github.com/rs/xid"
)
// Values used by MeiliSearch for every field that is left empty. They are
// the values that were previously hard-coded in Export.
const (
	defaultMeiliHost   = "http://localhost:7700"
	defaultMeiliAPIKey = "123456789" // NOTE(review): development key; set APIKey for real deployments.
	defaultMeiliIndex  = "movies"
)

// MeiliSearch is an exporter that stores exported items as documents in a
// Meilisearch index. The zero value is ready to use and targets the defaults
// above.
type MeiliSearch struct {
	// Host is the base URL of the Meilisearch server.
	Host string
	// APIKey is the key sent with every request.
	APIKey string
	// Index is the uid of the index the documents are added to.
	Index string
}

// Export reads items from exports until the channel is closed and adds each
// one to the configured index as a document with a freshly generated id and
// the item's "title", "url" and "html" fields.
//
// Items that are not a map[string]interface{} are skipped, and fields that
// are missing or not strings are stored as empty strings, instead of
// panicking on a failed type assertion.
//
// A failed AddDocuments call does not stop the loop: the channel keeps being
// drained so that senders are never left blocked on it. The first such error
// is returned once the channel is closed.
func (m *MeiliSearch) Export(exports chan interface{}) error {
	host, apiKey, indexName := defaultMeiliHost, defaultMeiliAPIKey, defaultMeiliIndex
	if m != nil {
		if m.Host != "" {
			host = m.Host
		}
		if m.APIKey != "" {
			apiKey = m.APIKey
		}
		if m.Index != "" {
			indexName = m.Index
		}
	}

	client := meilisearch.New(host, meilisearch.WithAPIKey(apiKey))
	// An index is where the documents are stored.
	index := client.Index(indexName)

	var firstErr error
	for res := range exports {
		data, ok := res.(map[string]interface{})
		if !ok {
			fmt.Printf("meilisearch: skipping export of unsupported type %T\n", res)
			continue
		}
		// Comma-ok assertions: absent or non-string values become "".
		title, _ := data["title"].(string)
		url, _ := data["url"].(string)
		html, _ := data["html"].(string)

		task, err := index.AddDocuments([]map[string]interface{}{
			{"id": xid.New().String(), "title": title, "url": url, "html": html},
		})
		if err != nil {
			fmt.Printf("meilisearch: adding document for %q: %v\n", url, err)
			if firstErr == nil {
				firstErr = fmt.Errorf("meilisearch: adding document for %q: %w", url, err)
			}
			continue
		}
		fmt.Println(task.TaskUID)
	}
	return firstErr
}

11
filter/js.go Normal file
View File

@ -0,0 +1,11 @@
package filter
import "regexp"
// scriptTagRe matches a complete <script> element including its content.
//
//	(?i)  case-insensitive, so <SCRIPT> is matched too
//	(?s)  lets "." match newlines; without it scripts spanning several
//	      lines are never matched
//	\b    keeps tags that merely start with "script" from matching
//
// It is compiled once at package scope rather than on every call.
var scriptTagRe = regexp.MustCompile(`(?is)<script\b[^>]*>.*?</script\s*>`)

// FilterScriptTags returns input with every <script>...</script> element
// (tag and content, matched case-insensitively) removed. Text outside script
// elements is returned unchanged.
//
// This is a regular-expression filter, not an HTML sanitizer: it does not
// remove event-handler attributes or other ways of embedding script.
func FilterScriptTags(input string) string {
	return scriptTagRe.ReplaceAllString(input, "")
}

View File

@ -162,7 +162,7 @@ func (g *Geziyor) Start() {
g.Opt.StartRequestsFunc(g) g.Opt.StartRequestsFunc(g)
} else { } else {
for _, startURL := range g.Opt.StartURLs { for _, startURL := range g.Opt.StartURLs {
g.Get(startURL, g.Opt.Encoding, g.Opt.ParseFunc) g.Get(startURL, g.Opt.ParseFunc)
} }
} }
@ -174,8 +174,8 @@ func (g *Geziyor) Start() {
} }
// Get issues a GET to the specified URL. // Get issues a GET to the specified URL.
func (g *Geziyor) Get(url, encoding string, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, encoding, nil) req, err := client.NewRequest("GET", url, g.Opt.Encoding, nil)
if err != nil { if err != nil {
internal.Logger.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return
@ -186,8 +186,8 @@ func (g *Geziyor) Get(url, encoding string, callback func(g *Geziyor, r *client.
// GetRendered issues GET request using headless browser // GetRendered issues GET request using headless browser
// Opens up a new Chrome instance, makes request, waits for rendering HTML DOM and closed. // Opens up a new Chrome instance, makes request, waits for rendering HTML DOM and closed.
// Rendered requests only supported for GET requests. // Rendered requests only supported for GET requests.
func (g *Geziyor) GetRendered(url, encoding string, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, encoding, nil) req, err := client.NewRequest("GET", url, g.Opt.Encoding, nil)
if err != nil { if err != nil {
internal.Logger.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return
@ -198,7 +198,7 @@ func (g *Geziyor) GetRendered(url, encoding string, callback func(g *Geziyor, r
// Head issues a HEAD to the specified URL // Head issues a HEAD to the specified URL
func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("HEAD", url, "", nil) req, err := client.NewRequest("HEAD", url, g.Opt.Encoding, nil)
if err != nil { if err != nil {
internal.Logger.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return
@ -208,7 +208,7 @@ func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)
// Post issues a POST to the specified URL // Post issues a POST to the specified URL
func (g *Geziyor) Post(url string, body io.Reader, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) Post(url string, body io.Reader, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("POST", url, "", body) req, err := client.NewRequest("POST", url, g.Opt.Encoding, body)
if err != nil { if err != nil {
internal.Logger.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return

6
go.mod
View File

@ -22,6 +22,8 @@ require (
require ( require (
github.com/VividCortex/gohistogram v1.0.0 // indirect github.com/VividCortex/gohistogram v1.0.0 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/antchfx/htmlquery v1.3.2 // indirect
github.com/antchfx/xpath v1.3.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect github.com/chromedp/sysutil v1.0.0 // indirect
@ -29,16 +31,20 @@ require (
github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect github.com/gobwas/ws v1.4.0 // indirect
github.com/golang-jwt/jwt/v4 v4.5.0 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
github.com/google/btree v1.1.3 // indirect github.com/google/btree v1.1.3 // indirect
github.com/josharian/intern v1.0.0 // indirect github.com/josharian/intern v1.0.0 // indirect
github.com/kr/text v0.2.0 // indirect github.com/kr/text v0.2.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect github.com/mailru/easyjson v0.7.7 // indirect
github.com/meilisearch/meilisearch-go v0.28.0 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect github.com/prometheus/procfs v0.15.1 // indirect
github.com/rs/xid v1.6.0 // indirect
golang.org/x/sys v0.24.0 // indirect golang.org/x/sys v0.24.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect

19
go.sum
View File

@ -4,6 +4,10 @@ github.com/VividCortex/gohistogram v1.0.0 h1:6+hBz+qvs0JOrrNhhmR7lFxo5sINxBCGXrd
github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g= github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/antchfx/htmlquery v1.3.2 h1:85YdttVkR1rAY+Oiv/nKI4FCimID+NXhDn82kz3mEvs=
github.com/antchfx/htmlquery v1.3.2/go.mod h1:1mbkcEgEarAokJiWhTfr4hR06w/q2ZZjnYLrDt6CTUk=
github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk=
github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
@ -34,6 +38,10 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg=
github.com/golang-jwt/jwt/v4 v4.5.0/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w= github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w=
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
@ -53,6 +61,8 @@ github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kUL
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/meilisearch/meilisearch-go v0.28.0 h1:f3XJ66ZM+R8bANAOLqsjvoq/HhQNpVJPYoNt6QgNzME=
github.com/meilisearch/meilisearch-go v0.28.0/go.mod h1:Szcc9CaDiKIfjdgdt49jlmDKpEzjD+x+b6Y6heMdlQ0=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
@ -77,8 +87,15 @@ github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoG
github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc= github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU=
github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE=
@ -95,6 +112,7 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
@ -144,5 +162,6 @@ gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWD
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@ -3,10 +3,18 @@ package middleware
import ( import (
"bytes" "bytes"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/antchfx/htmlquery"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
"softdown.com/shusou/geziyor/client" "softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/internal" "softdown.com/shusou/geziyor/internal"
) )
// Encoding names that can be set as the request encoding.
const (
	// EncodingGBK marks a page as Simplified Chinese (GB2312/GBK). ParseHTML
	// decodes such bodies with the GB18030 decoder before parsing.
	EncodingGBK = "gb2312"
	// EncodingUTF8 marks a page as UTF-8; ParseHTML parses such bodies as is.
	// It previously held "gb2312" as well, which made it indistinguishable
	// from EncodingGBK.
	EncodingUTF8 = "utf-8"
)
// ParseHTML parses response if response is HTML // ParseHTML parses response if response is HTML
type ParseHTML struct { type ParseHTML struct {
ParseHTMLDisabled bool ParseHTMLDisabled bool
@ -14,11 +22,38 @@ type ParseHTML struct {
func (p *ParseHTML) ProcessResponse(r *client.Response) { func (p *ParseHTML) ProcessResponse(r *client.Response) {
if !p.ParseHTMLDisabled && r.IsHTML() { if !p.ParseHTMLDisabled && r.IsHTML() {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) if r.Request.Encoding == EncodingGBK {
if err != nil { reader := transform.NewReader(bytes.NewReader(r.Body), simplifiedchinese.GB18030.NewDecoder())
internal.Logger.Println(err.Error()) doc, err := goquery.NewDocumentFromReader(reader)
return if err != nil {
internal.Logger.Println(err.Error())
return
}
r.HTMLDoc = doc
// xpath
node, err := htmlquery.Parse(reader)
if err != nil {
internal.Logger.Println(err.Error())
return
}
r.HTMLNode = node
} else {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
if err != nil {
internal.Logger.Println(err.Error())
return
}
r.HTMLDoc = doc
// xpath
node, err := htmlquery.Parse(bytes.NewReader(r.Body))
if err != nil {
internal.Logger.Println(err.Error())
return
}
r.HTMLNode = node
} }
r.HTMLDoc = doc
} }
} }

View File

@ -124,6 +124,7 @@ type Options struct {
// Default: "Geziyor 1.0" // Default: "Geziyor 1.0"
UserAgent string UserAgent string
// 网页编码 // Optional response body encoding. Leave empty for automatic detection.
// If you're having issues with auto detection, set this.
Encoding string Encoding string
} }