备份
This commit is contained in:
parent
688c516c9f
commit
88f37ecc2d
@ -2,6 +2,7 @@ package client
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
"golang.org/x/net/html"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"strings"
|
"strings"
|
||||||
@ -18,6 +19,9 @@ type Response struct {
|
|||||||
// Goquery Document object. If response IsHTML, its non-nil.
|
// Goquery Document object. If response IsHTML, its non-nil.
|
||||||
HTMLDoc *goquery.Document
|
HTMLDoc *goquery.Document
|
||||||
|
|
||||||
|
// XPath root node. If response IsHTML, it's non-nil.
|
||||||
|
HTMLNode *html.Node
|
||||||
|
|
||||||
Request *Request
|
Request *Request
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -33,6 +37,15 @@ func (r *Response) JoinURL(relativeURL string) string {
|
|||||||
return joinedURL.String()
|
return joinedURL.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// JoinURL2 joins base response URL and provided relative URL.
|
||||||
|
func (r *Response) JoinURL2(relativeURL string) string {
|
||||||
|
joinedURL, err := r.Request.URL.Parse(relativeURL)
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return joinedURL.String()
|
||||||
|
}
|
||||||
|
|
||||||
// IsHTML checks if response content is HTML by looking content-type header
|
// IsHTML checks if response content is HTML by looking content-type header
|
||||||
func (r *Response) IsHTML() bool {
|
func (r *Response) IsHTML() bool {
|
||||||
contentType := r.Header.Get("Content-Type")
|
contentType := r.Header.Get("Content-Type")
|
||||||
|
55
cmd/main.go
55
cmd/main.go
@ -5,27 +5,56 @@ import (
|
|||||||
"softdown.com/shusou/geziyor"
|
"softdown.com/shusou/geziyor"
|
||||||
"softdown.com/shusou/geziyor/client"
|
"softdown.com/shusou/geziyor/client"
|
||||||
"softdown.com/shusou/geziyor/export"
|
"softdown.com/shusou/geziyor/export"
|
||||||
|
"softdown.com/shusou/geziyor/filter"
|
||||||
|
"softdown.com/shusou/geziyor/middleware"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
geziyor.NewGeziyor(&geziyor.Options{
|
geziyor.NewGeziyor(&geziyor.Options{
|
||||||
StartURLs: []string{"https://dytt.dytt8.net/index.htm"},
|
StartURLs: []string{"https://dytt.dytt8.net/index.htm"},
|
||||||
Encoding: "gb2312",
|
Encoding: middleware.EncodingGBK,
|
||||||
ParseFunc: quotesParse,
|
ParseFunc: menuParse,
|
||||||
Exporters: []export.Exporter{&export.JSON{}},
|
Exporters: []export.Exporter{&export.MeiliSearch{}},
|
||||||
}).Start()
|
}).Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
func quotesParse(g *geziyor.Geziyor, r *client.Response) {
|
func menuParse(g *geziyor.Geziyor, r *client.Response) {
|
||||||
r.HTMLDoc.Find("div.co_content2 ul a").Each(func(i int, s *goquery.Selection) {
|
r.HTMLDoc.Find("div#menu ul li").Each(func(i int, s *goquery.Selection) {
|
||||||
//fmt.Println(s.Html())
|
if url, exists := s.Find("a").Attr("href"); exists == true {
|
||||||
var url, _ = s.Attr("href")
|
g.Get(r.JoinURL2(url), pageParse)
|
||||||
g.Exports <- map[string]interface{}{
|
|
||||||
"title": s.Text(),
|
|
||||||
"url": url,
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
//if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok {
|
}
|
||||||
// g.Get(r.JoinURL(href), quotesParse)
|
|
||||||
//}
|
func pageParse(g *geziyor.Geziyor, r *client.Response) {
|
||||||
|
r.HTMLDoc.Find("div.co_content8 ul b").Each(func(i int, s *goquery.Selection) {
|
||||||
|
if url, exists := s.Find("a").Eq(1).Attr("href"); exists == true {
|
||||||
|
g.Get(r.JoinURL2(url), detailParse)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
// 分页解析
|
||||||
|
r.HTMLDoc.Find("div.x a").Each(func(i int, selection *goquery.Selection) {
|
||||||
|
if href, exists := selection.Attr("href"); exists {
|
||||||
|
g.Get(r.JoinURL2(href), pageParse)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
//htmlquery.Find(r.HTMLNode, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
func detailParse(g *geziyor.Geziyor, r *client.Response) {
|
||||||
|
s := r.HTMLDoc.Find("body")
|
||||||
|
url := r.Request.URL.String()
|
||||||
|
title := s.Find("div.title_all h1").Text()
|
||||||
|
html, _ := s.Find("div#Zoom").Html()
|
||||||
|
text := s.Find("div#Zoom").Text()
|
||||||
|
image, _ := s.Find("div#Zoom").Find("img").Attr("src")
|
||||||
|
|
||||||
|
g.Exports <- map[string]interface{}{
|
||||||
|
"image": image,
|
||||||
|
"text": text,
|
||||||
|
"html": filter.FilterScriptTags(html),
|
||||||
|
"url": url,
|
||||||
|
"title": title,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
30
export/meilisearch.go
Normal file
30
export/meilisearch.go
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
package export
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"github.com/meilisearch/meilisearch-go"
|
||||||
|
"github.com/rs/xid"
|
||||||
|
)
|
||||||
|
|
||||||
|
type MeiliSearch struct{}
|
||||||
|
|
||||||
|
func (*MeiliSearch) Export(exports chan interface{}) error {
|
||||||
|
client := meilisearch.New("http://localhost:7700", meilisearch.WithAPIKey("123456789"))
|
||||||
|
|
||||||
|
// An index is where the documents are stored.
|
||||||
|
index := client.Index("movies")
|
||||||
|
|
||||||
|
for res := range exports {
|
||||||
|
data := res.(map[string]interface{})
|
||||||
|
guid := xid.New().String()
|
||||||
|
task, err := index.AddDocuments([]map[string]interface{}{
|
||||||
|
{"id": guid, "title": data["title"].(string), "url": data["url"].(string), "html": data["html"].(string)},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
fmt.Println(task.TaskUID)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
11
filter/js.go
Normal file
11
filter/js.go
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
package filter
|
||||||
|
|
||||||
|
import "regexp"
|
||||||
|
|
||||||
|
// scriptTagPattern matches a <script ...> element together with its content.
// Flags: i = case-insensitive, s = let "." match newlines so scripts that
// span several lines are matched too. The closing tag may contain trailing
// whitespace ("</script >").
var scriptTagPattern = regexp.MustCompile(`(?is)<script\b[^>]*>.*?</script\s*>`)

// FilterScriptTags returns input with every <script>...</script> element
// (tag and content, matched case-insensitively) removed. Text without a
// complete script element is returned unchanged.
//
// This is a regular-expression filter, not an HTML sanitizer: it only
// removes well-formed script elements.
func FilterScriptTags(input string) string {
	return scriptTagPattern.ReplaceAllString(input, "")
}
|
14
geziyor.go
14
geziyor.go
@ -162,7 +162,7 @@ func (g *Geziyor) Start() {
|
|||||||
g.Opt.StartRequestsFunc(g)
|
g.Opt.StartRequestsFunc(g)
|
||||||
} else {
|
} else {
|
||||||
for _, startURL := range g.Opt.StartURLs {
|
for _, startURL := range g.Opt.StartURLs {
|
||||||
g.Get(startURL, g.Opt.Encoding, g.Opt.ParseFunc)
|
g.Get(startURL, g.Opt.ParseFunc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -174,8 +174,8 @@ func (g *Geziyor) Start() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get issues a GET to the specified URL.
|
// Get issues a GET to the specified URL.
|
||||||
func (g *Geziyor) Get(url, encoding string, callback func(g *Geziyor, r *client.Response)) {
|
func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response)) {
|
||||||
req, err := client.NewRequest("GET", url, encoding, nil)
|
req, err := client.NewRequest("GET", url, g.Opt.Encoding, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
internal.Logger.Printf("Request creating error %v\n", err)
|
internal.Logger.Printf("Request creating error %v\n", err)
|
||||||
return
|
return
|
||||||
@ -186,8 +186,8 @@ func (g *Geziyor) Get(url, encoding string, callback func(g *Geziyor, r *client.
|
|||||||
// GetRendered issues GET request using headless browser
|
// GetRendered issues GET request using headless browser
|
||||||
// Opens up a new Chrome instance, makes request, waits for rendering HTML DOM and closed.
|
// Opens up a new Chrome instance, makes request, waits for rendering HTML DOM and closed.
|
||||||
// Rendered requests only supported for GET requests.
|
// Rendered requests only supported for GET requests.
|
||||||
func (g *Geziyor) GetRendered(url, encoding string, callback func(g *Geziyor, r *client.Response)) {
|
func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Response)) {
|
||||||
req, err := client.NewRequest("GET", url, encoding, nil)
|
req, err := client.NewRequest("GET", url, g.Opt.Encoding, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
internal.Logger.Printf("Request creating error %v\n", err)
|
internal.Logger.Printf("Request creating error %v\n", err)
|
||||||
return
|
return
|
||||||
@ -198,7 +198,7 @@ func (g *Geziyor) GetRendered(url, encoding string, callback func(g *Geziyor, r
|
|||||||
|
|
||||||
// Head issues a HEAD to the specified URL
|
// Head issues a HEAD to the specified URL
|
||||||
func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) {
|
func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) {
|
||||||
req, err := client.NewRequest("HEAD", url, "", nil)
|
req, err := client.NewRequest("HEAD", url, g.Opt.Encoding, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
internal.Logger.Printf("Request creating error %v\n", err)
|
internal.Logger.Printf("Request creating error %v\n", err)
|
||||||
return
|
return
|
||||||
@ -208,7 +208,7 @@ func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)
|
|||||||
|
|
||||||
// Post issues a POST to the specified URL
|
// Post issues a POST to the specified URL
|
||||||
func (g *Geziyor) Post(url string, body io.Reader, callback func(g *Geziyor, r *client.Response)) {
|
func (g *Geziyor) Post(url string, body io.Reader, callback func(g *Geziyor, r *client.Response)) {
|
||||||
req, err := client.NewRequest("POST", url, "", body)
|
req, err := client.NewRequest("POST", url, g.Opt.Encoding, body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
internal.Logger.Printf("Request creating error %v\n", err)
|
internal.Logger.Printf("Request creating error %v\n", err)
|
||||||
return
|
return
|
||||||
|
6
go.mod
6
go.mod
@ -22,6 +22,8 @@ require (
|
|||||||
require (
|
require (
|
||||||
github.com/VividCortex/gohistogram v1.0.0 // indirect
|
github.com/VividCortex/gohistogram v1.0.0 // indirect
|
||||||
github.com/andybalholm/cascadia v1.3.2 // indirect
|
github.com/andybalholm/cascadia v1.3.2 // indirect
|
||||||
|
github.com/antchfx/htmlquery v1.3.2 // indirect
|
||||||
|
github.com/antchfx/xpath v1.3.1 // indirect
|
||||||
github.com/beorn7/perks v1.0.1 // indirect
|
github.com/beorn7/perks v1.0.1 // indirect
|
||||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||||
github.com/chromedp/sysutil v1.0.0 // indirect
|
github.com/chromedp/sysutil v1.0.0 // indirect
|
||||||
@ -29,16 +31,20 @@ require (
|
|||||||
github.com/gobwas/httphead v0.1.0 // indirect
|
github.com/gobwas/httphead v0.1.0 // indirect
|
||||||
github.com/gobwas/pool v0.2.1 // indirect
|
github.com/gobwas/pool v0.2.1 // indirect
|
||||||
github.com/gobwas/ws v1.4.0 // indirect
|
github.com/gobwas/ws v1.4.0 // indirect
|
||||||
|
github.com/golang-jwt/jwt/v4 v4.5.0 // indirect
|
||||||
|
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
|
||||||
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
|
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
|
||||||
github.com/google/btree v1.1.3 // indirect
|
github.com/google/btree v1.1.3 // indirect
|
||||||
github.com/josharian/intern v1.0.0 // indirect
|
github.com/josharian/intern v1.0.0 // indirect
|
||||||
github.com/kr/text v0.2.0 // indirect
|
github.com/kr/text v0.2.0 // indirect
|
||||||
github.com/mailru/easyjson v0.7.7 // indirect
|
github.com/mailru/easyjson v0.7.7 // indirect
|
||||||
|
github.com/meilisearch/meilisearch-go v0.28.0 // indirect
|
||||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||||
github.com/prometheus/client_model v0.6.1 // indirect
|
github.com/prometheus/client_model v0.6.1 // indirect
|
||||||
github.com/prometheus/common v0.55.0 // indirect
|
github.com/prometheus/common v0.55.0 // indirect
|
||||||
github.com/prometheus/procfs v0.15.1 // indirect
|
github.com/prometheus/procfs v0.15.1 // indirect
|
||||||
|
github.com/rs/xid v1.6.0 // indirect
|
||||||
golang.org/x/sys v0.24.0 // indirect
|
golang.org/x/sys v0.24.0 // indirect
|
||||||
google.golang.org/protobuf v1.34.2 // indirect
|
google.golang.org/protobuf v1.34.2 // indirect
|
||||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||||
|
19
go.sum
19
go.sum
@ -4,6 +4,10 @@ github.com/VividCortex/gohistogram v1.0.0 h1:6+hBz+qvs0JOrrNhhmR7lFxo5sINxBCGXrd
|
|||||||
github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g=
|
github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g=
|
||||||
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
|
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
|
||||||
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
|
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
|
||||||
|
github.com/antchfx/htmlquery v1.3.2 h1:85YdttVkR1rAY+Oiv/nKI4FCimID+NXhDn82kz3mEvs=
|
||||||
|
github.com/antchfx/htmlquery v1.3.2/go.mod h1:1mbkcEgEarAokJiWhTfr4hR06w/q2ZZjnYLrDt6CTUk=
|
||||||
|
github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk=
|
||||||
|
github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
|
||||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||||
@ -34,6 +38,10 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
|
|||||||
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
|
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
|
||||||
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
|
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
|
||||||
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
|
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
|
||||||
|
github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg=
|
||||||
|
github.com/golang-jwt/jwt/v4 v4.5.0/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
|
||||||
|
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
|
||||||
|
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
|
||||||
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||||
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w=
|
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w=
|
||||||
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||||
@ -53,6 +61,8 @@ github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kUL
|
|||||||
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
|
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
|
||||||
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
|
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
|
||||||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
||||||
|
github.com/meilisearch/meilisearch-go v0.28.0 h1:f3XJ66ZM+R8bANAOLqsjvoq/HhQNpVJPYoNt6QgNzME=
|
||||||
|
github.com/meilisearch/meilisearch-go v0.28.0/go.mod h1:Szcc9CaDiKIfjdgdt49jlmDKpEzjD+x+b6Y6heMdlQ0=
|
||||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||||
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
|
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
|
||||||
@ -77,8 +87,15 @@ github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoG
|
|||||||
github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc=
|
github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc=
|
||||||
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||||
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||||
|
github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU=
|
||||||
|
github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0=
|
||||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||||
|
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||||
|
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||||
|
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||||
|
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||||
|
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||||
github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE=
|
github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE=
|
||||||
@ -95,6 +112,7 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
|
|||||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||||
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||||
|
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||||
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
|
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
|
||||||
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
|
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
|
||||||
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
|
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
|
||||||
@ -144,5 +162,6 @@ gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWD
|
|||||||
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||||
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
|
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
|
||||||
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
|
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
|
||||||
|
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
@ -3,10 +3,18 @@ package middleware
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
"github.com/antchfx/htmlquery"
|
||||||
|
"golang.org/x/text/encoding/simplifiedchinese"
|
||||||
|
"golang.org/x/text/transform"
|
||||||
"softdown.com/shusou/geziyor/client"
|
"softdown.com/shusou/geziyor/client"
|
||||||
"softdown.com/shusou/geziyor/internal"
|
"softdown.com/shusou/geziyor/internal"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Values for the Encoding setting of a request.
const (
	// EncodingGBK marks a response body as GBK/GB2312 encoded;
	// ProcessResponse decodes such bodies with the GB18030 decoder.
	EncodingGBK = "gb2312"
	// EncodingUTF8 marks a response body as UTF-8 encoded. It must differ
	// from EncodingGBK, otherwise UTF-8 pages would be decoded as GBK.
	EncodingUTF8 = "utf-8"
)
|
||||||
|
|
||||||
// ParseHTML parses response if response is HTML
|
// ParseHTML parses response if response is HTML
|
||||||
type ParseHTML struct {
|
type ParseHTML struct {
|
||||||
ParseHTMLDisabled bool
|
ParseHTMLDisabled bool
|
||||||
@ -14,11 +22,38 @@ type ParseHTML struct {
|
|||||||
|
|
||||||
func (p *ParseHTML) ProcessResponse(r *client.Response) {
|
func (p *ParseHTML) ProcessResponse(r *client.Response) {
|
||||||
if !p.ParseHTMLDisabled && r.IsHTML() {
|
if !p.ParseHTMLDisabled && r.IsHTML() {
|
||||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
|
if r.Request.Encoding == EncodingGBK {
|
||||||
if err != nil {
|
reader := transform.NewReader(bytes.NewReader(r.Body), simplifiedchinese.GB18030.NewDecoder())
|
||||||
internal.Logger.Println(err.Error())
|
doc, err := goquery.NewDocumentFromReader(reader)
|
||||||
return
|
if err != nil {
|
||||||
|
internal.Logger.Println(err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
r.HTMLDoc = doc
|
||||||
|
|
||||||
|
// xpath
|
||||||
|
node, err := htmlquery.Parse(reader)
|
||||||
|
if err != nil {
|
||||||
|
internal.Logger.Println(err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
r.HTMLNode = node
|
||||||
|
|
||||||
|
} else {
|
||||||
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
|
||||||
|
if err != nil {
|
||||||
|
internal.Logger.Println(err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
r.HTMLDoc = doc
|
||||||
|
|
||||||
|
// xpath
|
||||||
|
node, err := htmlquery.Parse(bytes.NewReader(r.Body))
|
||||||
|
if err != nil {
|
||||||
|
internal.Logger.Println(err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
r.HTMLNode = node
|
||||||
}
|
}
|
||||||
r.HTMLDoc = doc
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -124,6 +124,7 @@ type Options struct {
|
|||||||
// Default: "Geziyor 1.0"
|
// Default: "Geziyor 1.0"
|
||||||
UserAgent string
|
UserAgent string
|
||||||
|
|
||||||
// 网页编码
|
// Optional response body encoding. Leave empty for automatic detection.
|
||||||
|
// If you're having issues with auto detection, set this.
|
||||||
Encoding string
|
Encoding string
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user