From 88f37ecc2d3b1335efdb458eec25230e523a8a1b Mon Sep 17 00:00:00 2001 From: Administrator Date: Thu, 5 Sep 2024 18:16:17 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A4=87=E4=BB=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- client/response.go | 13 ++++++++++ cmd/main.go | 55 ++++++++++++++++++++++++++++++---------- export/meilisearch.go | 30 ++++++++++++++++++++++ filter/js.go | 11 ++++++++ geziyor.go | 14 +++++----- go.mod | 6 +++++ go.sum | 19 ++++++++++++++ middleware/parse_html.go | 45 ++++++++++++++++++++++++++++---- options.go | 3 ++- 9 files changed, 170 insertions(+), 26 deletions(-) create mode 100644 export/meilisearch.go create mode 100644 filter/js.go diff --git a/client/response.go b/client/response.go index 57a24c6..358268e 100644 --- a/client/response.go +++ b/client/response.go @@ -2,6 +2,7 @@ package client import ( "github.com/PuerkitoBio/goquery" + "golang.org/x/net/html" "net/http" "net/url" "strings" @@ -18,6 +19,9 @@ type Response struct { // Goquery Document object. If response IsHTML, its non-nil. HTMLDoc *goquery.Document + // xpath Document object. If response IsHTML, its non-nil. + HTMLNode *html.Node + Request *Request } @@ -33,6 +37,15 @@ func (r *Response) JoinURL(relativeURL string) string { return joinedURL.String() } +// JoinURL2 joins base response URL and provided relative URL. +func (r *Response) JoinURL2(relativeURL string) string { + joinedURL, err := r.Request.URL.Parse(relativeURL) + if err != nil { + return "" + } + return joinedURL.String() +} + // IsHTML checks if response content is HTML by looking content-type header func (r *Response) IsHTML() bool { contentType := r.Header.Get("Content-Type") diff --git a/cmd/main.go b/cmd/main.go index f8aac10..11b2d59 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -5,27 +5,56 @@ import ( "softdown.com/shusou/geziyor" "softdown.com/shusou/geziyor/client" "softdown.com/shusou/geziyor/export" + "softdown.com/shusou/geziyor/filter" + "softdown.com/shusou/geziyor/middleware" ) func main() { geziyor.NewGeziyor(&geziyor.Options{ StartURLs: []string{"https://dytt.dytt8.net/index.htm"}, - Encoding: "gb2312", - ParseFunc: quotesParse, - Exporters: []export.Exporter{&export.JSON{}}, + Encoding: middleware.EncodingGBK, + ParseFunc: menuParse, + Exporters: []export.Exporter{&export.MeiliSearch{}}, }).Start() } -func quotesParse(g *geziyor.Geziyor, r *client.Response) { - r.HTMLDoc.Find("div.co_content2 ul a").Each(func(i int, s *goquery.Selection) { - //fmt.Println(s.Html()) - var url, _ = s.Attr("href") - g.Exports <- map[string]interface{}{ - "title": s.Text(), - "url": url, +func menuParse(g *geziyor.Geziyor, r *client.Response) { + r.HTMLDoc.Find("div#menu ul li").Each(func(i int, s *goquery.Selection) { + if url, exists := s.Find("a").Attr("href"); exists == true { + g.Get(r.JoinURL2(url), pageParse) } }) - //if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok { - // g.Get(r.JoinURL(href), quotesParse) - //} +} + +func pageParse(g *geziyor.Geziyor, r *client.Response) { + r.HTMLDoc.Find("div.co_content8 ul b").Each(func(i int, s *goquery.Selection) { + if url, exists := s.Find("a").Eq(1).Attr("href"); exists == true { + g.Get(r.JoinURL2(url), detailParse) + } + }) + // 分页解析 + r.HTMLDoc.Find("div.x a").Each(func(i int, selection *goquery.Selection) { + if href, exists := selection.Attr("href"); exists { + g.Get(r.JoinURL2(href), pageParse) + } + }) + + //htmlquery.Find(r.HTMLNode, "") +} + +func detailParse(g *geziyor.Geziyor, r *client.Response) { + s := r.HTMLDoc.Find("body") + url := r.Request.URL.String() + title := s.Find("div.title_all h1").Text() + html, _ := s.Find("div#Zoom").Html() + text := s.Find("div#Zoom").Text() + image, _ := s.Find("div#Zoom").Find("img").Attr("src") + + g.Exports <- map[string]interface{}{ + "image": image, + "text": text, + "html": filter.FilterScriptTags(html), + "url": url, + "title": title, + } } diff --git a/export/meilisearch.go b/export/meilisearch.go new file mode 100644 index 0000000..1628e14 --- /dev/null +++ b/export/meilisearch.go @@ -0,0 +1,30 @@ +package export + +import ( + "fmt" + "github.com/meilisearch/meilisearch-go" + "github.com/rs/xid" +) + +type MeiliSearch struct{} + +func (*MeiliSearch) Export(exports chan interface{}) error { + client := meilisearch.New("http://localhost:7700", meilisearch.WithAPIKey("123456789")) + + // An index is where the documents are stored. + index := client.Index("movies") + + for res := range exports { + data := res.(map[string]interface{}) + guid := xid.New().String() + task, err := index.AddDocuments([]map[string]interface{}{ + {"id": guid, "title": data["title"].(string), "url": data["url"].(string), "html": data["html"].(string)}, + }) + if err != nil { + return err + } + fmt.Println(task.TaskUID) + } + + return nil +} diff --git a/filter/js.go b/filter/js.go new file mode 100644 index 0000000..42e003f --- /dev/null +++ b/filter/js.go @@ -0,0 +1,11 @@ +package filter + +import "regexp" + +// FilterScriptTags 使用正则表达式过滤 HTML 文本中的 `) + // 替换匹配的部分为空字符串 + return re.ReplaceAllString(input, "") +} diff --git a/geziyor.go b/geziyor.go index 32ce970..4113a58 100644 --- a/geziyor.go +++ b/geziyor.go @@ -162,7 +162,7 @@ func (g *Geziyor) Start() { g.Opt.StartRequestsFunc(g) } else { for _, startURL := range g.Opt.StartURLs { - g.Get(startURL, g.Opt.Encoding, g.Opt.ParseFunc) + g.Get(startURL, g.Opt.ParseFunc) } } @@ -174,8 +174,8 @@ func (g *Geziyor) Start() { } // Get issues a GET to the specified URL. -func (g *Geziyor) Get(url, encoding string, callback func(g *Geziyor, r *client.Response)) { - req, err := client.NewRequest("GET", url, encoding, nil) +func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response)) { + req, err := client.NewRequest("GET", url, g.Opt.Encoding, nil) if err != nil { internal.Logger.Printf("Request creating error %v\n", err) return @@ -186,8 +186,8 @@ func (g *Geziyor) Get(url, encoding string, callback func(g *Geziyor, r *client. // GetRendered issues GET request using headless browser // Opens up a new Chrome instance, makes request, waits for rendering HTML DOM and closed. // Rendered requests only supported for GET requests. -func (g *Geziyor) GetRendered(url, encoding string, callback func(g *Geziyor, r *client.Response)) { - req, err := client.NewRequest("GET", url, encoding, nil) +func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Response)) { + req, err := client.NewRequest("GET", url, g.Opt.Encoding, nil) if err != nil { internal.Logger.Printf("Request creating error %v\n", err) return @@ -198,7 +198,7 @@ func (g *Geziyor) GetRendered(url, encoding string, callback func(g *Geziyor, r // Head issues a HEAD to the specified URL func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) { - req, err := client.NewRequest("HEAD", url, "", nil) + req, err := client.NewRequest("HEAD", url, g.Opt.Encoding, nil) if err != nil { internal.Logger.Printf("Request creating error %v\n", err) return @@ -208,7 +208,7 @@ func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response) // Post issues a POST to the specified URL func (g *Geziyor) Post(url string, body io.Reader, callback func(g *Geziyor, r *client.Response)) { - req, err := client.NewRequest("POST", url, "", body) + req, err := client.NewRequest("POST", url, g.Opt.Encoding, body) if err != nil { internal.Logger.Printf("Request creating error %v\n", err) return diff --git a/go.mod b/go.mod index a2e6a78..0edc72f 100644 --- a/go.mod +++ b/go.mod @@ -22,6 +22,8 @@ require ( require ( github.com/VividCortex/gohistogram v1.0.0 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/antchfx/htmlquery v1.3.2 // indirect + github.com/antchfx/xpath v1.3.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chromedp/sysutil v1.0.0 // indirect @@ -29,16 +31,20 @@ require ( github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect + github.com/golang-jwt/jwt/v4 v4.5.0 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect github.com/google/btree v1.1.3 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/kr/text v0.2.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/meilisearch/meilisearch-go v0.28.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.55.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect + github.com/rs/xid v1.6.0 // indirect golang.org/x/sys v0.24.0 // indirect google.golang.org/protobuf v1.34.2 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 8084f8b..0366785 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,10 @@ github.com/VividCortex/gohistogram v1.0.0 h1:6+hBz+qvs0JOrrNhhmR7lFxo5sINxBCGXrd github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/antchfx/htmlquery v1.3.2 h1:85YdttVkR1rAY+Oiv/nKI4FCimID+NXhDn82kz3mEvs= +github.com/antchfx/htmlquery v1.3.2/go.mod h1:1mbkcEgEarAokJiWhTfr4hR06w/q2ZZjnYLrDt6CTUk= +github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk= +github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -34,6 +38,10 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= +github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg= +github.com/golang-jwt/jwt/v4 v4.5.0/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w= github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= @@ -53,6 +61,8 @@ github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kUL github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/meilisearch/meilisearch-go v0.28.0 h1:f3XJ66ZM+R8bANAOLqsjvoq/HhQNpVJPYoNt6QgNzME= +github.com/meilisearch/meilisearch-go v0.28.0/go.mod h1:Szcc9CaDiKIfjdgdt49jlmDKpEzjD+x+b6Y6heMdlQ0= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= @@ -77,8 +87,15 @@ github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoG github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU= +github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= @@ -95,6 +112,7 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= @@ -144,5 +162,6 @@ gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWD gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/middleware/parse_html.go b/middleware/parse_html.go index 06370ed..a00ea47 100644 --- a/middleware/parse_html.go +++ b/middleware/parse_html.go @@ -3,10 +3,18 @@ package middleware import ( "bytes" "github.com/PuerkitoBio/goquery" + "github.com/antchfx/htmlquery" + "golang.org/x/text/encoding/simplifiedchinese" + "golang.org/x/text/transform" "softdown.com/shusou/geziyor/client" "softdown.com/shusou/geziyor/internal" ) +const ( + EncodingGBK = "gb2312" + EncodingUTF8 = "gb2312" +) + // ParseHTML parses response if response is HTML type ParseHTML struct { ParseHTMLDisabled bool @@ -14,11 +22,38 @@ type ParseHTML struct { func (p *ParseHTML) ProcessResponse(r *client.Response) { if !p.ParseHTMLDisabled && r.IsHTML() { - doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) - if err != nil { - internal.Logger.Println(err.Error()) - return + if r.Request.Encoding == EncodingGBK { + reader := transform.NewReader(bytes.NewReader(r.Body), simplifiedchinese.GB18030.NewDecoder()) + doc, err := goquery.NewDocumentFromReader(reader) + if err != nil { + internal.Logger.Println(err.Error()) + return + } + r.HTMLDoc = doc + + // xpath + node, err := htmlquery.Parse(reader) + if err != nil { + internal.Logger.Println(err.Error()) + return + } + r.HTMLNode = node + + } else { + doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) + if err != nil { + internal.Logger.Println(err.Error()) + return + } + r.HTMLDoc = doc + + // xpath + node, err := htmlquery.Parse(bytes.NewReader(r.Body)) + if err != nil { + internal.Logger.Println(err.Error()) + return + } + r.HTMLNode = node } - r.HTMLDoc = doc } } diff --git a/options.go b/options.go index 570875a..079f71c 100644 --- a/options.go +++ b/options.go @@ -124,6 +124,7 @@ type Options struct { // Default: "Geziyor 1.0" UserAgent string - // 网页编码 + // Optional response body encoding. Leave empty for automatic detection. + // If you're having issues with auto detection, set this. Encoding string }