This commit is contained in:
Administrator
2024-09-05 18:16:17 +08:00
parent 688c516c9f
commit 88f37ecc2d
9 changed files with 170 additions and 26 deletions

View File

@ -5,27 +5,56 @@ import (
"softdown.com/shusou/geziyor"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
"softdown.com/shusou/geziyor/filter"
"softdown.com/shusou/geziyor/middleware"
)
// main builds the crawler and runs it. The crawl starts from the site index
// page, the response encoding is set to middleware.EncodingGBK, menuParse is
// the handler for the start URL, and scraped records are exported through
// export.MeiliSearch.
//
// Each option is set exactly once: a struct literal may not name the same
// field twice, so the superseded "gb2312" / quotesParse / export.JSON entries
// are not kept alongside the current ones.
func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: []string{"https://dytt.dytt8.net/index.htm"},
		Encoding:  middleware.EncodingGBK,
		ParseFunc: menuParse,
		Exporters: []export.Exporter{&export.MeiliSearch{}},
	}).Start()
}
func quotesParse(g *geziyor.Geziyor, r *client.Response) {
r.HTMLDoc.Find("div.co_content2 ul a").Each(func(i int, s *goquery.Selection) {
//fmt.Println(s.Html())
var url, _ = s.Attr("href")
g.Exports <- map[string]interface{}{
"title": s.Text(),
"url": url,
// menuParse handles the response for the start URL. For every <li> under
// div#menu it reads the href of the <a> inside it, resolves it with
// r.JoinURL2, and issues a request whose handler is pageParse. Items that
// carry no href are skipped.
func menuParse(g *geziyor.Geziyor, r *client.Response) {
	menuItems := r.HTMLDoc.Find("div#menu ul li")
	menuItems.Each(func(_ int, item *goquery.Selection) {
		href, ok := item.Find("a").Attr("href")
		if !ok {
			return // nothing to follow for this menu entry
		}
		g.Get(r.JoinURL2(href), pageParse)
	})
}
// pageParse handles a listing page. It queues a detailParse request for each
// listed entry, then queues further pageParse requests for every pagination
// link it finds.
func pageParse(g *geziyor.Geziyor, r *client.Response) {
	doc := r.HTMLDoc

	// Listing entries: every <b> under div.co_content8 ul. The link that is
	// followed is the second <a> inside the entry (Eq uses a zero-based index).
	doc.Find("div.co_content8 ul b").Each(func(_ int, entry *goquery.Selection) {
		detailHref, ok := entry.Find("a").Eq(1).Attr("href")
		if !ok {
			return
		}
		g.Get(r.JoinURL2(detailHref), detailParse)
	})

	// Pagination: follow every link inside div.x back into this handler.
	doc.Find("div.x a").Each(func(_ int, link *goquery.Selection) {
		pageHref, ok := link.Attr("href")
		if !ok {
			return
		}
		g.Get(r.JoinURL2(pageHref), pageParse)
	})
}
// detailParse handles a detail page and sends one record to g.Exports with
// the keys "url", "title", "text", "html" and "image":
//
//   - url:   the URL of the request that produced this response
//   - title: text of div.title_all h1
//   - text:  text content of div#Zoom
//   - html:  inner HTML of div#Zoom passed through filter.FilterScriptTags
//   - image: src attribute of the <img> selection inside div#Zoom
func detailParse(g *geziyor.Geziyor, r *client.Response) {
	body := r.HTMLDoc.Find("body")
	zoom := body.Find("div#Zoom")

	// The error from Html and the "exists" flag from Attr are deliberately
	// dropped: the record is exported with whatever values those calls return.
	rawHTML, _ := zoom.Html()
	cover, _ := zoom.Find("img").Attr("src")

	record := map[string]interface{}{
		"url":   r.Request.URL.String(),
		"title": body.Find("div.title_all h1").Text(),
		"text":  zoom.Text(),
		"html":  filter.FilterScriptTags(rawHTML),
		"image": cover,
	}
	g.Exports <- record
}