geziyor/cmd/main.go
Administrator 88f37ecc2d 备份
2024-09-05 18:16:17 +08:00

61 lines
1.6 KiB
Go

package main
import (
"github.com/PuerkitoBio/goquery"
"softdown.com/shusou/geziyor"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
"softdown.com/shusou/geziyor/filter"
"softdown.com/shusou/geziyor/middleware"
)
// main builds the crawler options and starts the crawl. The crawl begins at
// the dytt8 index page, uses middleware.EncodingGBK as the configured
// encoding, hands the start page to menuParse, and registers the MeiliSearch
// exporter for exported items.
func main() {
	opts := &geziyor.Options{
		StartURLs: []string{"https://dytt.dytt8.net/index.htm"},
		Encoding:  middleware.EncodingGBK,
		ParseFunc: menuParse,
		Exporters: []export.Exporter{&export.MeiliSearch{}},
	}

	crawler := geziyor.NewGeziyor(opts)
	crawler.Start()
}
// menuParse handles the start page. For each "div#menu ul li" element it
// looks up the anchors inside it, reads the href attribute (goquery's Attr
// reports the value for the first matched element), and requests that link
// with pageParse as the callback. Menu entries without an href are skipped.
func menuParse(g *geziyor.Geziyor, r *client.Response) {
	r.HTMLDoc.Find("div#menu ul li").Each(func(_ int, item *goquery.Selection) {
		// The href is passed through r.JoinURL2 before being requested;
		// presumably this resolves relative links against the response URL —
		// confirm against the client package.
		if href, ok := item.Find("a").Attr("href"); ok {
			g.Get(r.JoinURL2(href), pageParse)
		}
	})
}
// pageParse handles a listing page. It does two things:
//
//  1. For each "div.co_content8 ul b" element it takes the second anchor
//     (index 1) and, if it has an href, requests it with detailParse as the
//     callback.
//  2. For each anchor under "div.x" (the pagination links) that has an href,
//     it requests that page with pageParse itself as the callback, so further
//     listing pages are processed the same way.
func pageParse(g *geziyor.Geziyor, r *client.Response) {
	// Entry links. Eq(1) selects the second <a> inside each <b>;
	// NOTE(review): presumably the first anchor is a category link rather than
	// the entry itself — confirm against the page markup.
	r.HTMLDoc.Find("div.co_content8 ul b").Each(func(_ int, entry *goquery.Selection) {
		if href, ok := entry.Find("a").Eq(1).Attr("href"); ok {
			g.Get(r.JoinURL2(href), detailParse)
		}
	})

	// Pagination parsing.
	r.HTMLDoc.Find("div.x a").Each(func(_ int, link *goquery.Selection) {
		if href, ok := link.Attr("href"); ok {
			g.Get(r.JoinURL2(href), pageParse)
		}
	})
}
// detailParse handles a detail page and sends one item on g.Exports. The item
// is a map with these keys:
//
//	"url"   - the request URL of this response
//	"title" - text of "div.title_all h1"
//	"html"  - inner HTML of "div#Zoom" after filter.FilterScriptTags
//	"text"  - text content of "div#Zoom"
//	"image" - src attribute of the img lookup inside "div#Zoom"
func detailParse(g *geziyor.Geziyor, r *client.Response) {
	body := r.HTMLDoc.Find("body")
	zoom := body.Find("div#Zoom")

	// The error from Html and the presence flag from Attr are discarded, so
	// the item is exported with whatever values those calls return even when
	// the content block or the image is missing.
	markup, _ := zoom.Html()
	poster, _ := zoom.Find("img").Attr("src")

	item := map[string]interface{}{
		"image": poster,
		"text":  zoom.Text(),
		"html":  filter.FilterScriptTags(markup),
		"url":   r.Request.URL.String(),
		"title": body.Find("div.title_all h1").Text(),
	}
	g.Exports <- item
}