初始化

This commit is contained in:
Administrator 2024-09-04 16:48:42 +08:00
parent 229b8ca83a
commit 688c516c9f
26 changed files with 126 additions and 1936 deletions

3
cache/cache.go vendored
View File

@ -11,11 +11,11 @@ import (
"bufio" "bufio"
"bytes" "bytes"
"errors" "errors"
"github.com/geziyor/geziyor/cache/memorycache"
"io" "io"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
"net/http/httputil" "net/http/httputil"
"softdown.com/shusou/geziyor/cache/memorycache"
"strings" "strings"
"testing" "testing"
"time" "time"
@ -122,7 +122,6 @@ func varyMatches(cachedResp *http.Response, req *http.Request) bool {
// RoundTrip is a wrapper for caching requests. // RoundTrip is a wrapper for caching requests.
// If there is a fresh Response already in cache, then it will be returned without connecting to // If there is a fresh Response already in cache, then it will be returned without connecting to
// the server. // the server.
//
func (t *Transport) RoundTrip(req *http.Request) (resp *http.Response, err error) { func (t *Transport) RoundTrip(req *http.Request) (resp *http.Response, err error) {
if t.Policy == Dummy { if t.Policy == Dummy {
return t.RoundTripDummy(req) return t.RoundTripDummy(req)

2
cache/cache_test.go vendored
View File

@ -4,12 +4,12 @@ import (
"bytes" "bytes"
"errors" "errors"
"flag" "flag"
"github.com/geziyor/geziyor/cache/memorycache"
"io" "io"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"os" "os"
"softdown.com/shusou/geziyor/cache/memorycache"
"strconv" "strconv"
"testing" "testing"
"time" "time"

View File

@ -1,9 +1,9 @@
package diskcache package diskcache
import ( import (
"github.com/geziyor/geziyor/cache"
"io/ioutil" "io/ioutil"
"os" "os"
"softdown.com/shusou/geziyor/cache"
"testing" "testing"
) )

View File

@ -1,10 +1,10 @@
package leveldbcache package leveldbcache
import ( import (
"github.com/geziyor/geziyor/cache"
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"softdown.com/shusou/geziyor/cache"
"testing" "testing"
) )

View File

@ -13,9 +13,9 @@ import (
"github.com/chromedp/cdproto/dom" "github.com/chromedp/cdproto/dom"
"github.com/chromedp/cdproto/network" "github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp" "github.com/chromedp/chromedp"
"github.com/geziyor/geziyor/internal"
"golang.org/x/net/html/charset" "golang.org/x/net/html/charset"
"golang.org/x/text/transform" "golang.org/x/text/transform"
"softdown.com/shusou/geziyor/internal"
) )
var ( var (
@ -47,7 +47,7 @@ type Options struct {
// Default values for client // Default values for client
const ( const (
DefaultUserAgent = "Geziyor 1.0" DefaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
DefaultRetryTimes = 2 DefaultRetryTimes = 2
) )

View File

@ -6,7 +6,7 @@ import (
"net/url" "net/url"
"sync/atomic" "sync/atomic"
"github.com/geziyor/geziyor/internal" "softdown.com/shusou/geziyor/internal"
) )
type ProxyURLKey int type ProxyURLKey int

View File

@ -40,15 +40,16 @@ func (r *Request) Cancel() {
} }
// NewRequest returns a new Request given a method, URL, and optional body. // NewRequest returns a new Request given a method, URL, and optional body.
func NewRequest(method, url string, body io.Reader) (*Request, error) { func NewRequest(method, url, encoding string, body io.Reader) (*Request, error) {
req, err := http.NewRequest(method, url, body) req, err := http.NewRequest(method, url, body)
if err != nil { if err != nil {
return nil, err return nil, err
} }
request := Request{ request := Request{
Request: req, Request: req,
Meta: make(map[string]interface{}), Encoding: encoding,
Meta: make(map[string]interface{}),
} }
return &request, nil return &request, nil

View File

@ -6,7 +6,7 @@ import (
) )
func TestMeta(t *testing.T) { func TestMeta(t *testing.T) {
req, err := NewRequest("GET", "https://github.com/geziyor/geziyor", nil) req, err := NewRequest("GET", "https://softdown.com/shusou/geziyor", nil)
assert.NoError(t, err) assert.NoError(t, err)
req.Meta["key"] = "value" req.Meta["key"] = "value"

31
cmd/main.go Normal file
View File

@ -0,0 +1,31 @@
package main
import (
"github.com/PuerkitoBio/goquery"
"softdown.com/shusou/geziyor"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
)
// main builds a crawler for a single start page, declares the page encoding
// as gb2312, routes every response to quotesParse, and registers a JSON
// exporter for the items that the parse callback emits.
func main() {
	opts := &geziyor.Options{
		StartURLs: []string{"https://dytt.dytt8.net/index.htm"},
		Encoding:  "gb2312",
		ParseFunc: quotesParse,
		Exporters: []export.Exporter{&export.JSON{}},
	}

	crawler := geziyor.NewGeziyor(opts)
	crawler.Start()
}
// quotesParse is the parse callback for the crawl. It selects every anchor
// matching "div.co_content2 ul a" in the parsed response document and sends
// one record per anchor to g.Exports, carrying the anchor text as "title" and
// its href attribute as "url".
func quotesParse(g *geziyor.Geziyor, r *client.Response) {
	links := r.HTMLDoc.Find("div.co_content2 ul a")
	links.Each(func(_ int, link *goquery.Selection) {
		// The "exists" flag returned by Attr is discarded, so an anchor
		// without an href is still exported, with an empty "url".
		href, _ := link.Attr("href")
		record := map[string]interface{}{
			"title": link.Text(),
			"url":   href,
		}
		g.Exports <- record
	})

	// Following the "next page" link is currently disabled.
	// NOTE(review): the snippet below uses the two-argument form of g.Get;
	// confirm the current Get signature before re-enabling it.
	//if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok {
	//	g.Get(r.JoinURL(href), quotesParse)
	//}
}

View File

@ -3,9 +3,9 @@ package export
import ( import (
"encoding/csv" "encoding/csv"
"fmt" "fmt"
"github.com/geziyor/geziyor/internal"
"os" "os"
"reflect" "reflect"
"softdown.com/shusou/geziyor/internal"
"sort" "sort"
) )

View File

@ -4,8 +4,8 @@ import (
"bytes" "bytes"
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/geziyor/geziyor/internal"
"os" "os"
"softdown.com/shusou/geziyor/internal"
) )
// JSONLine exports response data as JSON streaming file // JSONLine exports response data as JSON streaming file

View File

@ -2,13 +2,13 @@ package geziyor
import ( import (
"github.com/chromedp/chromedp" "github.com/chromedp/chromedp"
"github.com/geziyor/geziyor/cache"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/internal"
"github.com/geziyor/geziyor/metrics"
"github.com/geziyor/geziyor/middleware"
"golang.org/x/time/rate" "golang.org/x/time/rate"
"softdown.com/shusou/geziyor/cache"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
"softdown.com/shusou/geziyor/internal"
"softdown.com/shusou/geziyor/metrics"
"softdown.com/shusou/geziyor/middleware"
"io" "io"
"io/ioutil" "io/ioutil"
@ -162,7 +162,7 @@ func (g *Geziyor) Start() {
g.Opt.StartRequestsFunc(g) g.Opt.StartRequestsFunc(g)
} else { } else {
for _, startURL := range g.Opt.StartURLs { for _, startURL := range g.Opt.StartURLs {
g.Get(startURL, g.Opt.ParseFunc) g.Get(startURL, g.Opt.Encoding, g.Opt.ParseFunc)
} }
} }
@ -174,8 +174,8 @@ func (g *Geziyor) Start() {
} }
// Get issues a GET to the specified URL. // Get issues a GET to the specified URL.
func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) Get(url, encoding string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, nil) req, err := client.NewRequest("GET", url, encoding, nil)
if err != nil { if err != nil {
internal.Logger.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return
@ -186,8 +186,8 @@ func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response))
// GetRendered issues GET request using headless browser // GetRendered issues GET request using headless browser
// Opens up a new Chrome instance, makes request, waits for rendering HTML DOM and closed. // Opens up a new Chrome instance, makes request, waits for rendering HTML DOM and closed.
// Rendered requests only supported for GET requests. // Rendered requests only supported for GET requests.
func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) GetRendered(url, encoding string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, nil) req, err := client.NewRequest("GET", url, encoding, nil)
if err != nil { if err != nil {
internal.Logger.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return
@ -198,7 +198,7 @@ func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Re
// Head issues a HEAD to the specified URL // Head issues a HEAD to the specified URL
func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("HEAD", url, nil) req, err := client.NewRequest("HEAD", url, "", nil)
if err != nil { if err != nil {
internal.Logger.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return
@ -208,7 +208,7 @@ func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)
// Post issues a POST to the specified URL // Post issues a POST to the specified URL
func (g *Geziyor) Post(url string, body io.Reader, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) Post(url string, body io.Reader, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("POST", url, body) req, err := client.NewRequest("POST", url, "", body)
if err != nil { if err != nil {
internal.Logger.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return

View File

@ -17,14 +17,14 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/elazarl/goproxy" "github.com/elazarl/goproxy"
"github.com/fortytw2/leaktest" "github.com/fortytw2/leaktest"
"github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/cache"
"github.com/geziyor/geziyor/cache/diskcache"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/internal"
"github.com/geziyor/geziyor/metrics"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"softdown.com/shusou/geziyor"
"softdown.com/shusou/geziyor/cache"
"softdown.com/shusou/geziyor/cache/diskcache"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
"softdown.com/shusou/geziyor/internal"
"softdown.com/shusou/geziyor/metrics"
) )
func TestSimple(t *testing.T) { func TestSimple(t *testing.T) {

33
go.mod
View File

@ -1,24 +1,45 @@
module github.com/geziyor/geziyor module softdown.com/shusou/geziyor
go 1.15 go 1.22
require ( require (
github.com/PuerkitoBio/goquery v1.9.2 github.com/PuerkitoBio/goquery v1.9.2
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/chromedp/cdproto v0.0.0-20240810084448-b931b754e476 github.com/chromedp/cdproto v0.0.0-20240810084448-b931b754e476
github.com/chromedp/chromedp v0.10.0 github.com/chromedp/chromedp v0.10.0
github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4 github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4
github.com/fortytw2/leaktest v1.3.0 github.com/fortytw2/leaktest v1.3.0
github.com/go-kit/kit v0.13.0 github.com/go-kit/kit v0.13.0
github.com/golang/protobuf v1.5.4 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible github.com/peterbourgon/diskv v2.0.1+incompatible
github.com/prometheus/client_golang v1.19.1 github.com/prometheus/client_golang v1.19.1
github.com/prometheus/common v0.55.0 // indirect
github.com/stretchr/testify v1.9.0 github.com/stretchr/testify v1.9.0
github.com/syndtr/goleveldb v1.0.0 github.com/syndtr/goleveldb v1.0.0
github.com/temoto/robotstxt v1.1.2 github.com/temoto/robotstxt v1.1.2
golang.org/x/net v0.28.0 golang.org/x/net v0.28.0
golang.org/x/sys v0.24.0 // indirect
golang.org/x/text v0.17.0 golang.org/x/text v0.17.0
golang.org/x/time v0.6.0 golang.org/x/time v0.6.0
) )
require (
github.com/VividCortex/gohistogram v1.0.0 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
github.com/google/btree v1.1.3 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
golang.org/x/sys v0.24.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

1880
go.sum

File diff suppressed because it is too large Load Diff

View File

@ -1,8 +1,8 @@
package middleware package middleware
import ( import (
"github.com/geziyor/geziyor/client" "softdown.com/shusou/geziyor/client"
"github.com/geziyor/geziyor/internal" "softdown.com/shusou/geziyor/internal"
"sync" "sync"
) )

View File

@ -1,8 +1,8 @@
package middleware package middleware
import ( import (
"github.com/geziyor/geziyor/client"
"math/rand" "math/rand"
"softdown.com/shusou/geziyor/client"
"time" "time"
) )

View File

@ -1,8 +1,8 @@
package middleware package middleware
import ( import (
"github.com/geziyor/geziyor/client" "softdown.com/shusou/geziyor/client"
"github.com/geziyor/geziyor/internal" "softdown.com/shusou/geziyor/internal"
"sync" "sync"
) )

View File

@ -1,8 +1,8 @@
package middleware package middleware
import ( import (
"github.com/geziyor/geziyor/client"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"softdown.com/shusou/geziyor/client"
"strings" "strings"
"testing" "testing"
) )

View File

@ -1,7 +1,7 @@
package middleware package middleware
import ( import (
"github.com/geziyor/geziyor/client" "softdown.com/shusou/geziyor/client"
) )
// Headers sets default request headers // Headers sets default request headers

View File

@ -1,8 +1,8 @@
package middleware package middleware
import ( import (
"github.com/geziyor/geziyor/client" "softdown.com/shusou/geziyor/client"
"github.com/geziyor/geziyor/internal" "softdown.com/shusou/geziyor/internal"
) )
// LogStats logs responses // LogStats logs responses

View File

@ -1,8 +1,8 @@
package middleware package middleware
import ( import (
"github.com/geziyor/geziyor/client" "softdown.com/shusou/geziyor/client"
"github.com/geziyor/geziyor/metrics" "softdown.com/shusou/geziyor/metrics"
"strconv" "strconv"
) )

View File

@ -1,7 +1,7 @@
package middleware package middleware
import ( import (
"github.com/geziyor/geziyor/client" "softdown.com/shusou/geziyor/client"
) )
// RequestResponseProcessor interface is for middlewares that needs to process both requests and responses // RequestResponseProcessor interface is for middlewares that needs to process both requests and responses

View File

@ -3,8 +3,8 @@ package middleware
import ( import (
"bytes" "bytes"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/client" "softdown.com/shusou/geziyor/client"
"github.com/geziyor/geziyor/internal" "softdown.com/shusou/geziyor/internal"
) )
// ParseHTML parses response if response is HTML // ParseHTML parses response if response is HTML

View File

@ -1,10 +1,11 @@
package middleware package middleware
import ( import (
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal"
"github.com/geziyor/geziyor/metrics"
"github.com/temoto/robotstxt" "github.com/temoto/robotstxt"
"net/http"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/internal"
"softdown.com/shusou/geziyor/metrics"
"strconv" "strconv"
"sync" "sync"
) )
@ -38,7 +39,7 @@ func (m *RobotsTxt) ProcessRequest(r *client.Request) {
m.mut.RUnlock() m.mut.RUnlock()
if !exists { if !exists {
robotsReq, err := client.NewRequest("GET", r.URL.Scheme+"://"+r.Host+"/robots.txt", nil) robotsReq, err := client.NewRequest(http.MethodGet, r.URL.Scheme+"://"+r.Host+"/robots.txt", "", nil)
if err != nil { if err != nil {
return // Don't Do anything return // Don't Do anything
} }

View File

@ -2,13 +2,13 @@ package geziyor
import ( import (
"github.com/chromedp/chromedp" "github.com/chromedp/chromedp"
"github.com/geziyor/geziyor/cache"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/metrics"
"github.com/geziyor/geziyor/middleware"
"net/http" "net/http"
"net/url" "net/url"
"softdown.com/shusou/geziyor/cache"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
"softdown.com/shusou/geziyor/metrics"
"softdown.com/shusou/geziyor/middleware"
"time" "time"
) )
@ -123,4 +123,7 @@ type Options struct {
// User Agent. // User Agent.
// Default: "Geziyor 1.0" // Default: "Geziyor 1.0"
UserAgent string UserAgent string
// 网页编码
Encoding string
} }