初始化

This commit is contained in:
Administrator 2024-09-04 16:48:42 +08:00
parent 229b8ca83a
commit 688c516c9f
26 changed files with 126 additions and 1936 deletions

3
cache/cache.go vendored
View File

@ -11,11 +11,11 @@ import (
"bufio"
"bytes"
"errors"
"github.com/geziyor/geziyor/cache/memorycache"
"io"
"io/ioutil"
"net/http"
"net/http/httputil"
"softdown.com/shusou/geziyor/cache/memorycache"
"strings"
"testing"
"time"
@ -122,7 +122,6 @@ func varyMatches(cachedResp *http.Response, req *http.Request) bool {
// RoundTrip is a wrapper for caching requests.
// If there is a fresh Response already in cache, then it will be returned without connecting to
// the server.
//
func (t *Transport) RoundTrip(req *http.Request) (resp *http.Response, err error) {
if t.Policy == Dummy {
return t.RoundTripDummy(req)

2
cache/cache_test.go vendored
View File

@ -4,12 +4,12 @@ import (
"bytes"
"errors"
"flag"
"github.com/geziyor/geziyor/cache/memorycache"
"io"
"io/ioutil"
"net/http"
"net/http/httptest"
"os"
"softdown.com/shusou/geziyor/cache/memorycache"
"strconv"
"testing"
"time"

View File

@ -1,9 +1,9 @@
package diskcache
import (
"github.com/geziyor/geziyor/cache"
"io/ioutil"
"os"
"softdown.com/shusou/geziyor/cache"
"testing"
)

View File

@ -1,10 +1,10 @@
package leveldbcache
import (
"github.com/geziyor/geziyor/cache"
"io/ioutil"
"os"
"path/filepath"
"softdown.com/shusou/geziyor/cache"
"testing"
)

View File

@ -13,9 +13,9 @@ import (
"github.com/chromedp/cdproto/dom"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
"github.com/geziyor/geziyor/internal"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
"softdown.com/shusou/geziyor/internal"
)
var (
@ -47,7 +47,7 @@ type Options struct {
// Default values for client
const (
DefaultUserAgent = "Geziyor 1.0"
DefaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
DefaultRetryTimes = 2
)

View File

@ -6,7 +6,7 @@ import (
"net/url"
"sync/atomic"
"github.com/geziyor/geziyor/internal"
"softdown.com/shusou/geziyor/internal"
)
type ProxyURLKey int

View File

@ -40,15 +40,16 @@ func (r *Request) Cancel() {
}
// NewRequest returns a new Request given a method, URL, page encoding (may be empty), and optional body.
func NewRequest(method, url string, body io.Reader) (*Request, error) {
func NewRequest(method, url, encoding string, body io.Reader) (*Request, error) {
req, err := http.NewRequest(method, url, body)
if err != nil {
return nil, err
}
request := Request{
Request: req,
Meta: make(map[string]interface{}),
Request: req,
Encoding: encoding,
Meta: make(map[string]interface{}),
}
return &request, nil

View File

@ -6,7 +6,7 @@ import (
)
func TestMeta(t *testing.T) {
req, err := NewRequest("GET", "https://github.com/geziyor/geziyor", nil)
req, err := NewRequest("GET", "https://softdown.com/shusou/geziyor", nil)
assert.NoError(t, err)
req.Meta["key"] = "value"

31
cmd/main.go Normal file
View File

@ -0,0 +1,31 @@
package main
import (
"github.com/PuerkitoBio/goquery"
"softdown.com/shusou/geziyor"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
)
// main configures a crawler for a single start page, declares the page
// encoding as gb2312, routes responses to quotesParse and exports the
// collected records through the JSON exporter, then runs the crawl.
func main() {
	opts := geziyor.Options{
		StartURLs: []string{"https://dytt.dytt8.net/index.htm"},
		Encoding:  "gb2312",
		ParseFunc: quotesParse,
		Exporters: []export.Exporter{&export.JSON{}},
	}

	crawler := geziyor.NewGeziyor(&opts)
	crawler.Start()
}
// quotesParse walks every anchor matched by "div.co_content2 ul a" in the
// parsed response document and sends one record per anchor to g.Exports.
//
// Each record is a map with two keys: "title" (the anchor's text) and "url"
// (the href attribute value exactly as it appears in the page; it is not
// resolved against the response URL). Anchors that have no href attribute are
// skipped so that no record with an empty "url" is exported.
func quotesParse(g *geziyor.Geziyor, r *client.Response) {
	r.HTMLDoc.Find("div.co_content2 ul a").Each(func(_ int, s *goquery.Selection) {
		href, ok := s.Attr("href")
		if !ok {
			// No href attribute: there is no link to export for this anchor.
			return
		}
		g.Exports <- map[string]interface{}{
			"title": s.Text(),
			"url":   href,
		}
	})

	// Following the "next page" link is currently commented out:
	//if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok {
	//	g.Get(r.JoinURL(href), quotesParse)
	//}
}

View File

@ -3,9 +3,9 @@ package export
import (
"encoding/csv"
"fmt"
"github.com/geziyor/geziyor/internal"
"os"
"reflect"
"softdown.com/shusou/geziyor/internal"
"sort"
)

View File

@ -4,8 +4,8 @@ import (
"bytes"
"encoding/json"
"fmt"
"github.com/geziyor/geziyor/internal"
"os"
"softdown.com/shusou/geziyor/internal"
)
// JSONLine exports response data as JSON streaming file

View File

@ -2,13 +2,13 @@ package geziyor
import (
"github.com/chromedp/chromedp"
"github.com/geziyor/geziyor/cache"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/internal"
"github.com/geziyor/geziyor/metrics"
"github.com/geziyor/geziyor/middleware"
"golang.org/x/time/rate"
"softdown.com/shusou/geziyor/cache"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
"softdown.com/shusou/geziyor/internal"
"softdown.com/shusou/geziyor/metrics"
"softdown.com/shusou/geziyor/middleware"
"io"
"io/ioutil"
@ -162,7 +162,7 @@ func (g *Geziyor) Start() {
g.Opt.StartRequestsFunc(g)
} else {
for _, startURL := range g.Opt.StartURLs {
g.Get(startURL, g.Opt.ParseFunc)
g.Get(startURL, g.Opt.Encoding, g.Opt.ParseFunc)
}
}
@ -174,8 +174,8 @@ func (g *Geziyor) Start() {
}
// Get issues a GET to the specified URL.
func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, nil)
func (g *Geziyor) Get(url, encoding string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, encoding, nil)
if err != nil {
internal.Logger.Printf("Request creating error %v\n", err)
return
@ -186,8 +186,8 @@ func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response))
// GetRendered issues GET request using headless browser
// Opens up a new Chrome instance, makes request, waits for rendering HTML DOM and closed.
// Rendered requests only supported for GET requests.
func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, nil)
func (g *Geziyor) GetRendered(url, encoding string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, encoding, nil)
if err != nil {
internal.Logger.Printf("Request creating error %v\n", err)
return
@ -198,7 +198,7 @@ func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Re
// Head issues a HEAD to the specified URL
func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("HEAD", url, nil)
req, err := client.NewRequest("HEAD", url, "", nil)
if err != nil {
internal.Logger.Printf("Request creating error %v\n", err)
return
@ -208,7 +208,7 @@ func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)
// Post issues a POST to the specified URL
func (g *Geziyor) Post(url string, body io.Reader, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("POST", url, body)
req, err := client.NewRequest("POST", url, "", body)
if err != nil {
internal.Logger.Printf("Request creating error %v\n", err)
return

View File

@ -17,14 +17,14 @@ import (
"github.com/PuerkitoBio/goquery"
"github.com/elazarl/goproxy"
"github.com/fortytw2/leaktest"
"github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/cache"
"github.com/geziyor/geziyor/cache/diskcache"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/internal"
"github.com/geziyor/geziyor/metrics"
"github.com/stretchr/testify/assert"
"softdown.com/shusou/geziyor"
"softdown.com/shusou/geziyor/cache"
"softdown.com/shusou/geziyor/cache/diskcache"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
"softdown.com/shusou/geziyor/internal"
"softdown.com/shusou/geziyor/metrics"
)
func TestSimple(t *testing.T) {

33
go.mod
View File

@ -1,24 +1,45 @@
module github.com/geziyor/geziyor
module softdown.com/shusou/geziyor
go 1.15
go 1.22
require (
github.com/PuerkitoBio/goquery v1.9.2
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/chromedp/cdproto v0.0.0-20240810084448-b931b754e476
github.com/chromedp/chromedp v0.10.0
github.com/elazarl/goproxy v0.0.0-20210801061803-8e322dfb79c4
github.com/fortytw2/leaktest v1.3.0
github.com/go-kit/kit v0.13.0
github.com/golang/protobuf v1.5.4 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible
github.com/prometheus/client_golang v1.19.1
github.com/prometheus/common v0.55.0 // indirect
github.com/stretchr/testify v1.9.0
github.com/syndtr/goleveldb v1.0.0
github.com/temoto/robotstxt v1.1.2
golang.org/x/net v0.28.0
golang.org/x/sys v0.24.0 // indirect
golang.org/x/text v0.17.0
golang.org/x/time v0.6.0
)
require (
github.com/VividCortex/gohistogram v1.0.0 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect
github.com/google/btree v1.1.3 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
golang.org/x/sys v0.24.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

1880
go.sum

File diff suppressed because it is too large Load Diff

View File

@ -1,8 +1,8 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/internal"
"sync"
)

View File

@ -1,8 +1,8 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"math/rand"
"softdown.com/shusou/geziyor/client"
"time"
)

View File

@ -1,8 +1,8 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/internal"
"sync"
)

View File

@ -1,8 +1,8 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"github.com/stretchr/testify/assert"
"softdown.com/shusou/geziyor/client"
"strings"
"testing"
)

View File

@ -1,7 +1,7 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"softdown.com/shusou/geziyor/client"
)
// Headers sets default request headers

View File

@ -1,8 +1,8 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/internal"
)
// LogStats logs responses

View File

@ -1,8 +1,8 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/metrics"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/metrics"
"strconv"
)

View File

@ -1,7 +1,7 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"softdown.com/shusou/geziyor/client"
)
// RequestResponseProcessor interface is for middlewares that needs to process both requests and responses

View File

@ -3,8 +3,8 @@ package middleware
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/internal"
)
// ParseHTML parses response if response is HTML

View File

@ -1,10 +1,11 @@
package middleware
import (
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal"
"github.com/geziyor/geziyor/metrics"
"github.com/temoto/robotstxt"
"net/http"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/internal"
"softdown.com/shusou/geziyor/metrics"
"strconv"
"sync"
)
@ -38,7 +39,7 @@ func (m *RobotsTxt) ProcessRequest(r *client.Request) {
m.mut.RUnlock()
if !exists {
robotsReq, err := client.NewRequest("GET", r.URL.Scheme+"://"+r.Host+"/robots.txt", nil)
robotsReq, err := client.NewRequest(http.MethodGet, r.URL.Scheme+"://"+r.Host+"/robots.txt", "", nil)
if err != nil {
return // Don't Do anything
}

View File

@ -2,13 +2,13 @@ package geziyor
import (
"github.com/chromedp/chromedp"
"github.com/geziyor/geziyor/cache"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/metrics"
"github.com/geziyor/geziyor/middleware"
"net/http"
"net/url"
"softdown.com/shusou/geziyor/cache"
"softdown.com/shusou/geziyor/client"
"softdown.com/shusou/geziyor/export"
"softdown.com/shusou/geziyor/metrics"
"softdown.com/shusou/geziyor/middleware"
"time"
)
@ -123,4 +123,7 @@ type Options struct {
// User Agent.
// Default: client.DefaultUserAgent (a desktop browser User-Agent string)
UserAgent string
// Encoding is the character encoding of the web pages (e.g. "gb2312").
Encoding string
}