Maximum redirection option added. Performance improvement on exports. Duplicate requests only checked on GET requests.

This commit is contained in:
Musab Gültekin 2019-07-01 15:44:28 +03:00
parent 80f3500a69
commit c0dd0393e6
6 changed files with 71 additions and 16 deletions

View File

@ -168,14 +168,13 @@ geziyor.NewGeziyor(&geziyor.Options{
See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for this benchmark function: See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for this benchmark function:
```bash ```bash
>> go test -run none -bench . -benchtime 10s >> go test -run none -bench Requests -benchtime 10s
goos: darwin goos: darwin
goarch: amd64 goarch: amd64
pkg: github.com/geziyor/geziyor pkg: github.com/geziyor/geziyor
BenchmarkGeziyor_Do-8 200000 112493 ns/op BenchmarkRequests-8 200000 108710 ns/op
PASS PASS
ok github.com/geziyor/geziyor 23.662s ok github.com/geziyor/geziyor 22.861s
``` ```
## Roadmap ## Roadmap

View File

@ -207,3 +207,13 @@ func ConvertMapToHeader(m map[string]interface{}) http.Header {
} }
return header return header
} }
// NewRedirectionHandler builds a redirect-checking callback bounded by maxRedirect.
//
// The returned function ignores the upcoming request and looks only at via:
// it returns nil while len(via) is below maxRedirect, and an error of the form
// "stopped after N redirects" once len(via) is maxRedirect or more.
// A zero or negative maxRedirect therefore makes the callback return an error
// on every invocation.
func NewRedirectionHandler(maxRedirect int) func(req *http.Request, via []*http.Request) error {
	checkRedirect := func(_ *http.Request, via []*http.Request) error {
		if len(via) < maxRedirect {
			return nil
		}
		return errors.Errorf("stopped after %d redirects", maxRedirect)
	}
	return checkRedirect
}

View File

@ -48,7 +48,7 @@ func NewGeziyor(opt *Options) *Geziyor {
geziyor := &Geziyor{ geziyor := &Geziyor{
Client: client.NewClient(), Client: client.NewClient(),
Opt: opt, Opt: opt,
Exports: make(chan interface{}), Exports: make(chan interface{}, 1),
requestMiddlewares: []RequestMiddleware{ requestMiddlewares: []RequestMiddleware{
allowedDomainsMiddleware, allowedDomainsMiddleware,
duplicateRequestsMiddleware, duplicateRequestsMiddleware,
@ -81,6 +81,9 @@ func NewGeziyor(opt *Options) *Geziyor {
if !opt.CookiesDisabled { if !opt.CookiesDisabled {
geziyor.Client.Jar, _ = cookiejar.New(nil) geziyor.Client.Jar, _ = cookiejar.New(nil)
} }
if opt.MaxRedirect != 0 {
geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect)
}
if opt.ConcurrentRequests != 0 { if opt.ConcurrentRequests != 0 {
geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests) geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
} }

View File

@ -202,8 +202,29 @@ func TestCharsetDetection(t *testing.T) {
}).Start() }).Start()
} }
// TestRedirect runs two crawls against the same redirecting httpbin endpoint,
// one with MaxRedirect set to -1 and one with MaxRedirect left at 0.
// Only the second crawl asserts anything: it fails the test if the parse
// callback ever receives a response whose status code is 302.
func TestRedirect(t *testing.T) {
	defer leaktest.Check(t)()

	const redirectURL = "https://httpbin.org/absolute-redirect/1"

	// First crawl: negative MaxRedirect; the callback makes no assertion.
	negativeLimit := &geziyor.Options{
		StartURLs: []string{redirectURL},
		ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
			//t.Fail()
		},
		MaxRedirect: -1,
	}
	geziyor.NewGeziyor(negativeLimit).Start()

	// Second crawl: MaxRedirect of 0; a 302 reaching the callback is a failure.
	zeroLimit := &geziyor.Options{
		StartURLs: []string{redirectURL},
		ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
			if r.StatusCode == 302 {
				t.Fail()
			}
		},
		MaxRedirect: 0,
	}
	geziyor.NewGeziyor(zeroLimit).Start()
}
// Make sure to increase open file descriptor limits before running // Make sure to increase open file descriptor limits before running
func BenchmarkGeziyor_Do(b *testing.B) { func BenchmarkRequests(b *testing.B) {
// Create Server // Create Server
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@ -230,3 +251,23 @@ func BenchmarkGeziyor_Do(b *testing.B) {
LogDisabled: true, LogDisabled: true,
}).Start() }).Start()
} }
// BenchmarkWhole measures a complete crawl of quotes.toscrape.com per iteration.
// Each iteration builds a fresh crawler restricted to that domain, with a CSV
// exporter and logging disabled. The parse callback exports the URL of every
// response and then requests each anchor's href with the same callback.
func BenchmarkWhole(b *testing.B) {
	for n := 0; n < b.N; n++ {
		opts := &geziyor.Options{
			AllowedDomains: []string{"quotes.toscrape.com"},
			StartURLs:      []string{"http://quotes.toscrape.com/"},
			ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
				// Export the visited URL before queueing the page's links.
				g.Exports <- []string{r.Request.URL.String()}
				r.HTMLDoc.Find("a").Each(func(_ int, link *goquery.Selection) {
					href, found := link.Attr("href")
					if !found {
						return
					}
					g.Get(r.JoinURL(href), g.Opt.ParseFunc)
				})
			},
			Exporters: []geziyor.Exporter{&export.CSV{}},
			//MetricsType: metrics.Prometheus,
			LogDisabled: true,
		}
		geziyor.NewGeziyor(opts).Start()
	}
}

View File

@ -46,9 +46,8 @@ func allowedDomainsMiddleware(g *Geziyor, r *client.Request) {
// duplicateRequestsMiddleware checks for already visited URLs // duplicateRequestsMiddleware checks for already visited URLs
func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) { func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) {
if !g.Opt.URLRevisitEnabled { if !g.Opt.URLRevisitEnabled && r.Request.Method == "GET" {
key := r.Request.URL.String() + r.Request.Method if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
if _, visited := g.visitedURLs.LoadOrStore(key, struct{}{}); visited {
//log.Printf("URL already visited %s\n", rawURL) //log.Printf("URL already visited %s\n", rawURL)
r.Cancel() r.Cancel()
} }

View File

@ -61,6 +61,9 @@ type Options struct {
// Max body reading size in bytes. Default: 1GB // Max body reading size in bytes. Default: 1GB
MaxBodySize int64 MaxBodySize int64
// Maximum number of redirections to follow. Default: 10
MaxRedirect int
// Charset Detection disable // Charset Detection disable
CharsetDetectDisabled bool CharsetDetectDisabled bool