Maximum redirection option added. Performance improvement on exports. Duplicate requests only checked on GET requests.
This commit is contained in:
parent
80f3500a69
commit
c0dd0393e6
21
README.md
21
README.md
@ -111,13 +111,13 @@ You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor)
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
|
||||
Extractors: []geziyor.Extractor{
|
||||
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
||||
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
|
||||
&extract.Text{Name: "title", Selector: ".c-page-title"},
|
||||
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
|
||||
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
|
||||
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
|
||||
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
|
||||
&extract.Text{Name: "content", Selector: ".c-entry-content"},
|
||||
},
|
||||
Exporters: []geziyor.Exporter{&export.JSON{}},
|
||||
}).Start()
|
||||
@ -168,14 +168,13 @@ geziyor.NewGeziyor(&geziyor.Options{
|
||||
See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for this benchmark function:
|
||||
|
||||
```bash
|
||||
>> go test -run none -bench . -benchtime 10s
|
||||
>> go test -run none -bench Requests -benchtime 10s
|
||||
goos: darwin
|
||||
goarch: amd64
|
||||
pkg: github.com/geziyor/geziyor
|
||||
BenchmarkGeziyor_Do-8 200000 112493 ns/op
|
||||
|
||||
BenchmarkRequests-8 200000 108710 ns/op
|
||||
PASS
|
||||
ok github.com/geziyor/geziyor 23.662s
|
||||
ok github.com/geziyor/geziyor 22.861s
|
||||
```
|
||||
|
||||
## Roadmap
|
||||
|
@ -207,3 +207,13 @@ func ConvertMapToHeader(m map[string]interface{}) http.Header {
|
||||
}
|
||||
return header
|
||||
}
|
||||
|
||||
// NewRedirectionHandler returns a redirect-policy function, suitable for
// http.Client.CheckRedirect, that limits how many redirects are followed.
//
// The returned function reports an error once len(via) reaches maxRedirect and
// returns nil otherwise. Because the comparison is len(via) >= maxRedirect, a
// zero or negative maxRedirect makes every call return the error.
|
||||
func NewRedirectionHandler(maxRedirect int) func(req *http.Request, via []*http.Request) error {
|
||||
return func(req *http.Request, via []*http.Request) error {
|
||||
// Per the net/http documentation, via holds the requests made so far, so its
// length is the number of hops already taken.
if len(via) >= maxRedirect {
|
||||
return errors.Errorf("stopped after %d redirects", maxRedirect)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
@ -48,7 +48,7 @@ func NewGeziyor(opt *Options) *Geziyor {
|
||||
geziyor := &Geziyor{
|
||||
Client: client.NewClient(),
|
||||
Opt: opt,
|
||||
Exports: make(chan interface{}),
|
||||
Exports: make(chan interface{}, 1),
|
||||
requestMiddlewares: []RequestMiddleware{
|
||||
allowedDomainsMiddleware,
|
||||
duplicateRequestsMiddleware,
|
||||
@ -81,6 +81,9 @@ func NewGeziyor(opt *Options) *Geziyor {
|
||||
if !opt.CookiesDisabled {
|
||||
geziyor.Client.Jar, _ = cookiejar.New(nil)
|
||||
}
|
||||
if opt.MaxRedirect != 0 {
|
||||
geziyor.Client.CheckRedirect = client.NewRedirectionHandler(opt.MaxRedirect)
|
||||
}
|
||||
if opt.ConcurrentRequests != 0 {
|
||||
geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
|
||||
}
|
||||
|
@ -202,8 +202,29 @@ func TestCharsetDetection(t *testing.T) {
|
||||
}).Start()
|
||||
}
|
||||
|
||||
// TestRedirect runs two crawls against the httpbin absolute-redirect endpoint,
// one with MaxRedirect set to -1 and one with MaxRedirect set to 0.
// Only the second crawl asserts anything: it fails the test if ParseFunc
// receives a response whose status code is 302.
// NOTE(review): the start URL is an external host, so this test presumably
// needs network access — confirm before running in CI.
func TestRedirect(t *testing.T) {
|
||||
// The check function returned by leaktest.Check is deferred to run when the test returns.
defer leaktest.Check(t)()
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
StartURLs: []string{"https://httpbin.org/absolute-redirect/1"},
|
||||
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
||||
// Assertion is currently disabled; this callback does nothing.
//t.Fail()
|
||||
},
|
||||
// Non-zero, so NewGeziyor installs NewRedirectionHandler(-1); its
// len(via) >= maxRedirect check is always true for a negative limit.
MaxRedirect: -1,
|
||||
}).Start()
|
||||
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
StartURLs: []string{"https://httpbin.org/absolute-redirect/1"},
|
||||
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
||||
// A 302 reaching ParseFunc means the redirect was not followed.
if r.StatusCode == 302 {
|
||||
t.Fail()
|
||||
}
|
||||
},
|
||||
// Zero leaves Client.CheckRedirect unset in NewGeziyor.
MaxRedirect: 0,
|
||||
}).Start()
|
||||
}
|
||||
|
||||
// Make sure to increase open file descriptor limits before running
|
||||
func BenchmarkGeziyor_Do(b *testing.B) {
|
||||
func BenchmarkRequests(b *testing.B) {
|
||||
|
||||
// Create Server
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
@ -230,3 +251,23 @@ func BenchmarkGeziyor_Do(b *testing.B) {
|
||||
LogDisabled: true,
|
||||
}).Start()
|
||||
}
|
||||
|
||||
// BenchmarkWhole runs a complete crawl b.N times. Each crawl starts at
// http://quotes.toscrape.com/, is restricted to that domain via AllowedDomains,
// sends every parsed page's URL to g.Exports, and requests every href found on
// the page with the same ParseFunc. A CSV exporter is configured and logging
// is disabled.
// NOTE(review): the start URL is an external host, so results presumably
// depend on network conditions — confirm before comparing numbers.
func BenchmarkWhole(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
AllowedDomains: []string{"quotes.toscrape.com"},
|
||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
||||
// Export the URL of the page that was just parsed.
g.Exports <- []string{r.Request.URL.String()}
|
||||
// Follow every anchor on the page, reusing this ParseFunc as the callback.
r.HTMLDoc.Find("a").Each(func(i int, s *goquery.Selection) {
|
||||
if href, ok := s.Attr("href"); ok {
|
||||
g.Get(r.JoinURL(href), g.Opt.ParseFunc)
|
||||
}
|
||||
})
|
||||
},
|
||||
Exporters: []geziyor.Exporter{&export.CSV{}},
|
||||
//MetricsType: metrics.Prometheus,
|
||||
LogDisabled: true,
|
||||
}).Start()
|
||||
}
|
||||
}
|
||||
|
@ -46,9 +46,8 @@ func allowedDomainsMiddleware(g *Geziyor, r *client.Request) {
|
||||
|
||||
// duplicateRequestsMiddleware checks for already visited URLs
|
||||
func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) {
|
||||
if !g.Opt.URLRevisitEnabled {
|
||||
key := r.Request.URL.String() + r.Request.Method
|
||||
if _, visited := g.visitedURLs.LoadOrStore(key, struct{}{}); visited {
|
||||
if !g.Opt.URLRevisitEnabled && r.Request.Method == "GET" {
|
||||
if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
|
||||
//log.Printf("URL already visited %s\n", rawURL)
|
||||
r.Cancel()
|
||||
}
|
||||
|
@ -61,6 +61,9 @@ type Options struct {
|
||||
// Max body reading size in bytes. Default: 1GB
|
||||
MaxBodySize int64
|
||||
|
||||
// Maximum number of redirects to follow. Default: 10
|
||||
MaxRedirect int
|
||||
|
||||
// Charset Detection disable
|
||||
CharsetDetectDisabled bool
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user