diff --git a/README.md b/README.md index fcbfbf5..d50da9a 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use [![GoDoc](https://godoc.org/github.com/geziyor/geziyor?status.svg)](https://godoc.org/github.com/geziyor/geziyor) [![report card](https://goreportcard.com/badge/github.com/geziyor/geziyor)](http://goreportcard.com/report/geziyor/geziyor) +[![Code Coverage](https://img.shields.io/codecov/c/github/geziyor/geziyor/master.svg)](https://codecov.io/github/geziyor/geziyor?branch=master) ## Features - 5.000+ Requests/Sec @@ -163,7 +164,7 @@ geziyor.NewGeziyor(&geziyor.Options{ ## Benchmark -**8452 request per seconds** on *Macbook Pro 15" 2016* +**8748 requests per second** on *MacBook Pro 15" 2016* See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for this benchmark function: diff --git a/client/client_test.go b/client/client_test.go new file mode 100644 index 0000000..2e2731c --- /dev/null +++ b/client/client_test.go @@ -0,0 +1,91 @@ +package client + +import ( + "net/http" + "reflect" + "testing" +) + +func TestSetDefaultHeader(t *testing.T) { + type args struct { + header http.Header + key string + value string + } + tests := []struct { + name string + args args + want http.Header + }{ + { + name: "Simple", + args: args{http.Header{}, "key", "value"}, + want: http.Header{"Key": []string{"value"}}, + }, + { + name: "Dont Override", + args: args{http.Header{"Key": []string{"value"}}, "key", "new value"}, + want: http.Header{"Key": []string{"value"}}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := SetDefaultHeader(tt.args.header, tt.args.key, tt.args.value); !reflect.DeepEqual(got, tt.want) { + t.Errorf("SetDefaultHeader() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestConvertHeaderToMap(t *testing.T) { + type args struct { + header http.Header + } + tests := []struct { + name string + args args + want 
map[string]interface{} + }{ + { + name: "Simple", + args: args{http.Header{"Key": []string{"value"}}}, + want: map[string]interface{}{"Key": "value"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ConvertHeaderToMap(tt.args.header); !reflect.DeepEqual(got, tt.want) { + t.Errorf("ConvertHeaderToMap() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestConvertMapToHeader(t *testing.T) { + type args struct { + m map[string]interface{} + } + tests := []struct { + name string + args args + want http.Header + }{ + { + name: "Simple", + args: args{map[string]interface{}{"Key": "value"}}, + want: http.Header{"Key": []string{"value"}}, + }, + { + name: "Non standard key", + args: args{map[string]interface{}{"key": "value"}}, + want: http.Header{"Key": []string{"value"}}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ConvertMapToHeader(tt.args.m); !reflect.DeepEqual(got, tt.want) { + t.Errorf("ConvertMapToHeader() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/client/response.go b/client/response.go index aa09706..c9646cd 100644 --- a/client/response.go +++ b/client/response.go @@ -28,7 +28,7 @@ func (r *Response) JoinURL(relativeURL string) string { return joinedURL.String() } -// IsHTML checks if response content is HTML by looking to content-type header +// IsHTML checks if response content is HTML by looking at the Content-Type header func (r *Response) IsHTML() bool { contentType := r.Header.Get("Content-Type") for _, htmlContentType := range []string{"text/html", "application/xhtml+xml", "application/vnd.wap.xhtml+xml"} { diff --git a/export/csv_test.go b/export/csv_test.go index f11a966..a7340de 100644 --- a/export/csv_test.go +++ b/export/csv_test.go @@ -1,17 +1,29 @@ package export -import "testing" +import ( + "io/ioutil" + "os" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) func TestCSVExporter_Export(t *testing.T) { - ch := make(chan interface{}) - 
defer close(ch) - exporter := &CSV{ FileName: "out.csv", Comma: ';', } - go exporter.Export(ch) + _ = os.Remove(exporter.FileName) + exports := make(chan interface{}) + go exporter.Export(exports) - ch <- []string{"1", "2"} - ch <- map[string]string{"key1": "value1", "key2": "value2"} + exports <- []string{"1", "2"} + exports <- map[string]string{"key1": "value1", "key2": "value2"} + close(exports) + time.Sleep(time.Millisecond) + + contents, err := ioutil.ReadFile(exporter.FileName) + assert.NoError(t, err) + assert.Equal(t, "1;2\nvalue1;value2\n", string(contents)) } diff --git a/export/json_test.go b/export/json_test.go index 70db411..b743579 100644 --- a/export/json_test.go +++ b/export/json_test.go @@ -1,16 +1,27 @@ package export -import "testing" +import ( + "github.com/stretchr/testify/assert" + "io/ioutil" + "os" + "testing" + "time" +) func TestJSONExporter_Export(t *testing.T) { - ch := make(chan interface{}) - defer close(ch) - exporter := &JSON{ FileName: "out.json", Indent: " ", } - go exporter.Export(ch) + _ = os.Remove(exporter.FileName) + exports := make(chan interface{}) + go exporter.Export(exports) - ch <- map[string]string{"key": "value"} + exports <- map[string]string{"key": "value"} + close(exports) + time.Sleep(time.Millisecond) + + contents, err := ioutil.ReadFile(exporter.FileName) + assert.NoError(t, err) + assert.Equal(t, "{\n \"key\": \"value\"\n}\n", string(contents)) } diff --git a/extract/html.go b/extract/html.go index 5878dd9..e612d9f 100644 --- a/extract/html.go +++ b/extract/html.go @@ -26,7 +26,6 @@ func (e *HTML) Extract(doc *goquery.Document) (interface{}, error) { ret += h return true }) - if err != nil { return nil, err } diff --git a/geziyor_test.go b/geziyor_test.go index 6583081..4bfbe01 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -92,7 +92,7 @@ func TestAllLinks(t *testing.T) { func TestStartRequestsFunc(t *testing.T) { geziyor.NewGeziyor(&geziyor.Options{ StartRequestsFunc: func(g *geziyor.Geziyor) { - 
g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc) + g.Get("http://quotes.toscrape.com/", nil) }, ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { r.HTMLDoc.Find("a").Each(func(_ int, s *goquery.Selection) { @@ -223,6 +223,15 @@ func TestRedirect(t *testing.T) { }).Start() } +func TestConcurrentRequests(t *testing.T) { + defer leaktest.Check(t)() + geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{"https://httpbin.org/delay/1", "https://httpbin.org/delay/2"}, + ConcurrentRequests: 1, + ConcurrentRequestsPerDomain: 1, + }).Start() +} + // Make sure to increase open file descriptor limits before running func BenchmarkRequests(b *testing.B) { diff --git a/go.mod b/go.mod index b2e85b9..ebb52bc 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/go-kit/kit v0.8.0 github.com/pkg/errors v0.8.1 github.com/prometheus/client_golang v1.0.0 + github.com/stretchr/testify v1.3.0 golang.org/x/net v0.0.0-20190522155817-f3200d17e092 golang.org/x/text v0.3.2 // indirect ) diff --git a/go.sum b/go.sum index 7621e13..8ccae9f 100644 --- a/go.sum +++ b/go.sum @@ -14,6 +14,7 @@ github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54/go.mod h1:5NWqr1R github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05 h1:5iy45UjpWvkgTcd7GrGQSPr7sifrp9nNweI/eAsMjGE= github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05/go.mod h1:MsTqWB2yT7cErDFnF1F3y0PN8i/a/qQj+0GXKLW/I3s= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= @@ -49,6 +50,7 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW github.com/pkg/errors 
v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0 h1:vrDKnkGzuGvhNAL56c7DBz29ZL+KxnoR0x7enabFceM= @@ -65,6 +67,7 @@ github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPx github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=