diff --git a/README.md b/README.md index f9efd1b..b268ddd 100644 --- a/README.md +++ b/README.md @@ -12,13 +12,13 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl - Request Delays (Constant/Randomized) - Automatic response decoding to UTF-8 -See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for customization. +See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for all custom settings. ## Status We highly recommend you to use go modules. As this project is in **development stage** right now and **API is not stable**. ## Usage -Simplest usage +Simple usage ```go geziyor.NewGeziyor(geziyor.Options{ @@ -29,6 +29,32 @@ geziyor.NewGeziyor(geziyor.Options{ }).Start() ``` +Advanced usage + +```go +func main() { + geziyor.NewGeziyor(geziyor.Options{ + StartURLs: []string{"http://quotes.toscrape.com/"}, + ParseFunc: quotesParse, + Exporters: []geziyor.Exporter{exporter.JSONExporter{}}, + }).Start() +} + +func quotesParse(r *geziyor.Response) { + r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) { + r.Exports <- map[string]interface{}{ + "text": s.Find("span.text").Text(), + "author": s.Find("small.author").Text(), + } + }) + if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { + go r.Geziyor.Get(r.JoinURL(href), quotesParse) + } +} +``` + +See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for more usage examples + ## Installation go get github.com/geziyor/geziyor diff --git a/geziyor_test.go b/geziyor_test.go index f84878b..6cf9da4 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -54,6 +54,7 @@ func quotesParse(r *geziyor.Response) { return s.Text() }), } + // Or, for CSV //r.Exports <- []string{s.Find("span.text").Text(), s.Find("small.author").Text()} }) @@ -97,9 +98,11 @@ func TestStartRequestsFunc(t *testing.T) { g.Requests <- &geziyor.Request{Request: req} }, ParseFunc: func(r *geziyor.Response) { - r.Exports <- []string{r.Status} + r.DocHTML.Find("a").Each(func(_ int, s *goquery.Selection) { + r.Exports <- s.AttrOr("href", "") + }) }, - Exporters: []geziyor.Exporter{exporter.CSVExporter{}}, + Exporters: []geziyor.Exporter{exporter.JSONExporter{}}, }).Start() }