diff --git a/.gitignore b/.gitignore index 38c9c13..8debaaa 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,6 @@ # Output files out.* + +# Cache files +.cache/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 31eb28d..f31b3ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,7 @@ language: go go: - - 1.10.x - - 1.11.x + - 1.12.x - tip env: diff --git a/README.md b/README.md index 021d702..edbe621 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ func main() { geziyor.NewGeziyor(&geziyor.Options{ StartURLs: []string{"http://quotes.toscrape.com/"}, ParseFunc: quotesParse, - Exporters: []geziyor.Exporter{exporter.JSONExporter{}}, + Exporters: []geziyor.Exporter{exporter.JSON{}}, }).Start() } @@ -100,6 +100,43 @@ geziyor.NewGeziyor(&geziyor.Options{ }).Start() ``` +### Extracting Data + +#### Extractors +You can add an [Extractor] to the []Extractors option to extract structured data. +Exporters need to be defined in order for extractors to work. + +```go +geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, + Extractors: []geziyor.Extractor{ + &extractor.Text{Name: "title", Selector: ".c-page-title"}, + &extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"}, + &extractor.Text{Name: "summary", Selector: ".c-entry-summary"}, + &extractor.Text{Name: "content", Selector: ".c-entry-content"}, + }, + Exporters: []geziyor.Exporter{&exporter.JSON{}}, +}).Start() +``` + +#### HTML selectors + +We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document). + +HTMLDoc is accessible on the Response if the response is HTML and can be parsed using Go's built-in HTML [parser](https://godoc.org/golang.org/x/net/html#Parse). +If the response isn't HTML, ```response.HTMLDoc``` will be ```nil```. 
+ +```go +geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: []string{"http://quotes.toscrape.com/"}, + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { + r.HTMLDoc.Find("div.quote").Each(func(_ int, s *goquery.Selection) { + log.Println(s.Find("span.text").Text(), s.Find("small.author").Text()) + }) + }, +}).Start() +``` + ### Exporting Data You can export data automatically using exporters. Just send data to ```Geziyor.Exports``` chan. @@ -116,7 +153,7 @@ geziyor.NewGeziyor(&geziyor.Options{ } }) }, - Exporters: []geziyor.Exporter{&exporter.JSONExporter{}}, + Exporters: []geziyor.Exporter{&exporter.JSON{}}, }).Start() ``` diff --git a/exporter/csv.go b/exporter/csv.go index c619a88..412bddd 100644 --- a/exporter/csv.go +++ b/exporter/csv.go @@ -10,15 +10,15 @@ import ( "sort" ) -// CSVExporter exports response data as CSV streaming file -type CSVExporter struct { +// CSV exports response data as CSV streaming file +type CSV struct { FileName string Comma rune UseCRLF bool } // Export exports response data as CSV streaming file -func (e *CSVExporter) Export(exports chan interface{}) { +func (e *CSV) Export(exports chan interface{}) { // Create or append file file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) diff --git a/exporter/csv_test.go b/exporter/csv_test.go index f8749c3..3b7ebea 100644 --- a/exporter/csv_test.go +++ b/exporter/csv_test.go @@ -6,7 +6,7 @@ func TestCSVExporter_Export(t *testing.T) { ch := make(chan interface{}) defer close(ch) - exporter := &CSVExporter{ + exporter := &CSV{ FileName: "out.csv", Comma: ';', } diff --git a/exporter/json.go b/exporter/json.go index 96f9b6a..aa01117 100644 --- a/exporter/json.go +++ b/exporter/json.go @@ -7,8 +7,8 @@ import ( "os" ) -// JSONExporter exports response data as JSON streaming file -type JSONExporter struct { +// JSON exports response data as JSON streaming file +type JSON struct { FileName string EscapeHTML bool Prefix string @@ 
-16,7 +16,7 @@ type JSONExporter struct { } // Export exports response data as JSON streaming file -func (e *JSONExporter) Export(exports chan interface{}) { +func (e *JSON) Export(exports chan interface{}) { // Create or append file file, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) diff --git a/exporter/json_test.go b/exporter/json_test.go index de7ec17..405f86c 100644 --- a/exporter/json_test.go +++ b/exporter/json_test.go @@ -6,7 +6,7 @@ func TestJSONExporter_Export(t *testing.T) { ch := make(chan interface{}) defer close(ch) - exporter := &JSONExporter{ + exporter := &JSON{ FileName: "out.json", Indent: " ", } diff --git a/extractor/text.go b/extractor/text.go new file mode 100644 index 0000000..c575c08 --- /dev/null +++ b/extractor/text.go @@ -0,0 +1,14 @@ +package extractor + +import "github.com/PuerkitoBio/goquery" + +// Text extracts texts from selected nodes +type Text struct { + Name string + Selector string +} + +// Extract extracts texts from selected nodes +func (e *Text) Extract(doc *goquery.Document) interface{} { + return map[string]string{e.Name: doc.Find(e.Selector).Text()} +} diff --git a/geziyor.go b/geziyor.go index 3b61a24..762b579 100644 --- a/geziyor.go +++ b/geziyor.go @@ -2,6 +2,7 @@ package geziyor import ( "context" + "github.com/PuerkitoBio/goquery" "github.com/chromedp/cdproto/dom" "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" @@ -20,7 +21,14 @@ import ( "sync" ) -// Exporter interface is for extracting data to external resources +// Extractor interface is for extracting data from HTML document +type Extractor interface { + Extract(doc *goquery.Document) interface{} +} + +// Exporter interface is for exporting data to external resources. +// Geziyor calls every exporter's Export function before any scraping starts. +// Export functions should wait for new data from exports chan. 
type Exporter interface { Export(exports chan interface{}) } @@ -61,6 +69,7 @@ func NewGeziyor(opt *Options) *Geziyor { responseMiddlewares: []ResponseMiddleware{ parseHTMLMiddleware, metricsResponseMiddleware, + extractorsMiddleware, }, metrics: metrics.NewMetrics(opt.MetricsType), } diff --git a/geziyor_test.go b/geziyor_test.go index 058926d..6f240d2 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -7,10 +7,9 @@ import ( "github.com/fpfeng/httpcache" "github.com/geziyor/geziyor" "github.com/geziyor/geziyor/exporter" + "github.com/geziyor/geziyor/extractor" "github.com/geziyor/geziyor/metrics" - "math/rand" "testing" - "time" ) func TestSimple(t *testing.T) { @@ -41,7 +40,7 @@ func TestQuotes(t *testing.T) { geziyor.NewGeziyor(&geziyor.Options{ StartURLs: []string{"http://quotes.toscrape.com/"}, ParseFunc: quotesParse, - Exporters: []geziyor.Exporter{&exporter.JSONExporter{}}, + Exporters: []geziyor.Exporter{&exporter.JSON{}}, }).Start() } @@ -78,20 +77,11 @@ func TestAllLinks(t *testing.T) { } }) }, - Exporters: []geziyor.Exporter{&exporter.CSVExporter{}}, + Exporters: []geziyor.Exporter{&exporter.CSV{}}, MetricsType: metrics.Prometheus, }).Start() } -func TestRandomDelay(t *testing.T) { - rand.Seed(time.Now().UnixNano()) - delay := time.Millisecond * 1000 - min := float64(delay) * 0.5 - max := float64(delay) * 1.5 - randomDelay := rand.Intn(int(max-min)) + int(min) - fmt.Println(time.Duration(randomDelay)) -} - func TestStartRequestsFunc(t *testing.T) { geziyor.NewGeziyor(&geziyor.Options{ StartRequestsFunc: func(g *geziyor.Geziyor) { @@ -102,7 +92,7 @@ func TestStartRequestsFunc(t *testing.T) { g.Exports <- s.AttrOr("href", "") }) }, - Exporters: []geziyor.Exporter{&exporter.JSONExporter{}}, + Exporters: []geziyor.Exporter{&exporter.JSON{}}, }).Start() } @@ -161,3 +151,16 @@ func TestBasicAuth(t *testing.T) { MetricsType: metrics.ExpVar, }).Start() } + +func TestExtractor(t *testing.T) { + geziyor.NewGeziyor(&geziyor.Options{ + StartURLs: 
[]string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, + Extractors: []geziyor.Extractor{ + &extractor.Text{Name: "title", Selector: ".c-page-title"}, + &extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"}, + &extractor.Text{Name: "summary", Selector: ".c-entry-summary"}, + &extractor.Text{Name: "content", Selector: ".c-entry-content"}, + }, + Exporters: []geziyor.Exporter{&exporter.JSON{}}, + }).Start() +} diff --git a/middleware.go b/middleware.go index 04b9ca2..487c294 100644 --- a/middleware.go +++ b/middleware.go @@ -2,12 +2,14 @@ package geziyor import ( "bytes" + "fmt" "github.com/PuerkitoBio/goquery" "github.com/geziyor/geziyor/http" "github.com/geziyor/geziyor/internal" "log" "math/rand" "os" + "reflect" "runtime/debug" "time" ) @@ -92,3 +94,27 @@ func parseHTMLMiddleware(g *Geziyor, r *Response) { func metricsResponseMiddleware(g *Geziyor, r *Response) { g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1) } + +// extractorsMiddleware extracts data from loaders conf and exports it to exporters +func extractorsMiddleware(g *Geziyor, r *Response) { + + // Check if we have extractors and exporters + if len(g.Opt.Extractors) != 0 && len(g.Opt.Exporters) != 0 { + exports := map[string]interface{}{} + + for _, extractor := range g.Opt.Extractors { + extracted := extractor.Extract(r.HTMLDoc) + + // Check extracted data type and use it accordingly + val := reflect.ValueOf(extracted) + switch val.Kind() { + case reflect.Map: + r := val.MapRange() + for r.Next() { + exports[fmt.Sprint(r.Key())] = r.Value().Interface() + } + } + } + g.Exports <- exports + } +} diff --git a/middleware_test.go b/middleware_test.go new file mode 100644 index 0000000..55097d1 --- /dev/null +++ b/middleware_test.go @@ -0,0 +1,17 @@ +package geziyor + +import ( + "fmt" + "math/rand" + "testing" + "time" +) + +func TestRandomDelay(t *testing.T) { + 
rand.Seed(time.Now().UnixNano()) + delay := time.Millisecond * 1000 + min := float64(delay) * 0.5 + max := float64(delay) * 1.5 + randomDelay := rand.Intn(int(max-min)) + int(min) + fmt.Println(time.Duration(randomDelay)) +} diff --git a/options.go b/options.go index 99b1659..20c93d5 100644 --- a/options.go +++ b/options.go @@ -21,6 +21,9 @@ type Options struct { // ParseFunc is callback of StartURLs response. ParseFunc func(g *Geziyor, r *Response) + // Extractors extracts items from pages + Extractors []Extractor + // Timeout is global request timeout Timeout time.Duration