From 3790295658bdf5ce9004ea54e81550a1dad9c45d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Tue, 11 Jun 2019 16:10:49 +0300 Subject: [PATCH] Multiple Exporters and custom Exporters support added. --- README.md | 4 ++-- export.go | 30 +++--------------------------- exporter/json.go | 43 +++++++++++++++++++++++++++++++++++++++++++ geziyor.go | 14 ++++++++++---- options.go | 3 +++ 5 files changed, 61 insertions(+), 33 deletions(-) create mode 100644 exporter/json.go diff --git a/README.md b/README.md index b330752..0cc0fc6 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,9 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl ## Features - 1.000+ Requests/Sec - Caching (Memory/Disk) -- Automatic Data Exporting +- Automatic Data Exporting (JSON) - Limit Concurrency (Global/Per Domain) -- Request Delays (Constant/Randomized) +- Request Delays (Constant/Randomized) - Automatic response decoding to UTF-8 See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for customization. diff --git a/export.go b/export.go index 81f89c2..294af74 100644 --- a/export.go +++ b/export.go @@ -1,30 +1,6 @@ package geziyor -import ( - "encoding/json" - "fmt" - "os" - "sync" -) - -var file *os.File -var once sync.Once - -// Export exports response data as JSON streaming file -func Export(response *Response) { - once.Do(func() { - newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) - if err != nil { - fmt.Fprintf(os.Stderr, "output file creation error: %v", err) - return - } - file = newFile - }) - - for res := range response.Exports { - //fmt.Println(res) - encoder := json.NewEncoder(file) - encoder.SetEscapeHTML(false) - encoder.Encode(res) - } +// Exporter interface is for extracting data to external resources +type Exporter interface { + Export(exports chan interface{}) } diff --git a/exporter/json.go b/exporter/json.go new file mode 100644 index 0000000..cafad54 --- /dev/null +++ b/exporter/json.go @@ -0,0 +1,43 @@ +package exporter + +import ( + "encoding/json" + "fmt" + "os" + "sync" +) + +// JSONExporter exports response data as JSON streaming file +type JSONExporter struct { + Filename string + EscapeHTML bool + + file *os.File + once sync.Once +} + +// Export exports response data as JSON streaming file +func (e JSONExporter) Export(exports chan interface{}) { + + // Default Filename + if e.Filename == "" { + e.Filename = "out.json" + } + + // Create File + e.once.Do(func() { + newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + if err != nil { + fmt.Fprintf(os.Stderr, "output file creation error: %v", err) + return + } + e.file = newFile + }) + + // Export data as responses came + for res := range exports { + encoder := json.NewEncoder(e.file) + encoder.SetEscapeHTML(e.EscapeHTML) + encoder.Encode(res) + } +} diff --git a/geziyor.go b/geziyor.go index e4530bd..61bb7c9 100644 --- a/geziyor.go +++ b/geziyor.go @@ -4,6 +4,7 @@ import ( "bytes" "github.com/PuerkitoBio/goquery" "github.com/fpfeng/httpcache" + "github.com/geziyor/geziyor/exporter" "golang.org/x/net/html/charset" "io/ioutil" "log" @@ -66,6 +67,9 @@ func NewGeziyor(opt Options) *Geziyor { if opt.LogDisabled { log.SetOutput(ioutil.Discard) } + if len(opt.Exporters) == 0 { + geziyor.opt.Exporters = []Exporter{exporter.JSONExporter{}} + } return geziyor } @@ -106,7 +110,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { defer g.wg.Done() defer func() { if r := recover(); r != nil { - log.Println(string(debug.Stack())) + log.Println(r, string(debug.Stack())) } }() @@ -168,7 +172,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { Response: resp, Body: body, Geziyor: g, - Exports: make(chan interface{}, 1), + Exports: make(chan interface{}), } // Create HTML Document @@ -176,8 +180,10 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(body)) } - // Export Function - go Export(&response) + // Export Functions + for _, exp := range g.opt.Exporters { + go exp.Export(response.Exports) + } // Callbacks if callback != nil { diff --git a/options.go b/options.go index 4f3fcf2..d6ae9fc 100644 --- a/options.go +++ b/options.go @@ -40,4 +40,7 @@ type Options struct { // Disable logging by setting this true LogDisabled bool + + // For extracting data + Exporters []Exporter }