diff --git a/README.md b/README.md index 0cc0fc6..e8d4690 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl ## Features - 1.000+ Requests/Sec - Caching (Memory/Disk) -- Automatic Data Exporting (JSON) +- Automatic Data Exporting (JSON, CSV, or custom) - Limit Concurrency (Global/Per Domain) - Request Delays (Constant/Randomized) - Automatic response decoding to UTF-8 diff --git a/exporter/csv.go b/exporter/csv.go index b4d8f79..b4b3a47 100644 --- a/exporter/csv.go +++ b/exporter/csv.go @@ -4,6 +4,7 @@ import ( "encoding/csv" "fmt" "github.com/geziyor/geziyor" + "log" "os" "reflect" "sync" @@ -11,7 +12,7 @@ import ( // CSVExporter exports response data as CSV streaming file type CSVExporter struct { - Filename string + FileName string once sync.Once file *os.File @@ -20,14 +21,14 @@ type CSVExporter struct { func (e CSVExporter) Export(response *geziyor.Response) { - // Default Filename - if e.Filename == "" { - e.Filename = "out.csv" + // Default filename + if e.FileName == "" { + e.FileName = "out.csv" } - // Create File + // Create file e.once.Do(func() { - newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) if err != nil { fmt.Fprintf(os.Stderr, "output file creation error: %v", err) return @@ -40,22 +41,27 @@ func (e CSVExporter) Export(response *geziyor.Response) { for res := range response.Exports { var values []string + // Detect type and extract CSV values val := reflect.ValueOf(res) switch val.Kind() { - // TODO: Map type support is temporary. Ordering is wrong. Needs to be sorted by map keys (CSV headers). 
- case reflect.Map: - iter := val.MapRange() - for iter.Next() { - values = append(values, fmt.Sprint(iter.Value())) - } case reflect.Slice: for i := 0; i < val.Len(); i++ { values = append(values, fmt.Sprint(val.Index(i))) } + + // TODO: Map type support is incomplete. Ordering is wrong. Needs to be sorted by map keys (CSV headers). + case reflect.Map: + iter := val.MapRange() + for iter.Next() { + values = append(values, fmt.Sprint(iter.Value())) + } } - e.writer.Write(values) + // Write to file + if err := e.writer.Write(values); err != nil { + log.Printf("CSV writing error on exporter: %v\n", err) + } e.writer.Flush() } } diff --git a/exporter/json.go b/exporter/json.go index 610927c..a625d52 100644 --- a/exporter/json.go +++ b/exporter/json.go @@ -4,13 +4,14 @@ import ( "encoding/json" "fmt" "github.com/geziyor/geziyor" + "log" "os" "sync" ) // JSONExporter exports response data as JSON streaming file type JSONExporter struct { - Filename string + FileName string EscapeHTML bool file *os.File @@ -20,14 +21,14 @@ type JSONExporter struct { // Export exports response data as JSON streaming file func (e JSONExporter) Export(response *geziyor.Response) { - // Default Filename - if e.Filename == "" { - e.Filename = "out.json" + // Default filename + if e.FileName == "" { + e.FileName = "out.json" } - // Create File + // Create file e.once.Do(func() { - newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) if err != nil { fmt.Fprintf(os.Stderr, "output file creation error: %v", err) return @@ -39,6 +40,8 @@ func (e JSONExporter) Export(response *geziyor.Response) { for res := range response.Exports { encoder := json.NewEncoder(e.file) encoder.SetEscapeHTML(e.EscapeHTML) - encoder.Encode(res) + if err := encoder.Encode(res); err != nil { + log.Printf("JSON encoding error on exporter: %v\n", err) + } } } diff --git a/geziyor.go b/geziyor.go index 
a65b719..9bbb4cb 100644 --- a/geziyor.go +++ b/geziyor.go @@ -157,11 +157,13 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) { bodyReader := io.LimitReader(resp.Body, g.opt.MaxBodySize) // Start reading body and determine encoding - bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type")) - if err != nil { - log.Printf("Determine encoding error: %v\n", err) - g.releaseSem(req) - return + if !g.opt.CharsetDetectDisabled { + bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type")) + if err != nil { + log.Printf("Determine encoding error: %v\n", err) + g.releaseSem(req) + return + } } // Continue reading body diff --git a/options.go b/options.go index 713d6c7..6629c5f 100644 --- a/options.go +++ b/options.go @@ -46,4 +46,7 @@ type Options struct { // Max body reading size in bytes MaxBodySize int64 + + // CharsetDetectDisabled disables charset detection of the response body + CharsetDetectDisabled bool }