Implemented an option to disable charset detection.

This commit is contained in:
Musab Gültekin 2019-06-12 11:44:31 +03:00
parent a311a0f998
commit 2f6cb06982
5 changed files with 40 additions and 26 deletions

View File

@ -7,7 +7,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
## Features ## Features
- 1.000+ Requests/Sec - 1.000+ Requests/Sec
- Caching (Memory/Disk) - Caching (Memory/Disk)
- Automatic Data Exporting (JSON) - Automatic Data Exporting (JSON, CSV, or custom)
- Limit Concurrency (Global/Per Domain) - Limit Concurrency (Global/Per Domain)
- Request Delays (Constant/Randomized) - Request Delays (Constant/Randomized)
- Automatic response decoding to UTF-8 - Automatic response decoding to UTF-8

View File

@ -4,6 +4,7 @@ import (
"encoding/csv" "encoding/csv"
"fmt" "fmt"
"github.com/geziyor/geziyor" "github.com/geziyor/geziyor"
"log"
"os" "os"
"reflect" "reflect"
"sync" "sync"
@ -11,7 +12,7 @@ import (
// CSVExporter exports response data as CSV streaming file // CSVExporter exports response data as CSV streaming file
type CSVExporter struct { type CSVExporter struct {
Filename string FileName string
once sync.Once once sync.Once
file *os.File file *os.File
@ -20,14 +21,14 @@ type CSVExporter struct {
func (e CSVExporter) Export(response *geziyor.Response) { func (e CSVExporter) Export(response *geziyor.Response) {
// Default Filename // Default filename
if e.Filename == "" { if e.FileName == "" {
e.Filename = "out.csv" e.FileName = "out.csv"
} }
// Create File // Create file
e.once.Do(func() { e.once.Do(func() {
newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "output file creation error: %v", err) fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
return return
@ -40,22 +41,27 @@ func (e CSVExporter) Export(response *geziyor.Response) {
for res := range response.Exports { for res := range response.Exports {
var values []string var values []string
// Detect type and extract CSV values
val := reflect.ValueOf(res) val := reflect.ValueOf(res)
switch val.Kind() { switch val.Kind() {
// TODO: Map type support is temporary. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
case reflect.Map:
iter := val.MapRange()
for iter.Next() {
values = append(values, fmt.Sprint(iter.Value()))
}
case reflect.Slice: case reflect.Slice:
for i := 0; i < val.Len(); i++ { for i := 0; i < val.Len(); i++ {
values = append(values, fmt.Sprint(val.Index(i))) values = append(values, fmt.Sprint(val.Index(i)))
} }
// TODO: Map type support is incomplete. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
case reflect.Map:
iter := val.MapRange()
for iter.Next() {
values = append(values, fmt.Sprint(iter.Value()))
}
} }
e.writer.Write(values) // Write to file
if err := e.writer.Write(values); err != nil {
log.Printf("CSV writing error on exporter: %v\n", err)
}
e.writer.Flush() e.writer.Flush()
} }
} }

View File

@ -4,13 +4,14 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/geziyor/geziyor" "github.com/geziyor/geziyor"
"log"
"os" "os"
"sync" "sync"
) )
// JSONExporter exports response data as JSON streaming file // JSONExporter exports response data as JSON streaming file
type JSONExporter struct { type JSONExporter struct {
Filename string FileName string
EscapeHTML bool EscapeHTML bool
file *os.File file *os.File
@ -20,14 +21,14 @@ type JSONExporter struct {
// Export exports response data as JSON streaming file // Export exports response data as JSON streaming file
func (e JSONExporter) Export(response *geziyor.Response) { func (e JSONExporter) Export(response *geziyor.Response) {
// Default Filename // Default filename
if e.Filename == "" { if e.FileName == "" {
e.Filename = "out.json" e.FileName = "out.json"
} }
// Create File // Create file
e.once.Do(func() { e.once.Do(func() {
newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "output file creation error: %v", err) fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
return return
@ -39,6 +40,8 @@ func (e JSONExporter) Export(response *geziyor.Response) {
for res := range response.Exports { for res := range response.Exports {
encoder := json.NewEncoder(e.file) encoder := json.NewEncoder(e.file)
encoder.SetEscapeHTML(e.EscapeHTML) encoder.SetEscapeHTML(e.EscapeHTML)
encoder.Encode(res) if err := encoder.Encode(res); err != nil {
log.Printf("JSON encoding error on exporter: %v\n", err)
}
} }
} }

View File

@ -157,11 +157,13 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
bodyReader := io.LimitReader(resp.Body, g.opt.MaxBodySize) bodyReader := io.LimitReader(resp.Body, g.opt.MaxBodySize)
// Start reading body and determine encoding // Start reading body and determine encoding
bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type")) if !g.opt.CharsetDetectDisabled {
if err != nil { bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
log.Printf("Determine encoding error: %v\n", err) if err != nil {
g.releaseSem(req) log.Printf("Determine encoding error: %v\n", err)
return g.releaseSem(req)
return
}
} }
// Continue reading body // Continue reading body

View File

@ -46,4 +46,7 @@ type Options struct {
// Max body reading size in bytes // Max body reading size in bytes
MaxBodySize int64 MaxBodySize int64
// CharsetDetectDisabled turns off automatic charset detection of response bodies
CharsetDetectDisabled bool
} }