Disabling charset detection implemented.
This commit is contained in:
parent
a311a0f998
commit
2f6cb06982
@ -7,7 +7,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
|
||||
## Features
|
||||
- 1,000+ Requests/Sec
|
||||
- Caching (Memory/Disk)
|
||||
- Automatic Data Exporting (JSON)
|
||||
- Automatic Data Exporting (JSON, CSV, or custom)
|
||||
- Limit Concurrency (Global/Per Domain)
|
||||
- Request Delays (Constant/Randomized)
|
||||
- Automatic response decoding to UTF-8
|
||||
|
@ -4,6 +4,7 @@ import (
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"github.com/geziyor/geziyor"
|
||||
"log"
|
||||
"os"
|
||||
"reflect"
|
||||
"sync"
|
||||
@ -11,7 +12,7 @@ import (
|
||||
|
||||
// CSVExporter exports response data as a streaming CSV file.
|
||||
type CSVExporter struct {
|
||||
Filename string
|
||||
FileName string
|
||||
|
||||
once sync.Once
|
||||
file *os.File
|
||||
@ -20,14 +21,14 @@ type CSVExporter struct {
|
||||
|
||||
func (e CSVExporter) Export(response *geziyor.Response) {
|
||||
|
||||
// Default Filename
|
||||
if e.Filename == "" {
|
||||
e.Filename = "out.csv"
|
||||
// Default filename
|
||||
if e.FileName == "" {
|
||||
e.FileName = "out.csv"
|
||||
}
|
||||
|
||||
// Create File
|
||||
// Create file
|
||||
e.once.Do(func() {
|
||||
newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||
newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
|
||||
return
|
||||
@ -40,22 +41,27 @@ func (e CSVExporter) Export(response *geziyor.Response) {
|
||||
for res := range response.Exports {
|
||||
var values []string
|
||||
|
||||
// Detect type and extract CSV values
|
||||
val := reflect.ValueOf(res)
|
||||
switch val.Kind() {
|
||||
// TODO: Map type support is temporary. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
|
||||
case reflect.Map:
|
||||
iter := val.MapRange()
|
||||
for iter.Next() {
|
||||
values = append(values, fmt.Sprint(iter.Value()))
|
||||
}
|
||||
|
||||
case reflect.Slice:
|
||||
for i := 0; i < val.Len(); i++ {
|
||||
values = append(values, fmt.Sprint(val.Index(i)))
|
||||
}
|
||||
|
||||
// TODO: Map type support is incomplete. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
|
||||
case reflect.Map:
|
||||
iter := val.MapRange()
|
||||
for iter.Next() {
|
||||
values = append(values, fmt.Sprint(iter.Value()))
|
||||
}
|
||||
}
|
||||
|
||||
e.writer.Write(values)
|
||||
// Write to file
|
||||
if err := e.writer.Write(values); err != nil {
|
||||
log.Printf("CSV writing error on exporter: %v\n", err)
|
||||
}
|
||||
e.writer.Flush()
|
||||
}
|
||||
}
|
||||
|
@ -4,13 +4,14 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"github.com/geziyor/geziyor"
|
||||
"log"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// JSONExporter exports response data as a streaming JSON file.
|
||||
type JSONExporter struct {
|
||||
Filename string
|
||||
FileName string
|
||||
EscapeHTML bool
|
||||
|
||||
file *os.File
|
||||
@ -20,14 +21,14 @@ type JSONExporter struct {
|
||||
// Export writes the response's exported data to a streaming JSON file.
|
||||
func (e JSONExporter) Export(response *geziyor.Response) {
|
||||
|
||||
// Default Filename
|
||||
if e.Filename == "" {
|
||||
e.Filename = "out.json"
|
||||
// Default filename
|
||||
if e.FileName == "" {
|
||||
e.FileName = "out.json"
|
||||
}
|
||||
|
||||
// Create File
|
||||
// Create file
|
||||
e.once.Do(func() {
|
||||
newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||
newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
|
||||
return
|
||||
@ -39,6 +40,8 @@ func (e JSONExporter) Export(response *geziyor.Response) {
|
||||
for res := range response.Exports {
|
||||
encoder := json.NewEncoder(e.file)
|
||||
encoder.SetEscapeHTML(e.EscapeHTML)
|
||||
encoder.Encode(res)
|
||||
if err := encoder.Encode(res); err != nil {
|
||||
log.Printf("JSON encoding error on exporter: %v\n", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -157,12 +157,14 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
|
||||
bodyReader := io.LimitReader(resp.Body, g.opt.MaxBodySize)
|
||||
|
||||
// Start reading body and determine encoding
|
||||
if !g.opt.CharsetDetectDisabled {
|
||||
bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
|
||||
if err != nil {
|
||||
log.Printf("Determine encoding error: %v\n", err)
|
||||
g.releaseSem(req)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Continue reading body
|
||||
body, err := ioutil.ReadAll(bodyReader)
|
||||
|
@ -46,4 +46,7 @@ type Options struct {
|
||||
|
||||
// Max body reading size in bytes
|
||||
MaxBodySize int64
|
||||
|
||||
// CharsetDetectDisabled disables charset detection of the response body when true.
|
||||
CharsetDetectDisabled bool
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user