Implemented an option to disable charset detection.

This commit is contained in:
Musab Gültekin 2019-06-12 11:44:31 +03:00
parent a311a0f998
commit 2f6cb06982
5 changed files with 40 additions and 26 deletions

View File

@ -7,7 +7,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
## Features
- 1,000+ Requests/Sec
- Caching (Memory/Disk)
- Automatic Data Exporting (JSON)
- Automatic Data Exporting (JSON, CSV, or custom)
- Limit Concurrency (Global/Per Domain)
- Request Delays (Constant/Randomized)
- Automatic response decoding to UTF-8

View File

@ -4,6 +4,7 @@ import (
"encoding/csv"
"fmt"
"github.com/geziyor/geziyor"
"log"
"os"
"reflect"
"sync"
@ -11,7 +12,7 @@ import (
// CSVExporter exports response data as CSV streaming file
type CSVExporter struct {
Filename string
FileName string
once sync.Once
file *os.File
@ -20,14 +21,14 @@ type CSVExporter struct {
func (e CSVExporter) Export(response *geziyor.Response) {
// Default Filename
if e.Filename == "" {
e.Filename = "out.csv"
// Default filename
if e.FileName == "" {
e.FileName = "out.csv"
}
// Create File
// Create file
e.once.Do(func() {
newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
return
@ -40,22 +41,27 @@ func (e CSVExporter) Export(response *geziyor.Response) {
for res := range response.Exports {
var values []string
// Detect type and extract CSV values
val := reflect.ValueOf(res)
switch val.Kind() {
// TODO: Map type support is temporary. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
case reflect.Map:
iter := val.MapRange()
for iter.Next() {
values = append(values, fmt.Sprint(iter.Value()))
}
case reflect.Slice:
for i := 0; i < val.Len(); i++ {
values = append(values, fmt.Sprint(val.Index(i)))
}
// TODO: Map type support is incomplete. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
case reflect.Map:
iter := val.MapRange()
for iter.Next() {
values = append(values, fmt.Sprint(iter.Value()))
}
}
e.writer.Write(values)
// Write to file
if err := e.writer.Write(values); err != nil {
log.Printf("CSV writing error on exporter: %v\n", err)
}
e.writer.Flush()
}
}

View File

@ -4,13 +4,14 @@ import (
"encoding/json"
"fmt"
"github.com/geziyor/geziyor"
"log"
"os"
"sync"
)
// JSONExporter exports response data as JSON streaming file
type JSONExporter struct {
Filename string
FileName string
EscapeHTML bool
file *os.File
@ -20,14 +21,14 @@ type JSONExporter struct {
// Export exports response data as JSON streaming file
func (e JSONExporter) Export(response *geziyor.Response) {
// Default Filename
if e.Filename == "" {
e.Filename = "out.json"
// Default filename
if e.FileName == "" {
e.FileName = "out.json"
}
// Create File
// Create file
e.once.Do(func() {
newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
return
@ -39,6 +40,8 @@ func (e JSONExporter) Export(response *geziyor.Response) {
for res := range response.Exports {
encoder := json.NewEncoder(e.file)
encoder.SetEscapeHTML(e.EscapeHTML)
encoder.Encode(res)
if err := encoder.Encode(res); err != nil {
log.Printf("JSON encoding error on exporter: %v\n", err)
}
}
}

View File

@ -157,12 +157,14 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
bodyReader := io.LimitReader(resp.Body, g.opt.MaxBodySize)
// Start reading body and determine encoding
if !g.opt.CharsetDetectDisabled {
bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
if err != nil {
log.Printf("Determine encoding error: %v\n", err)
g.releaseSem(req)
return
}
}
// Continue reading body
body, err := ioutil.ReadAll(bodyReader)

View File

@ -46,4 +46,7 @@ type Options struct {
// Max body reading size in bytes
MaxBodySize int64
// Disable charset detection of the response body
CharsetDetectDisabled bool
}