Disabling charset detection implemented.
parent a311a0f998
commit 2f6cb06982
@@ -7,7 +7,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
 ## Features
 - 1.000+ Requests/Sec
 - Caching (Memory/Disk)
-- Automatic Data Exporting (JSON)
+- Automatic Data Exporting (JSON, CSV, or custom)
 - Limit Concurrency (Global/Per Domain)
 - Request Delays (Constant/Randomized)
 - Automatic response decoding to UTF-8

@@ -4,6 +4,7 @@ import (
 	"encoding/csv"
 	"fmt"
 	"github.com/geziyor/geziyor"
+	"log"
 	"os"
 	"reflect"
 	"sync"
@@ -11,7 +12,7 @@ import (
 
 // CSVExporter exports response data as CSV streaming file
 type CSVExporter struct {
-	Filename string
+	FileName string
 
 	once   sync.Once
 	file   *os.File
@@ -20,14 +21,14 @@ type CSVExporter struct {
 
 func (e CSVExporter) Export(response *geziyor.Response) {
 
-	// Default Filename
-	if e.Filename == "" {
-		e.Filename = "out.csv"
+	// Default filename
+	if e.FileName == "" {
+		e.FileName = "out.csv"
 	}
 
-	// Create File
+	// Create file
 	e.once.Do(func() {
-		newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
+		newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
 			return
@@ -40,22 +41,27 @@ func (e CSVExporter) Export(response *geziyor.Response) {
 	for res := range response.Exports {
 		var values []string
 
+		// Detect type and extract CSV values
 		val := reflect.ValueOf(res)
 		switch val.Kind() {
-		// TODO: Map type support is temporary. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
-		case reflect.Map:
-			iter := val.MapRange()
-			for iter.Next() {
-				values = append(values, fmt.Sprint(iter.Value()))
-			}
-
 		case reflect.Slice:
 			for i := 0; i < val.Len(); i++ {
 				values = append(values, fmt.Sprint(val.Index(i)))
 			}
+
+		// TODO: Map type support is incomplete. Ordering is wrong. Needs to be sorted by map keys (CSV headers).
+		case reflect.Map:
+			iter := val.MapRange()
+			for iter.Next() {
+				values = append(values, fmt.Sprint(iter.Value()))
+			}
 		}
 
-		e.writer.Write(values)
+		// Write to file
+		if err := e.writer.Write(values); err != nil {
+			log.Printf("CSV writing error on exporter: %v\n", err)
+		}
 		e.writer.Flush()
 	}
 }

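For clarity, here is a minimal, self-contained sketch of the value-flattening the exporter's switch performs. The `toRecord` helper and the sample rows are made up for illustration, but the reflect calls mirror the ones in the diff; it also shows why the TODO about map ordering matters, since Go map iteration order is not stable and the resulting columns can shuffle between rows.

```go
package main

import (
	"encoding/csv"
	"fmt"
	"os"
	"reflect"
)

// toRecord flattens one exported value into a CSV record the same way the
// exporter's switch does: slices keep their element order, maps do not.
func toRecord(res interface{}) []string {
	var values []string
	val := reflect.ValueOf(res)
	switch val.Kind() {
	case reflect.Slice:
		for i := 0; i < val.Len(); i++ {
			values = append(values, fmt.Sprint(val.Index(i)))
		}
	case reflect.Map:
		iter := val.MapRange()
		for iter.Next() {
			values = append(values, fmt.Sprint(iter.Value()))
		}
	}
	return values
}

func main() {
	w := csv.NewWriter(os.Stdout)
	defer w.Flush()

	// Slice export: column order is preserved.
	w.Write(toRecord([]string{"Alice", "30", "London"}))

	// Map export: column order depends on map iteration and may change between runs.
	w.Write(toRecord(map[string]string{"name": "Bob", "age": "25", "city": "Paris"}))
}
```
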
@@ -4,13 +4,14 @@ import (
 	"encoding/json"
 	"fmt"
 	"github.com/geziyor/geziyor"
+	"log"
 	"os"
 	"sync"
 )
 
 // JSONExporter exports response data as JSON streaming file
 type JSONExporter struct {
-	Filename   string
+	FileName   string
 	EscapeHTML bool
 
 	file *os.File
@@ -20,14 +21,14 @@ type JSONExporter struct {
 // Export exports response data as JSON streaming file
 func (e JSONExporter) Export(response *geziyor.Response) {
 
-	// Default Filename
-	if e.Filename == "" {
-		e.Filename = "out.json"
+	// Default filename
+	if e.FileName == "" {
+		e.FileName = "out.json"
 	}
 
-	// Create File
+	// Create file
 	e.once.Do(func() {
-		newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
+		newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
 			return
@@ -39,6 +40,8 @@ func (e JSONExporter) Export(response *geziyor.Response) {
 	for res := range response.Exports {
 		encoder := json.NewEncoder(e.file)
 		encoder.SetEscapeHTML(e.EscapeHTML)
-		encoder.Encode(res)
+		if err := encoder.Encode(res); err != nil {
+			log.Printf("JSON encoding error on exporter: %v\n", err)
+		}
 	}
 }

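Because the exporter calls `Encode` once per exported value, and each `Encode` writes one JSON document followed by a newline, the output file is effectively a stream of newline-delimited JSON records. A small standalone sketch of that behaviour, using os.Stdout instead of the exporter's file and made-up record values:

```go
package main

import (
	"encoding/json"
	"log"
	"os"
)

func main() {
	// Mirrors what the exporter does per exported value: one Encode call,
	// which writes the JSON document followed by '\n'.
	enc := json.NewEncoder(os.Stdout)
	enc.SetEscapeHTML(false) // with EscapeHTML=false, <, > and & stay literal

	records := []interface{}{
		map[string]string{"title": "Sample <b>item</b>", "url": "https://example.com/?a=1&b=2"},
		[]string{"another", "record"},
	}
	for _, res := range records {
		if err := enc.Encode(res); err != nil {
			log.Printf("JSON encoding error: %v\n", err)
		}
	}
}
```
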
geziyor.go
@@ -157,11 +157,13 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
 	bodyReader := io.LimitReader(resp.Body, g.opt.MaxBodySize)
 
 	// Start reading body and determine encoding
-	bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
-	if err != nil {
-		log.Printf("Determine encoding error: %v\n", err)
-		g.releaseSem(req)
-		return
+	if !g.opt.CharsetDetectDisabled {
+		bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
+		if err != nil {
+			log.Printf("Determine encoding error: %v\n", err)
+			g.releaseSem(req)
+			return
+		}
 	}
 
 	// Continue reading body

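The wrapper being made optional here is `charset.NewReader` from golang.org/x/net/html/charset, which transcodes the body to UTF-8 based on the Content-Type header (or byte sniffing). A minimal sketch of what that wrapping does, and what skipping it leaves you with; the sample bytes and Content-Type are invented for illustration:

```go
package main

import (
	"bytes"
	"fmt"
	"io"

	"golang.org/x/net/html/charset"
)

func main() {
	// "café" encoded as ISO-8859-1: é is the single byte 0xE9, which is not valid UTF-8.
	body := []byte{'c', 'a', 'f', 0xE9}

	// With detection enabled, charset.NewReader transcodes the stream to UTF-8.
	r, err := charset.NewReader(bytes.NewReader(body), "text/html; charset=ISO-8859-1")
	if err != nil {
		panic(err)
	}
	decoded, _ := io.ReadAll(r)
	fmt.Printf("decoded: %q\n", decoded) // "café" as valid UTF-8

	// With CharsetDetectDisabled set, the raw bytes would pass through untouched.
	fmt.Printf("raw:     %q\n", body) // "caf\xe9"
}
```

Skipping the transcoder saves the detection overhead, at the cost of getting raw bytes when a response is not already UTF-8.
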
@@ -46,4 +46,7 @@ type Options struct {
 
 	// Max body reading size in bytes
 	MaxBodySize int64
+
+	// Charset Detection disable
+	CharsetDetectDisabled bool
 }
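
Putting the two changes together, a crawler can now opt out of charset detection entirely. A minimal sketch of how that might look; `CharsetDetectDisabled` is the field added in this commit, while `StartURLs`, `ParseFunc`, `Response.Body` and the `NewGeziyor`/`Start` calls are assumed from the project's existing API and may differ slightly in this version:

```go
package main

import (
	"fmt"

	"github.com/geziyor/geziyor"
)

func main() {
	geziyor.NewGeziyor(geziyor.Options{
		StartURLs: []string{"http://quotes.toscrape.com/"},
		ParseFunc: func(r *geziyor.Response) {
			// With detection disabled the body is the raw bytes from the wire,
			// so this is only safe when responses are known to be UTF-8 already.
			fmt.Println(len(r.Body), "bytes")
		},
		// Skip charset.NewReader and its transcoding overhead.
		CharsetDetectDisabled: true,
	}).Start()
}
```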