Added support for multiple Exporters and custom Exporters.

This commit is contained in:
Musab Gültekin 2019-06-11 16:10:49 +03:00
parent e4e8723426
commit 3790295658
5 changed files with 61 additions and 33 deletions

View File

@ -7,7 +7,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
## Features
- 1.000+ Requests/Sec
- Caching (Memory/Disk)
- Automatic Data Exporting
- Automatic Data Exporting (JSON)
- Limit Concurrency (Global/Per Domain)
- Request Delays (Constant/Randomized)
- Automatic response decoding to UTF-8

View File

@ -1,30 +1,6 @@
package geziyor
import (
"encoding/json"
"fmt"
"os"
"sync"
)
var file *os.File
var once sync.Once
// Export exports response data as JSON streaming file
func Export(response *Response) {
once.Do(func() {
newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
return
}
file = newFile
})
for res := range response.Exports {
//fmt.Println(res)
encoder := json.NewEncoder(file)
encoder.SetEscapeHTML(false)
encoder.Encode(res)
}
// Exporter is the interface for writing extracted data to an external
// resource. Custom exporters implement it and are registered through
// Options.Exporters.
type Exporter interface {
// Export receives the exported values on the exports channel.
// NOTE(review): callers appear to run Export in its own goroutine per
// response; whether and when the channel is closed is not visible here —
// confirm before relying on the receive loop terminating.
Export(exports chan interface{})
}

43
exporter/json.go Normal file
View File

@ -0,0 +1,43 @@
package exporter
import (
"encoding/json"
"fmt"
"os"
"sync"
)
// JSONExporter exports response data as JSON streaming file
type JSONExporter struct {
Filename string
EscapeHTML bool
file *os.File
once sync.Once
}
// Export exports response data as JSON streaming file
func (e JSONExporter) Export(exports chan interface{}) {
// Default Filename
if e.Filename == "" {
e.Filename = "out.json"
}
// Create File
e.once.Do(func() {
newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
return
}
e.file = newFile
})
// Export data as responses came
for res := range exports {
encoder := json.NewEncoder(e.file)
encoder.SetEscapeHTML(e.EscapeHTML)
encoder.Encode(res)
}
}

View File

@ -4,6 +4,7 @@ import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/exporter"
"golang.org/x/net/html/charset"
"io/ioutil"
"log"
@ -66,6 +67,9 @@ func NewGeziyor(opt Options) *Geziyor {
if opt.LogDisabled {
log.SetOutput(ioutil.Discard)
}
if len(opt.Exporters) == 0 {
geziyor.opt.Exporters = []Exporter{exporter.JSONExporter{}}
}
return geziyor
}
@ -106,7 +110,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
defer g.wg.Done()
defer func() {
if r := recover(); r != nil {
log.Println(string(debug.Stack()))
log.Println(r, string(debug.Stack()))
}
}()
@ -168,7 +172,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
Response: resp,
Body: body,
Geziyor: g,
Exports: make(chan interface{}, 1),
Exports: make(chan interface{}),
}
// Create HTML Document
@ -176,8 +180,10 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(body))
}
// Export Function
go Export(&response)
// Export Functions
for _, exp := range g.opt.Exporters {
go exp.Export(response.Exports)
}
// Callbacks
if callback != nil {

View File

@ -40,4 +40,7 @@ type Options struct {
// Disable logging by setting this true
LogDisabled bool
// For extracting data
Exporters []Exporter
}