Multiple Exporters and custom Exporters support added.
This commit is contained in:
parent
e4e8723426
commit
3790295658
@ -7,7 +7,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
|
||||
## Features
|
||||
- 1.000+ Requests/Sec
|
||||
- Caching (Memory/Disk)
|
||||
- Automatic Data Exporting
|
||||
- Automatic Data Exporting (JSON)
|
||||
- Limit Concurrency (Global/Per Domain)
|
||||
- Request Delays (Constant/Randomized)
|
||||
- Automatic response decoding to UTF-8
|
||||
|
30
export.go
30
export.go
@ -1,30 +1,6 @@
|
||||
package geziyor
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
var file *os.File
|
||||
var once sync.Once
|
||||
|
||||
// Export exports response data as JSON streaming file
|
||||
func Export(response *Response) {
|
||||
once.Do(func() {
|
||||
newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
|
||||
return
|
||||
}
|
||||
file = newFile
|
||||
})
|
||||
|
||||
for res := range response.Exports {
|
||||
//fmt.Println(res)
|
||||
encoder := json.NewEncoder(file)
|
||||
encoder.SetEscapeHTML(false)
|
||||
encoder.Encode(res)
|
||||
}
|
||||
// Exporter is implemented by types that export data to external resources.
|
||||
type Exporter interface {
|
||||
// Export is handed the channel that carries the values to be exported.
// Geziyor.Do starts it with `go exp.Export(response.Exports)`, so it runs
// in its own goroutine; implementations are expected to receive from the
// channel (JSONExporter ranges over it until it is closed).
Export(exports chan interface{})
|
||||
}
|
||||
|
43
exporter/json.go
Normal file
43
exporter/json.go
Normal file
@ -0,0 +1,43 @@
|
||||
package exporter
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// JSONExporter exports response data as JSON streaming file
|
||||
type JSONExporter struct {
|
||||
Filename string
|
||||
EscapeHTML bool
|
||||
|
||||
file *os.File
|
||||
once sync.Once
|
||||
}
|
||||
|
||||
// Export exports response data as JSON streaming file
|
||||
func (e JSONExporter) Export(exports chan interface{}) {
|
||||
|
||||
// Default Filename
|
||||
if e.Filename == "" {
|
||||
e.Filename = "out.json"
|
||||
}
|
||||
|
||||
// Create File
|
||||
e.once.Do(func() {
|
||||
newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
|
||||
return
|
||||
}
|
||||
e.file = newFile
|
||||
})
|
||||
|
||||
// Export data as responses came
|
||||
for res := range exports {
|
||||
encoder := json.NewEncoder(e.file)
|
||||
encoder.SetEscapeHTML(e.EscapeHTML)
|
||||
encoder.Encode(res)
|
||||
}
|
||||
}
|
14
geziyor.go
14
geziyor.go
@ -4,6 +4,7 @@ import (
|
||||
"bytes"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/fpfeng/httpcache"
|
||||
"github.com/geziyor/geziyor/exporter"
|
||||
"golang.org/x/net/html/charset"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
@ -66,6 +67,9 @@ func NewGeziyor(opt Options) *Geziyor {
|
||||
if opt.LogDisabled {
|
||||
log.SetOutput(ioutil.Discard)
|
||||
}
|
||||
if len(opt.Exporters) == 0 {
|
||||
geziyor.opt.Exporters = []Exporter{exporter.JSONExporter{}}
|
||||
}
|
||||
|
||||
return geziyor
|
||||
}
|
||||
@ -106,7 +110,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
|
||||
defer g.wg.Done()
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Println(string(debug.Stack()))
|
||||
log.Println(r, string(debug.Stack()))
|
||||
}
|
||||
}()
|
||||
|
||||
@ -168,7 +172,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
|
||||
Response: resp,
|
||||
Body: body,
|
||||
Geziyor: g,
|
||||
Exports: make(chan interface{}, 1),
|
||||
Exports: make(chan interface{}),
|
||||
}
|
||||
|
||||
// Create HTML Document
|
||||
@ -176,8 +180,10 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
|
||||
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
}
|
||||
|
||||
// Export Function
|
||||
go Export(&response)
|
||||
// Export Functions
|
||||
for _, exp := range g.opt.Exporters {
|
||||
go exp.Export(response.Exports)
|
||||
}
|
||||
|
||||
// Callbacks
|
||||
if callback != nil {
|
||||
|
@ -40,4 +40,7 @@ type Options struct {
|
||||
|
||||
// Set this to true to disable logging
|
||||
LogDisabled bool
|
||||
|
||||
// Exporters receive the exported data; a JSONExporter is used when none are set
|
||||
Exporters []Exporter
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user