Multiple Exporters and custom Exporters support added.
This commit is contained in:
parent
e4e8723426
commit
3790295658
@ -7,7 +7,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
|
|||||||
## Features
|
## Features
|
||||||
- 1.000+ Requests/Sec
|
- 1.000+ Requests/Sec
|
||||||
- Caching (Memory/Disk)
|
- Caching (Memory/Disk)
|
||||||
- Automatic Data Exporting
|
- Automatic Data Exporting (JSON)
|
||||||
- Limit Concurrency (Global/Per Domain)
|
- Limit Concurrency (Global/Per Domain)
|
||||||
- Request Delays (Constant/Randomized)
|
- Request Delays (Constant/Randomized)
|
||||||
- Automatic response decoding to UTF-8
|
- Automatic response decoding to UTF-8
|
||||||
|
30
export.go
30
export.go
@ -1,30 +1,6 @@
|
|||||||
package geziyor
|
package geziyor
|
||||||
|
|
||||||
import (
|
// Exporter is the interface implemented by types that export scraped data to external resources.
|
||||||
"encoding/json"
|
type Exporter interface {
|
||||||
"fmt"
|
Export(exports chan interface{})
|
||||||
"os"
|
|
||||||
"sync"
|
|
||||||
)
|
|
||||||
|
|
||||||
var file *os.File
|
|
||||||
var once sync.Once
|
|
||||||
|
|
||||||
// Export exports response data as JSON streaming file
|
|
||||||
func Export(response *Response) {
|
|
||||||
once.Do(func() {
|
|
||||||
newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
file = newFile
|
|
||||||
})
|
|
||||||
|
|
||||||
for res := range response.Exports {
|
|
||||||
//fmt.Println(res)
|
|
||||||
encoder := json.NewEncoder(file)
|
|
||||||
encoder.SetEscapeHTML(false)
|
|
||||||
encoder.Encode(res)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
43
exporter/json.go
Normal file
43
exporter/json.go
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
package exporter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
// JSONExporter exports response data as JSON streaming file
|
||||||
|
type JSONExporter struct {
|
||||||
|
Filename string
|
||||||
|
EscapeHTML bool
|
||||||
|
|
||||||
|
file *os.File
|
||||||
|
once sync.Once
|
||||||
|
}
|
||||||
|
|
||||||
|
// Export exports response data as JSON streaming file
|
||||||
|
func (e JSONExporter) Export(exports chan interface{}) {
|
||||||
|
|
||||||
|
// Default Filename
|
||||||
|
if e.Filename == "" {
|
||||||
|
e.Filename = "out.json"
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create File
|
||||||
|
e.once.Do(func() {
|
||||||
|
newFile, err := os.OpenFile(e.Filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
e.file = newFile
|
||||||
|
})
|
||||||
|
|
||||||
|
// Export data as responses came
|
||||||
|
for res := range exports {
|
||||||
|
encoder := json.NewEncoder(e.file)
|
||||||
|
encoder.SetEscapeHTML(e.EscapeHTML)
|
||||||
|
encoder.Encode(res)
|
||||||
|
}
|
||||||
|
}
|
14
geziyor.go
14
geziyor.go
@ -4,6 +4,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"github.com/fpfeng/httpcache"
|
"github.com/fpfeng/httpcache"
|
||||||
|
"github.com/geziyor/geziyor/exporter"
|
||||||
"golang.org/x/net/html/charset"
|
"golang.org/x/net/html/charset"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
@ -66,6 +67,9 @@ func NewGeziyor(opt Options) *Geziyor {
|
|||||||
if opt.LogDisabled {
|
if opt.LogDisabled {
|
||||||
log.SetOutput(ioutil.Discard)
|
log.SetOutput(ioutil.Discard)
|
||||||
}
|
}
|
||||||
|
if len(opt.Exporters) == 0 {
|
||||||
|
geziyor.opt.Exporters = []Exporter{exporter.JSONExporter{}}
|
||||||
|
}
|
||||||
|
|
||||||
return geziyor
|
return geziyor
|
||||||
}
|
}
|
||||||
@ -106,7 +110,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
|
|||||||
defer g.wg.Done()
|
defer g.wg.Done()
|
||||||
defer func() {
|
defer func() {
|
||||||
if r := recover(); r != nil {
|
if r := recover(); r != nil {
|
||||||
log.Println(string(debug.Stack()))
|
log.Println(r, string(debug.Stack()))
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@ -168,7 +172,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
|
|||||||
Response: resp,
|
Response: resp,
|
||||||
Body: body,
|
Body: body,
|
||||||
Geziyor: g,
|
Geziyor: g,
|
||||||
Exports: make(chan interface{}, 1),
|
Exports: make(chan interface{}),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create HTML Document
|
// Create HTML Document
|
||||||
@ -176,8 +180,10 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
|
|||||||
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(body))
|
response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Export Function
|
// Export Functions
|
||||||
go Export(&response)
|
for _, exp := range g.opt.Exporters {
|
||||||
|
go exp.Export(response.Exports)
|
||||||
|
}
|
||||||
|
|
||||||
// Callbacks
|
// Callbacks
|
||||||
if callback != nil {
|
if callback != nil {
|
||||||
|
@ -40,4 +40,7 @@ type Options struct {
|
|||||||
|
|
||||||
// Disable logging by setting this true
|
// Disable logging by setting this true
|
||||||
LogDisabled bool
|
LogDisabled bool
|
||||||
|
|
||||||
|
// Exporters receive the data exported from responses. When empty, NewGeziyor installs a default JSON exporter.
|
||||||
|
Exporters []Exporter
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user