Exporters made optional, as some scrapers only want to see data in the console.

This commit is contained in:
Musab Gültekin 2019-06-11 18:59:37 +03:00
parent b8305d5e1a
commit bbdc3bcacd
4 changed files with 19 additions and 13 deletions

View File

@@ -1,6 +0,0 @@
package geziyor
// Exporter interface is for extracting data to external resources
// (e.g. JSON files or the console). Implementations read scraped
// items from the exports channel until it is closed.
type Exporter interface {
Export(exports chan interface{})
}

View File

@@ -3,6 +3,7 @@ package exporter
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/geziyor/geziyor"
"os" "os"
"sync" "sync"
) )
@@ -17,7 +18,7 @@ type JSONExporter struct {
} }
// Export exports response data as JSON streaming file // Export exports response data as JSON streaming file
func (e JSONExporter) Export(exports chan interface{}) { func (e JSONExporter) Export(response *geziyor.Response) {
// Default Filename // Default Filename
if e.Filename == "" { if e.Filename == "" {
@@ -35,7 +36,7 @@ func (e JSONExporter) Export(exports chan interface{}) {
}) })
// Export data as responses came // Export data as responses came
for res := range exports { for res := range response.Exports {
encoder := json.NewEncoder(e.file) encoder := json.NewEncoder(e.file)
encoder.SetEscapeHTML(e.EscapeHTML) encoder.SetEscapeHTML(e.EscapeHTML)
encoder.Encode(res) encoder.Encode(res)

View File

@@ -4,7 +4,6 @@ import (
"bytes" "bytes"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/exporter"
"golang.org/x/net/html/charset" "golang.org/x/net/html/charset"
"io" "io"
"io/ioutil" "io/ioutil"
@@ -18,6 +17,11 @@ import (
"time" "time"
) )
// Exporter interface is for extracting data to external resources.
// Implementations receive the full Response and read scraped items
// from its Exports channel (see exporter.JSONExporter, which ranges
// over response.Exports).
type Exporter interface {
Export(exports *Response)
}
// Geziyor is our main scraper type // Geziyor is our main scraper type
type Geziyor struct { type Geziyor struct {
client *http.Client client *http.Client
@@ -68,9 +72,6 @@ func NewGeziyor(opt Options) *Geziyor {
if opt.LogDisabled { if opt.LogDisabled {
log.SetOutput(ioutil.Discard) log.SetOutput(ioutil.Discard)
} }
if len(opt.Exporters) == 0 {
geziyor.opt.Exporters = []Exporter{exporter.JSONExporter{}}
}
if opt.MaxBodySize == 0 { if opt.MaxBodySize == 0 {
geziyor.opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB geziyor.opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB
} }
@@ -189,7 +190,15 @@ func (g *Geziyor) Do(req *http.Request, callback func(resp *Response)) {
// Export Functions // Export Functions
for _, exp := range g.opt.Exporters { for _, exp := range g.opt.Exporters {
go exp.Export(response.Exports) go exp.Export(&response)
}
// Drain exports chan if no exporters added
if len(g.opt.Exporters) == 0 {
go func() {
for range response.Exports {
}
}()
} }
// Callbacks // Callbacks

View File

@@ -5,6 +5,7 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor" "github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/exporter"
"math/rand" "math/rand"
"testing" "testing"
"time" "time"
@@ -36,6 +37,7 @@ func TestQuotes(t *testing.T) {
geziyor.NewGeziyor(geziyor.Options{ geziyor.NewGeziyor(geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"}, StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: quotesParse, ParseFunc: quotesParse,
Exporters: []geziyor.Exporter{exporter.JSONExporter{}},
}).Start() }).Start()
} }