Middlewares and some string util functions refactored. Added partial Documentation.

This commit is contained in:
Musab Gültekin 2019-06-16 10:38:03 +03:00
parent 40f673f2e2
commit 80383ebd6f
12 changed files with 219 additions and 152 deletions

View File

@ -1,11 +1,6 @@
language: go language: go
go: go:
- 1.5.x
- 1.6.x
- 1.7.x
- 1.8.x
- 1.9.x
- 1.10.x - 1.10.x
- 1.11.x - 1.11.x
- tip - tip

View File

@ -19,13 +19,13 @@ See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for
## Status ## Status
Since the project is in **development phase**, **API may change in time**. Thus, we highly recommend you to use Geziyor with go modules. Since the project is in **development phase**, **API may change in time**. Thus, we highly recommend you to use Geziyor with go modules.
## Usage ## Examples
Simple usage Simple usage
```go ```go
geziyor.NewGeziyor(geziyor.Options{ geziyor.NewGeziyor(geziyor.Options{
StartURLs: []string{"http://api.ipify.org"}, StartURLs: []string{"http://api.ipify.org"},
ParseFunc: func(r *geziyor.Response) { ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
}, },
}).Start() }).Start()
@ -42,21 +42,76 @@ func main() {
}).Start() }).Start()
} }
func quotesParse(r *geziyor.Response) { func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) {
r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) { r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) {
r.Geziyor.Exports <- map[string]interface{}{ g.Exports <- map[string]interface{}{
"text": s.Find("span.text").Text(), "text": s.Find("span.text").Text(),
"author": s.Find("small.author").Text(), "author": s.Find("small.author").Text(),
} }
}) })
if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok {
r.Geziyor.Get(r.JoinURL(href), quotesParse) g.Get(r.JoinURL(href), quotesParse)
} }
} }
``` ```
See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for more usage examples. See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for more usage examples.
## Installation
## Documentation
### Installation
go get github.com/geziyor/geziyor go get github.com/geziyor/geziyor
### Making Requests
Initial requests start with ```StartURLs []string``` field in ```Options```.
Geziyor makes concurrent requests to those URLs.
After reading the response, ```ParseFunc func(g *Geziyor, r *Response)``` is called.
```go
geziyor.NewGeziyor(geziyor.Options{
StartURLs: []string{"http://api.ipify.org"},
ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
fmt.Println(string(r.Body))
},
}).Start()
```
If you want to manually create first requests, set ```StartRequestsFunc```.
```StartURLs``` won't be used if you create requests manually.
You can make following requests using ```Geziyor``` methods:
- ```Get```: Make GET request
- ```GetRendered```: Make GET and render Javascript using Headless Browser.
As it opens up a real browser, it takes a couple of seconds to make requests.
- ```Head```: Make HEAD request
- ```Do```: Make custom request by providing *geziyor.Request
```go
geziyor.NewGeziyor(geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) {
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
g.Head("https://httpbin.org/anything", g.Opt.ParseFunc)
},
ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
fmt.Println(string(r.Body))
},
}).Start()
```
## Roadmap
If you're interested in helping this project, please consider these features:
- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html))
- Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))
- Deploying Scrapers to Cloud
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
- Realtime metrics (Prometheus etc.)

View File

@ -3,37 +3,28 @@ package exporter
import ( import (
"encoding/csv" "encoding/csv"
"fmt" "fmt"
"github.com/geziyor/geziyor/internal"
"log" "log"
"os" "os"
"reflect" "reflect"
"sync"
) )
// CSVExporter exports response data as CSV streaming file // CSVExporter exports response data as CSV streaming file
type CSVExporter struct { type CSVExporter struct {
FileName string FileName string
once sync.Once
writer *csv.Writer
} }
// Export exports response data as CSV streaming file // Export exports response data as CSV streaming file
func (e *CSVExporter) Export(exports chan interface{}) { func (e *CSVExporter) Export(exports chan interface{}) {
// Default filename
if e.FileName == "" {
e.FileName = "out.csv"
}
// Create file // Create file
e.once.Do(func() { newFile, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "output file creation error: %v", err) log.Printf("output file creation error: %v", err)
return return
} }
e.writer = csv.NewWriter(newFile)
}) writer := csv.NewWriter(newFile)
// Export data as responses came // Export data as responses came
for res := range exports { for res := range exports {
@ -55,10 +46,9 @@ func (e *CSVExporter) Export(exports chan interface{}) {
// } // }
} }
// Write to file if err := writer.Write(values); err != nil {
if err := e.writer.Write(values); err != nil {
log.Printf("CSV writing error on exporter: %v\n", err) log.Printf("CSV writing error on exporter: %v\n", err)
} }
} }
e.writer.Flush() writer.Flush()
} }

View File

@ -2,43 +2,33 @@ package exporter
import ( import (
"encoding/json" "encoding/json"
"fmt" "github.com/geziyor/geziyor/internal"
"log" "log"
"os" "os"
"sync"
) )
// JSONExporter exports response data as JSON streaming file // JSONExporter exports response data as JSON streaming file
type JSONExporter struct { type JSONExporter struct {
FileName string FileName string
EscapeHTML bool EscapeHTML bool
once sync.Once
encoder *json.Encoder
} }
// Export exports response data as JSON streaming file // Export exports response data as JSON streaming file
func (e *JSONExporter) Export(exports chan interface{}) { func (e *JSONExporter) Export(exports chan interface{}) {
// Default filename
if e.FileName == "" {
e.FileName = "out.json"
}
// Create file // Create file
e.once.Do(func() { newFile, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "output file creation error: %v", err) log.Printf("output file creation error: %v", err)
return return
} }
e.encoder = json.NewEncoder(newFile)
e.encoder.SetEscapeHTML(e.EscapeHTML) encoder := json.NewEncoder(newFile)
}) encoder.SetEscapeHTML(e.EscapeHTML)
// Export data as responses came // Export data as responses came
for res := range exports { for res := range exports {
if err := e.encoder.Encode(res); err != nil { if err := encoder.Encode(res); err != nil {
log.Printf("JSON encoding error on exporter: %v\n", err) log.Printf("JSON encoding error on exporter: %v\n", err)
} }
} }

View File

@ -7,12 +7,12 @@ import (
"github.com/chromedp/cdproto/dom" "github.com/chromedp/cdproto/dom"
"github.com/chromedp/chromedp" "github.com/chromedp/chromedp"
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/internal"
"golang.org/x/net/html/charset" "golang.org/x/net/html/charset"
"io" "io"
"io/ioutil" "io/ioutil"
"log" "log"
"math/rand" "math/rand"
"net"
"net/http" "net/http"
"os" "os"
"runtime/debug" "runtime/debug"
@ -25,10 +25,6 @@ type Exporter interface {
Export(exports chan interface{}) Export(exports chan interface{})
} }
// RequestMiddleware called before requests made.
// Set request.Cancelled = true to cancel request
type RequestMiddleware func(g *Geziyor, r *Request)
// Geziyor is our main scraper type // Geziyor is our main scraper type
type Geziyor struct { type Geziyor struct {
Opt Options Opt Options
@ -54,22 +50,7 @@ func init() {
// If options provided, options // If options provided, options
func NewGeziyor(opt Options) *Geziyor { func NewGeziyor(opt Options) *Geziyor {
geziyor := &Geziyor{ geziyor := &Geziyor{
client: &http.Client{ client: internal.NewClient(),
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
DualStack: true,
}).DialContext,
MaxIdleConns: 0, // Default: 100
MaxIdleConnsPerHost: 1000, // Default: 2
IdleConnTimeout: 90 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
},
Timeout: time.Second * 180, // Google's timeout
},
Opt: opt, Opt: opt,
Exports: make(chan interface{}), Exports: make(chan interface{}),
requestMiddlewares: []RequestMiddleware{ requestMiddlewares: []RequestMiddleware{
@ -79,6 +60,12 @@ func NewGeziyor(opt Options) *Geziyor {
}, },
} }
if opt.UserAgent == "" {
geziyor.Opt.UserAgent = "Geziyor 1.0"
}
if opt.MaxBodySize == 0 {
geziyor.Opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB
}
if opt.Cache != nil { if opt.Cache != nil {
geziyor.client.Transport = &httpcache.Transport{ geziyor.client.Transport = &httpcache.Transport{
Transport: geziyor.client.Transport, Cache: opt.Cache, MarkCachedResponses: true} Transport: geziyor.client.Transport, Cache: opt.Cache, MarkCachedResponses: true}
@ -95,15 +82,9 @@ func NewGeziyor(opt Options) *Geziyor {
hostSems map[string]chan struct{} hostSems map[string]chan struct{}
}{hostSems: make(map[string]chan struct{})} }{hostSems: make(map[string]chan struct{})}
} }
if opt.UserAgent == "" {
geziyor.Opt.UserAgent = "Geziyor 1.0"
}
if opt.LogDisabled { if opt.LogDisabled {
log.SetOutput(ioutil.Discard) log.SetOutput(ioutil.Discard)
} }
if opt.MaxBodySize == 0 {
geziyor.Opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB
}
geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...) geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)
return geziyor return geziyor
@ -113,6 +94,7 @@ func NewGeziyor(opt Options) *Geziyor {
func (g *Geziyor) Start() { func (g *Geziyor) Start() {
log.Println("Scraping Started") log.Println("Scraping Started")
// Start Exporters
if len(g.Opt.Exporters) != 0 { if len(g.Opt.Exporters) != 0 {
for _, exp := range g.Opt.Exporters { for _, exp := range g.Opt.Exporters {
go exp.Export(g.Exports) go exp.Export(g.Exports)
@ -124,6 +106,7 @@ func (g *Geziyor) Start() {
}() }()
} }
// Start Requests
if g.Opt.StartRequestsFunc == nil { if g.Opt.StartRequestsFunc == nil {
for _, startURL := range g.Opt.StartURLs { for _, startURL := range g.Opt.StartURLs {
g.Get(startURL, g.Opt.ParseFunc) g.Get(startURL, g.Opt.ParseFunc)
@ -138,7 +121,7 @@ func (g *Geziyor) Start() {
} }
// Get issues a GET to the specified URL. // Get issues a GET to the specified URL.
func (g *Geziyor) Get(url string, callback func(resp *Response)) { func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *Response)) {
req, err := http.NewRequest("GET", url, nil) req, err := http.NewRequest("GET", url, nil)
if err != nil { if err != nil {
log.Printf("Request creating error %v\n", err) log.Printf("Request creating error %v\n", err)
@ -150,7 +133,7 @@ func (g *Geziyor) Get(url string, callback func(resp *Response)) {
// GetRendered issues GET request using headless browser // GetRendered issues GET request using headless browser
// Opens up a new Chrome instance, makes request, waits for 1 second to render HTML DOM and closed. // Opens up a new Chrome instance, makes request, waits for 1 second to render HTML DOM and closed.
// Rendered requests only supported for GET requests. // Rendered requests only supported for GET requests.
func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) { func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *Response)) {
req, err := http.NewRequest("GET", url, nil) req, err := http.NewRequest("GET", url, nil)
if err != nil { if err != nil {
log.Printf("Request creating error %v\n", err) log.Printf("Request creating error %v\n", err)
@ -160,7 +143,7 @@ func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) {
} }
// Head issues a HEAD to the specified URL // Head issues a HEAD to the specified URL
func (g *Geziyor) Head(url string, callback func(resp *Response)) { func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *Response)) {
req, err := http.NewRequest("HEAD", url, nil) req, err := http.NewRequest("HEAD", url, nil)
if err != nil { if err != nil {
log.Printf("Request creating error %v\n", err) log.Printf("Request creating error %v\n", err)
@ -170,13 +153,13 @@ func (g *Geziyor) Head(url string, callback func(resp *Response)) {
} }
// Do sends an HTTP request // Do sends an HTTP request
func (g *Geziyor) Do(req *Request, callback func(resp *Response)) { func (g *Geziyor) Do(req *Request, callback func(g *Geziyor, r *Response)) {
g.wg.Add(1) g.wg.Add(1)
go g.do(req, callback) go g.do(req, callback)
} }
// Do sends an HTTP request // Do sends an HTTP request
func (g *Geziyor) do(req *Request, callback func(resp *Response)) { func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) {
defer g.wg.Done() defer g.wg.Done()
defer func() { defer func() {
if r := recover(); r != nil { if r := recover(); r != nil {
@ -209,10 +192,10 @@ func (g *Geziyor) do(req *Request, callback func(resp *Response)) {
// Callbacks // Callbacks
if callback != nil { if callback != nil {
callback(response) callback(g, response)
} else { } else {
if g.Opt.ParseFunc != nil { if g.Opt.ParseFunc != nil {
g.Opt.ParseFunc(response) g.Opt.ParseFunc(g, response)
} }
} }
} }
@ -239,7 +222,7 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
bodyReader := io.LimitReader(resp.Body, g.Opt.MaxBodySize) bodyReader := io.LimitReader(resp.Body, g.Opt.MaxBodySize)
// Start reading body and determine encoding // Start reading body and determine encoding
if !g.Opt.CharsetDetectDisabled { if !g.Opt.CharsetDetectDisabled && resp.Request.Method != "HEAD" {
bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type")) bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
if err != nil { if err != nil {
log.Printf("Determine encoding error: %v\n", err) log.Printf("Determine encoding error: %v\n", err)
@ -257,7 +240,6 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
Response: resp, Response: resp,
Body: body, Body: body,
Meta: req.Meta, Meta: req.Meta,
Geziyor: g,
} }
return &response, nil return &response, nil
@ -290,14 +272,13 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
return nil, err return nil, err
} }
response := &Response{ response := Response{
//Response: resp, //Response: resp,
Body: []byte(res), Body: []byte(res),
Meta: req.Meta, Meta: req.Meta,
Geziyor: g,
} }
return response, nil return &response, nil
} }
func (g *Geziyor) acquireSem(req *Request) { func (g *Geziyor) acquireSem(req *Request) {

View File

@ -15,7 +15,7 @@ import (
func TestSimple(t *testing.T) { func TestSimple(t *testing.T) {
geziyor.NewGeziyor(geziyor.Options{ geziyor.NewGeziyor(geziyor.Options{
StartURLs: []string{"http://api.ipify.org"}, StartURLs: []string{"http://api.ipify.org"},
ParseFunc: func(r *geziyor.Response) { ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
}, },
}).Start() }).Start()
@ -23,16 +23,15 @@ func TestSimple(t *testing.T) {
func TestSimpleCache(t *testing.T) { func TestSimpleCache(t *testing.T) {
defer leaktest.Check(t)() defer leaktest.Check(t)()
gez := geziyor.NewGeziyor(geziyor.Options{ geziyor.NewGeziyor(geziyor.Options{
StartURLs: []string{"http://api.ipify.org"}, StartURLs: []string{"http://api.ipify.org"},
Cache: httpcache.NewMemoryCache(), Cache: httpcache.NewMemoryCache(),
ParseFunc: func(r *geziyor.Response) { ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
r.Geziyor.Exports <- string(r.Body) g.Exports <- string(r.Body)
r.Geziyor.Get("http://api.ipify.org", nil) g.Get("http://api.ipify.org", nil)
}, },
}) }).Start()
gez.Start()
} }
func TestQuotes(t *testing.T) { func TestQuotes(t *testing.T) {
@ -44,10 +43,10 @@ func TestQuotes(t *testing.T) {
}).Start() }).Start()
} }
func quotesParse(r *geziyor.Response) { func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) {
r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) { r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) {
// Export Data // Export Data
r.Geziyor.Exports <- map[string]interface{}{ g.Exports <- map[string]interface{}{
"number": i, "number": i,
"text": s.Find("span.text").Text(), "text": s.Find("span.text").Text(),
"author": s.Find("small.author").Text(), "author": s.Find("small.author").Text(),
@ -59,7 +58,7 @@ func quotesParse(r *geziyor.Response) {
// Next Page // Next Page
if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok {
r.Geziyor.Get(r.JoinURL(href), quotesParse) g.Get(r.JoinURL(href), quotesParse)
} }
} }
@ -69,11 +68,11 @@ func TestAllLinks(t *testing.T) {
geziyor.NewGeziyor(geziyor.Options{ geziyor.NewGeziyor(geziyor.Options{
AllowedDomains: []string{"books.toscrape.com"}, AllowedDomains: []string{"books.toscrape.com"},
StartURLs: []string{"http://books.toscrape.com/"}, StartURLs: []string{"http://books.toscrape.com/"},
ParseFunc: func(r *geziyor.Response) { ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
r.Geziyor.Exports <- []string{r.Request.URL.String()} g.Exports <- []string{r.Request.URL.String()}
r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) { r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) {
if href, ok := s.Attr("href"); ok { if href, ok := s.Attr("href"); ok {
r.Geziyor.Get(r.JoinURL(href), r.Geziyor.Opt.ParseFunc) g.Get(r.JoinURL(href), g.Opt.ParseFunc)
} }
}) })
}, },
@ -95,9 +94,9 @@ func TestStartRequestsFunc(t *testing.T) {
StartRequestsFunc: func(g *geziyor.Geziyor) { StartRequestsFunc: func(g *geziyor.Geziyor) {
g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc) g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc)
}, },
ParseFunc: func(r *geziyor.Response) { ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
r.DocHTML.Find("a").Each(func(_ int, s *goquery.Selection) { r.DocHTML.Find("a").Each(func(_ int, s *goquery.Selection) {
r.Geziyor.Exports <- s.AttrOr("href", "") g.Exports <- s.AttrOr("href", "")
}) })
}, },
Exporters: []geziyor.Exporter{&exporter.JSONExporter{}}, Exporters: []geziyor.Exporter{&exporter.JSONExporter{}},
@ -109,9 +108,20 @@ func TestGetRendered(t *testing.T) {
StartRequestsFunc: func(g *geziyor.Geziyor) { StartRequestsFunc: func(g *geziyor.Geziyor) {
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc) g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
}, },
ParseFunc: func(r *geziyor.Response) { ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
}, },
//URLRevisitEnabled: true, //URLRevisitEnabled: true,
}).Start() }).Start()
} }
func TestHEADRequest(t *testing.T) {
geziyor.NewGeziyor(geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) {
g.Head("https://httpbin.org/anything", g.Opt.ParseFunc)
},
ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
fmt.Println(string(r.Body))
},
}).Start()
}

35
internal/http.go Normal file
View File

@ -0,0 +1,35 @@
package internal
import (
"net"
"net/http"
"time"
)
// NewClient builds an *http.Client tuned for a typical web scraper:
// a generous overall timeout and a transport that keeps many idle
// connections per host so repeated requests to the same site reuse sockets.
func NewClient() *http.Client {
	dialer := &net.Dialer{
		Timeout:   30 * time.Second,
		KeepAlive: 30 * time.Second,
		DualStack: true,
	}
	transport := &http.Transport{
		Proxy:                 http.ProxyFromEnvironment,
		DialContext:           dialer.DialContext,
		MaxIdleConns:          0,    // Default: 100
		MaxIdleConnsPerHost:   1000, // Default: 2
		IdleConnTimeout:       90 * time.Second,
		TLSHandshakeTimeout:   10 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
	}
	return &http.Client{
		Transport: transport,
		Timeout:   time.Second * 180, // Google's timeout
	}
}
// SetDefaultHeader sets key to value only when the header does not
// already carry a value for key, and returns the (mutated) header.
func SetDefaultHeader(header http.Header, key string, value string) http.Header {
	if existing := header.Get(key); existing == "" {
		header.Set(key, value)
	}
	return header
}

19
internal/strings.go Normal file
View File

@ -0,0 +1,19 @@
package internal
// PreferFirst returns first when it is non-empty, otherwise second.
// Useful for applying a default value to an optional string field.
func PreferFirst(first string, second string) string {
	if first == "" {
		return second
	}
	return first
}
// Contains reports whether e is present in the string slice s.
func Contains(s []string, e string) bool {
	for _, candidate := range s {
		if candidate == e {
			return true
		}
	}
	return false
}

35
middleware.go Normal file
View File

@ -0,0 +1,35 @@
package geziyor
import "github.com/geziyor/geziyor/internal"
// RequestMiddleware is called before each request is made.
// Set r.Cancelled = true inside a middleware to cancel the request.
type RequestMiddleware func(g *Geziyor, r *Request)
// allowedDomainsMiddleware cancels any request whose host is not listed
// in Opt.AllowedDomains. An empty AllowedDomains list allows every host.
func allowedDomainsMiddleware(g *Geziyor, r *Request) {
	if len(g.Opt.AllowedDomains) == 0 {
		return
	}
	if !internal.Contains(g.Opt.AllowedDomains, r.Host) {
		//log.Printf("Domain not allowed: %s\n", req.Host)
		r.Cancelled = true
	}
}
// duplicateRequestsMiddleware cancels requests whose URL+method pair has
// already been visited. Disabled entirely when Opt.URLRevisitEnabled is set.
func duplicateRequestsMiddleware(g *Geziyor, r *Request) {
	if g.Opt.URLRevisitEnabled {
		return
	}
	key := r.Request.URL.String() + r.Request.Method
	if _, seen := g.visitedURLs.LoadOrStore(key, struct{}{}); seen {
		//log.Printf("URL already visited %s\n", rawURL)
		r.Cancelled = true
	}
}
// defaultHeadersMiddleware fills in standard request headers
// (Accept, charset, language, User-Agent) that the caller did not set.
func defaultHeadersMiddleware(g *Geziyor, r *Request) {
	defaults := [][2]string{
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
		{"Accept-Charset", "utf-8"},
		{"Accept-Language", "en"},
		{"User-Agent", g.Opt.UserAgent},
	}
	for _, kv := range defaults {
		r.Header = internal.SetDefaultHeader(r.Header, kv[0], kv[1])
	}
}

View File

@ -18,7 +18,7 @@ type Options struct {
StartRequestsFunc func(g *Geziyor) StartRequestsFunc func(g *Geziyor)
// ParseFunc is callback of StartURLs response. // ParseFunc is callback of StartURLs response.
ParseFunc func(r *Response) ParseFunc func(g *Geziyor, r *Response)
// Timeout is global request timeout // Timeout is global request timeout
Timeout time.Duration Timeout time.Duration
@ -33,7 +33,7 @@ type Options struct {
// Concurrent requests per domain limit // Concurrent requests per domain limit
ConcurrentRequestsPerDomain int ConcurrentRequestsPerDomain int
// User Agent // User Agent. Default: "Geziyor 1.0"
UserAgent string UserAgent string
// Request delays // Request delays
@ -50,7 +50,7 @@ type Options struct {
// Called before requests made to manipulate requests // Called before requests made to manipulate requests
RequestMiddlewares []RequestMiddleware RequestMiddlewares []RequestMiddleware
// Max body reading size in bytes // Max body reading size in bytes. Default: 1GB
MaxBodySize int64 MaxBodySize int64
// Charset Detection disable // Charset Detection disable

View File

@ -11,44 +11,3 @@ type Request struct {
Rendered bool Rendered bool
Cancelled bool Cancelled bool
} }
func allowedDomainsMiddleware(g *Geziyor, r *Request) {
if len(g.Opt.AllowedDomains) != 0 && !contains(g.Opt.AllowedDomains, r.Host) {
//log.Printf("Domain not allowed: %s\n", req.Host)
r.Cancelled = true
return
}
}
func duplicateRequestsMiddleware(g *Geziyor, r *Request) {
if !g.Opt.URLRevisitEnabled {
if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited {
//log.Printf("URL already visited %s\n", rawURL)
r.Cancelled = true
}
}
}
func defaultHeadersMiddleware(g *Geziyor, r *Request) {
r.Header = headerSetDefault(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
r.Header = headerSetDefault(r.Header, "Accept-Charset", "utf-8")
r.Header = headerSetDefault(r.Header, "Accept-Language", "en")
r.Header = headerSetDefault(r.Header, "User-Agent", g.Opt.UserAgent)
}
func headerSetDefault(header http.Header, key string, value string) http.Header {
if header.Get(key) == "" {
header.Set(key, value)
}
return header
}
// contains checks whether []string contains string
func contains(s []string, e string) bool {
for _, a := range s {
if a == e {
return true
}
}
return false
}

View File

@ -14,8 +14,6 @@ type Response struct {
Body []byte Body []byte
DocHTML *goquery.Document DocHTML *goquery.Document
Meta map[string]interface{} Meta map[string]interface{}
Geziyor *Geziyor
} }
// JoinURL joins base response URL and provided relative URL. // JoinURL joins base response URL and provided relative URL.