http package renamed to client to reduce confusion

This commit is contained in:
Musab Gültekin 2019-06-29 14:18:31 +03:00
parent 1e109c555d
commit bd6466a5f2
8 changed files with 73 additions and 73 deletions

View File

@ -27,7 +27,7 @@ Simple usage
```go ```go
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://api.ipify.org"}, StartURLs: []string{"http://api.ipify.org"},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
}, },
}).Start() }).Start()
@ -44,7 +44,7 @@ func main() {
}).Start() }).Start()
} }
func quotesParse(g *geziyor.Geziyor, r *http.Response) { func quotesParse(g *geziyor.Geziyor, r *client.Response) {
r.HTMLDoc.Find("div.quote").Each(func(i int, s *goquery.Selection) { r.HTMLDoc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
g.Exports <- map[string]interface{}{ g.Exports <- map[string]interface{}{
"text": s.Find("span.text").Text(), "text": s.Find("span.text").Text(),
@ -78,7 +78,7 @@ After reading response, ```ParseFunc func(g *Geziyor, r *Response)``` called.
```go ```go
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://api.ipify.org"}, StartURLs: []string{"http://api.ipify.org"},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
}, },
}).Start() }).Start()
@ -95,7 +95,7 @@ geziyor.NewGeziyor(&geziyor.Options{
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc) g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
g.Head("https://httpbin.org/anything", g.Opt.ParseFunc) g.Head("https://httpbin.org/anything", g.Opt.ParseFunc)
}, },
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
}, },
}).Start() }).Start()
@ -130,7 +130,7 @@ If response isn't HTML, ```response.HTMLDoc``` would be ```nil```.
```go ```go
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"}, StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
r.HTMLDoc.Find("div.quote").Each(func(_ int, s *goquery.Selection) { r.HTMLDoc.Find("div.quote").Each(func(_ int, s *goquery.Selection) {
log.Println(s.Find("span.text").Text(), s.Find("small.author").Text()) log.Println(s.Find("span.text").Text(), s.Find("small.author").Text())
}) })
@ -146,7 +146,7 @@ You can export data automatically using exporters. Just send data to ```Geziyor.
```go ```go
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"}, StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
r.HTMLDoc.Find("div.quote").Each(func(_ int, s *goquery.Selection) { r.HTMLDoc.Find("div.quote").Each(func(_ int, s *goquery.Selection) {
g.Exports <- map[string]interface{}{ g.Exports <- map[string]interface{}{
"text": s.Find("span.text").Text(), "text": s.Find("span.text").Text(),

View File

@ -1,4 +1,4 @@
package http package client
import ( import (
"errors" "errors"

View File

@ -1,4 +1,4 @@
package http package client
import ( import (
"io" "io"

View File

@ -1,4 +1,4 @@
package http package client
import ( import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"

View File

@ -7,7 +7,7 @@ import (
"github.com/chromedp/cdproto/network" "github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp" "github.com/chromedp/chromedp"
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/http" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/client_golang/prometheus/promhttp"
@ -15,7 +15,7 @@ import (
"io" "io"
"io/ioutil" "io/ioutil"
"log" "log"
stdhttp "net/http" "net/http"
"net/http/cookiejar" "net/http/cookiejar"
"net/url" "net/url"
"sync" "sync"
@ -36,7 +36,7 @@ type Exporter interface {
// Geziyor is our main scraper type // Geziyor is our main scraper type
type Geziyor struct { type Geziyor struct {
Opt *Options Opt *Options
Client *http.Client Client *client.Client
Exports chan interface{} Exports chan interface{}
metrics *metrics.Metrics metrics *metrics.Metrics
@ -55,7 +55,7 @@ type Geziyor struct {
// If options provided, options // If options provided, options
func NewGeziyor(opt *Options) *Geziyor { func NewGeziyor(opt *Options) *Geziyor {
geziyor := &Geziyor{ geziyor := &Geziyor{
Client: http.NewClient(), Client: client.NewClient(),
Opt: opt, Opt: opt,
Exports: make(chan interface{}), Exports: make(chan interface{}),
requestMiddlewares: []RequestMiddleware{ requestMiddlewares: []RequestMiddleware{
@ -114,10 +114,10 @@ func (g *Geziyor) Start() {
// Metrics // Metrics
if g.Opt.MetricsType == metrics.Prometheus { if g.Opt.MetricsType == metrics.Prometheus {
metricsServer := &stdhttp.Server{Addr: ":2112"} metricsServer := &http.Server{Addr: ":2112"}
defer metricsServer.Close() defer metricsServer.Close()
go func() { go func() {
stdhttp.Handle("/metrics", promhttp.Handler()) http.Handle("/metrics", promhttp.Handler())
metricsServer.ListenAndServe() metricsServer.ListenAndServe()
}() }()
} }
@ -149,39 +149,39 @@ func (g *Geziyor) Start() {
} }
// Get issues a GET to the specified URL. // Get issues a GET to the specified URL.
func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *http.Response)) { func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := stdhttp.NewRequest("GET", url, nil) req, err := http.NewRequest("GET", url, nil)
if err != nil { if err != nil {
log.Printf("Request creating error %v\n", err) log.Printf("Request creating error %v\n", err)
return return
} }
g.Do(&http.Request{Request: req}, callback) g.Do(&client.Request{Request: req}, callback)
} }
// GetRendered issues GET request using headless browser // GetRendered issues GET request using headless browser
// Opens up a new Chrome instance, makes request, waits for 1 second to render HTML DOM and closed. // Opens up a new Chrome instance, makes request, waits for 1 second to render HTML DOM and closed.
// Rendered requests only supported for GET requests. // Rendered requests only supported for GET requests.
func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *http.Response)) { func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := stdhttp.NewRequest("GET", url, nil) req, err := http.NewRequest("GET", url, nil)
if err != nil { if err != nil {
log.Printf("Request creating error %v\n", err) log.Printf("Request creating error %v\n", err)
return return
} }
g.Do(&http.Request{Request: req, Rendered: true}, callback) g.Do(&client.Request{Request: req, Rendered: true}, callback)
} }
// Head issues a HEAD to the specified URL // Head issues a HEAD to the specified URL
func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *http.Response)) { func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := stdhttp.NewRequest("HEAD", url, nil) req, err := http.NewRequest("HEAD", url, nil)
if err != nil { if err != nil {
log.Printf("Request creating error %v\n", err) log.Printf("Request creating error %v\n", err)
return return
} }
g.Do(&http.Request{Request: req}, callback) g.Do(&client.Request{Request: req}, callback)
} }
// Do sends an HTTP request // Do sends an HTTP request
func (g *Geziyor) Do(req *http.Request, callback func(g *Geziyor, r *http.Response)) { func (g *Geziyor) Do(req *client.Request, callback func(g *Geziyor, r *client.Response)) {
if req.Synchronized { if req.Synchronized {
g.do(req, callback) g.do(req, callback)
} else { } else {
@ -191,7 +191,7 @@ func (g *Geziyor) Do(req *http.Request, callback func(g *Geziyor, r *http.Respon
} }
// Do sends an HTTP request // Do sends an HTTP request
func (g *Geziyor) do(req *http.Request, callback func(g *Geziyor, r *http.Response)) { func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Response)) {
g.acquireSem(req) g.acquireSem(req)
defer g.releaseSem(req) defer g.releaseSem(req)
if !req.Synchronized { if !req.Synchronized {
@ -207,7 +207,7 @@ func (g *Geziyor) do(req *http.Request, callback func(g *Geziyor, r *http.Respon
} }
// Do request normal or Chrome and read response // Do request normal or Chrome and read response
var response *http.Response var response *client.Response
var err error var err error
if !req.Rendered { if !req.Rendered {
response, err = g.doRequestClient(req) response, err = g.doRequestClient(req)
@ -233,7 +233,7 @@ func (g *Geziyor) do(req *http.Request, callback func(g *Geziyor, r *http.Respon
} }
} }
func (g *Geziyor) doRequestClient(req *http.Request) (*http.Response, error) { func (g *Geziyor) doRequestClient(req *client.Request) (*client.Response, error) {
// Do request // Do request
resp, err := g.Client.Do(req.Request) resp, err := g.Client.Do(req.Request)
@ -260,7 +260,7 @@ func (g *Geziyor) doRequestClient(req *http.Request) (*http.Response, error) {
return nil, errors.Wrap(err, "Reading body error") return nil, errors.Wrap(err, "Reading body error")
} }
response := http.Response{ response := client.Response{
Response: resp, Response: resp,
Body: body, Body: body,
Meta: req.Meta, Meta: req.Meta,
@ -270,7 +270,7 @@ func (g *Geziyor) doRequestClient(req *http.Request) (*http.Response, error) {
return &response, nil return &response, nil
} }
func (g *Geziyor) doRequestChrome(req *http.Request) (*http.Response, error) { func (g *Geziyor) doRequestChrome(req *client.Request) (*client.Response, error) {
var body string var body string
var reqID network.RequestID var reqID network.RequestID
var res *network.Response var res *network.Response
@ -280,7 +280,7 @@ func (g *Geziyor) doRequestChrome(req *http.Request) (*http.Response, error) {
if err := chromedp.Run(ctx, if err := chromedp.Run(ctx,
network.Enable(), network.Enable(),
network.SetExtraHTTPHeaders(network.Headers(http.ConvertHeaderToMap(req.Header))), network.SetExtraHTTPHeaders(network.Headers(client.ConvertHeaderToMap(req.Header))),
chromedp.ActionFunc(func(ctx context.Context) error { chromedp.ActionFunc(func(ctx context.Context) error {
chromedp.ListenTarget(ctx, func(ev interface{}) { chromedp.ListenTarget(ctx, func(ev interface{}) {
switch ev.(type) { switch ev.(type) {
@ -317,11 +317,11 @@ func (g *Geziyor) doRequestChrome(req *http.Request) (*http.Response, error) {
// Set new URL in case of redirection // Set new URL in case of redirection
req.URL, _ = url.Parse(res.URL) req.URL, _ = url.Parse(res.URL)
response := http.Response{ response := client.Response{
Response: &stdhttp.Response{ Response: &http.Response{
Request: req.Request, Request: req.Request,
StatusCode: int(res.Status), StatusCode: int(res.Status),
Header: http.ConvertMapToHeader(res.Headers), Header: client.ConvertMapToHeader(res.Headers),
}, },
Body: []byte(body), Body: []byte(body),
Meta: req.Meta, Meta: req.Meta,
@ -331,7 +331,7 @@ func (g *Geziyor) doRequestChrome(req *http.Request) (*http.Response, error) {
return &response, nil return &response, nil
} }
func (g *Geziyor) acquireSem(req *http.Request) { func (g *Geziyor) acquireSem(req *client.Request) {
if g.Opt.ConcurrentRequests != 0 { if g.Opt.ConcurrentRequests != 0 {
g.semGlobal <- struct{}{} g.semGlobal <- struct{}{}
} }
@ -349,7 +349,7 @@ func (g *Geziyor) acquireSem(req *http.Request) {
} }
} }
func (g *Geziyor) releaseSem(req *http.Request) { func (g *Geziyor) releaseSem(req *client.Request) {
if g.Opt.ConcurrentRequests != 0 { if g.Opt.ConcurrentRequests != 0 {
<-g.semGlobal <-g.semGlobal
} }

View File

@ -6,11 +6,11 @@ import (
"github.com/fortytw2/leaktest" "github.com/fortytw2/leaktest"
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor" "github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/exporter" "github.com/geziyor/geziyor/exporter"
"github.com/geziyor/geziyor/extractor" "github.com/geziyor/geziyor/extractor"
"github.com/geziyor/geziyor/http"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
httpstd "net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"testing" "testing"
"unicode/utf8" "unicode/utf8"
@ -20,7 +20,7 @@ func TestSimple(t *testing.T) {
defer leaktest.Check(t)() defer leaktest.Check(t)()
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://api.ipify.org"}, StartURLs: []string{"http://api.ipify.org"},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
}, },
}).Start() }).Start()
@ -31,7 +31,7 @@ func TestSimpleCache(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://api.ipify.org"}, StartURLs: []string{"http://api.ipify.org"},
Cache: httpcache.NewMemoryCache(), Cache: httpcache.NewMemoryCache(),
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
g.Exports <- string(r.Body) g.Exports <- string(r.Body)
g.Get("http://api.ipify.org", nil) g.Get("http://api.ipify.org", nil)
@ -48,7 +48,7 @@ func TestQuotes(t *testing.T) {
}).Start() }).Start()
} }
func quotesParse(g *geziyor.Geziyor, r *http.Response) { func quotesParse(g *geziyor.Geziyor, r *client.Response) {
r.HTMLDoc.Find("div.quote").Each(func(i int, s *goquery.Selection) { r.HTMLDoc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
// Export Data // Export Data
g.Exports <- map[string]interface{}{ g.Exports <- map[string]interface{}{
@ -73,7 +73,7 @@ func TestAllLinks(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
AllowedDomains: []string{"books.toscrape.com"}, AllowedDomains: []string{"books.toscrape.com"},
StartURLs: []string{"http://books.toscrape.com/"}, StartURLs: []string{"http://books.toscrape.com/"},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
g.Exports <- []string{r.Request.URL.String()} g.Exports <- []string{r.Request.URL.String()}
r.HTMLDoc.Find("a").Each(func(i int, s *goquery.Selection) { r.HTMLDoc.Find("a").Each(func(i int, s *goquery.Selection) {
if href, ok := s.Attr("href"); ok { if href, ok := s.Attr("href"); ok {
@ -91,7 +91,7 @@ func TestStartRequestsFunc(t *testing.T) {
StartRequestsFunc: func(g *geziyor.Geziyor) { StartRequestsFunc: func(g *geziyor.Geziyor) {
g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc) g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc)
}, },
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
r.HTMLDoc.Find("a").Each(func(_ int, s *goquery.Selection) { r.HTMLDoc.Find("a").Each(func(_ int, s *goquery.Selection) {
g.Exports <- s.AttrOr("href", "") g.Exports <- s.AttrOr("href", "")
}) })
@ -105,7 +105,7 @@ func TestGetRendered(t *testing.T) {
StartRequestsFunc: func(g *geziyor.Geziyor) { StartRequestsFunc: func(g *geziyor.Geziyor) {
g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc) g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc)
}, },
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
fmt.Println(r.Header) fmt.Println(r.Header)
}, },
@ -118,7 +118,7 @@ func TestHEADRequest(t *testing.T) {
StartRequestsFunc: func(g *geziyor.Geziyor) { StartRequestsFunc: func(g *geziyor.Geziyor) {
g.Head("https://httpbin.org/anything", g.Opt.ParseFunc) g.Head("https://httpbin.org/anything", g.Opt.ParseFunc)
}, },
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
}, },
}).Start() }).Start()
@ -127,7 +127,7 @@ func TestHEADRequest(t *testing.T) {
func TestCookies(t *testing.T) { func TestCookies(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/login"}, StartURLs: []string{"http://quotes.toscrape.com/login"},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
if len(g.Client.Cookies(r.Request.URL.String())) == 0 { if len(g.Client.Cookies(r.Request.URL.String())) == 0 {
t.Fatal("Cookies is Empty") t.Fatal("Cookies is Empty")
} }
@ -136,7 +136,7 @@ func TestCookies(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/login"}, StartURLs: []string{"http://quotes.toscrape.com/login"},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
if len(g.Client.Cookies(r.Request.URL.String())) != 0 { if len(g.Client.Cookies(r.Request.URL.String())) != 0 {
t.Fatal("Cookies exist") t.Fatal("Cookies exist")
} }
@ -148,7 +148,7 @@ func TestCookies(t *testing.T) {
func TestBasicAuth(t *testing.T) { func TestBasicAuth(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) { StartRequestsFunc: func(g *geziyor.Geziyor) {
req, _ := http.NewRequest("GET", "https://httpbin.org/anything", nil) req, _ := client.NewRequest("GET", "https://httpbin.org/anything", nil)
req.SetBasicAuth("username", "password") req.SetBasicAuth("username", "password")
g.Do(req, nil) g.Do(req, nil)
}, },
@ -170,14 +170,14 @@ func TestExtractor(t *testing.T) {
} }
func TestCharsetDetection(t *testing.T) { func TestCharsetDetection(t *testing.T) {
ts := httptest.NewServer(httpstd.HandlerFunc(func(w httpstd.ResponseWriter, r *httpstd.Request) { ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, "\xf0ültekin") fmt.Fprint(w, "\xf0ültekin")
})) }))
defer ts.Close() defer ts.Close()
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{ts.URL}, StartURLs: []string{ts.URL},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
if !utf8.Valid(r.Body) { if !utf8.Valid(r.Body) {
t.Fatal() t.Fatal()
} }
@ -187,7 +187,7 @@ func TestCharsetDetection(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{ts.URL}, StartURLs: []string{ts.URL},
ParseFunc: func(g *geziyor.Geziyor, r *http.Response) { ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
if utf8.Valid(r.Body) { if utf8.Valid(r.Body) {
t.Fatal() t.Fatal()
} }
@ -200,10 +200,10 @@ func TestCharsetDetection(t *testing.T) {
func BenchmarkGeziyor_Do(b *testing.B) { func BenchmarkGeziyor_Do(b *testing.B) {
// Create Server // Create Server
ts := httptest.NewServer(httpstd.HandlerFunc(func(w httpstd.ResponseWriter, r *httpstd.Request) { ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, "Hello, client") fmt.Fprint(w, "Hello, client")
})) }))
ts.Client().Transport = http.NewClient().Transport ts.Client().Transport = client.NewClient().Transport
defer ts.Close() defer ts.Close()
// As we don't benchmark creating a server, reset timer. // As we don't benchmark creating a server, reset timer.
@ -212,7 +212,7 @@ func BenchmarkGeziyor_Do(b *testing.B) {
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) { StartRequestsFunc: func(g *geziyor.Geziyor) {
// Create Synchronized request to benchmark requests accurately. // Create Synchronized request to benchmark requests accurately.
req, _ := http.NewRequest("GET", ts.URL, nil) req, _ := client.NewRequest("GET", ts.URL, nil)
req.Synchronized = true req.Synchronized = true
// We only bench here ! // We only bench here !

View File

@ -4,7 +4,7 @@ import (
"bytes" "bytes"
"fmt" "fmt"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/http" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal" "github.com/geziyor/geziyor/internal"
"log" "log"
"math/rand" "math/rand"
@ -16,10 +16,10 @@ import (
// RequestMiddleware called before requests made. // RequestMiddleware called before requests made.
// Set request.Cancelled = true to cancel request // Set request.Cancelled = true to cancel request
type RequestMiddleware func(g *Geziyor, r *http.Request) type RequestMiddleware func(g *Geziyor, r *client.Request)
// ResponseMiddleware called after request response receive // ResponseMiddleware called after request response receive
type ResponseMiddleware func(g *Geziyor, r *http.Response) type ResponseMiddleware func(g *Geziyor, r *client.Response)
func init() { func init() {
log.SetOutput(os.Stdout) log.SetOutput(os.Stdout)
@ -28,7 +28,7 @@ func init() {
// recoverMiddleware recovers scraping being crashed. // recoverMiddleware recovers scraping being crashed.
// Logs error and stack trace // Logs error and stack trace
func recoverMiddleware(g *Geziyor, r *http.Request) { func recoverMiddleware(g *Geziyor, r *client.Request) {
if r := recover(); r != nil { if r := recover(); r != nil {
log.Println(r, string(debug.Stack())) log.Println(r, string(debug.Stack()))
g.metrics.PanicCounter.Add(1) g.metrics.PanicCounter.Add(1)
@ -36,7 +36,7 @@ func recoverMiddleware(g *Geziyor, r *http.Request) {
} }
// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains // allowedDomainsMiddleware checks for request host if it exists in AllowedDomains
func allowedDomainsMiddleware(g *Geziyor, r *http.Request) { func allowedDomainsMiddleware(g *Geziyor, r *client.Request) {
if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) { if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) {
//log.Printf("Domain not allowed: %s\n", req.Host) //log.Printf("Domain not allowed: %s\n", req.Host)
r.Cancel() r.Cancel()
@ -45,7 +45,7 @@ func allowedDomainsMiddleware(g *Geziyor, r *http.Request) {
} }
// duplicateRequestsMiddleware checks for already visited URLs // duplicateRequestsMiddleware checks for already visited URLs
func duplicateRequestsMiddleware(g *Geziyor, r *http.Request) { func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) {
if !g.Opt.URLRevisitEnabled { if !g.Opt.URLRevisitEnabled {
key := r.Request.URL.String() + r.Request.Method key := r.Request.URL.String() + r.Request.Method
if _, visited := g.visitedURLs.LoadOrStore(key, struct{}{}); visited { if _, visited := g.visitedURLs.LoadOrStore(key, struct{}{}); visited {
@ -56,15 +56,15 @@ func duplicateRequestsMiddleware(g *Geziyor, r *http.Request) {
} }
// defaultHeadersMiddleware sets default request headers // defaultHeadersMiddleware sets default request headers
func defaultHeadersMiddleware(g *Geziyor, r *http.Request) { func defaultHeadersMiddleware(g *Geziyor, r *client.Request) {
r.Header = http.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
r.Header = http.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8") r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
r.Header = http.SetDefaultHeader(r.Header, "Accept-Language", "en") r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en")
r.Header = http.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent) r.Header = client.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
} }
// delayMiddleware delays requests // delayMiddleware delays requests
func delayMiddleware(g *Geziyor, r *http.Request) { func delayMiddleware(g *Geziyor, r *client.Request) {
if g.Opt.RequestDelayRandomize { if g.Opt.RequestDelayRandomize {
min := float64(g.Opt.RequestDelay) * 0.5 min := float64(g.Opt.RequestDelay) * 0.5
max := float64(g.Opt.RequestDelay) * 1.5 max := float64(g.Opt.RequestDelay) * 1.5
@ -75,29 +75,29 @@ func delayMiddleware(g *Geziyor, r *http.Request) {
} }
// logMiddleware logs requests // logMiddleware logs requests
func logMiddleware(g *Geziyor, r *http.Request) { func logMiddleware(g *Geziyor, r *client.Request) {
log.Println("Fetching: ", r.URL.String()) log.Println("Fetching: ", r.URL.String())
} }
// metricsRequestMiddleware sets stats // metricsRequestMiddleware sets stats
func metricsRequestMiddleware(g *Geziyor, r *http.Request) { func metricsRequestMiddleware(g *Geziyor, r *client.Request) {
g.metrics.RequestCounter.With("method", r.Method).Add(1) g.metrics.RequestCounter.With("method", r.Method).Add(1)
} }
// parseHTMLMiddleware parses response if response is HTML // parseHTMLMiddleware parses response if response is HTML
func parseHTMLMiddleware(g *Geziyor, r *http.Response) { func parseHTMLMiddleware(g *Geziyor, r *client.Response) {
if !g.Opt.ParseHTMLDisabled && r.IsHTML() { if !g.Opt.ParseHTMLDisabled && r.IsHTML() {
r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
} }
} }
// metricsResponseMiddleware sets stats // metricsResponseMiddleware sets stats
func metricsResponseMiddleware(g *Geziyor, r *http.Response) { func metricsResponseMiddleware(g *Geziyor, r *client.Response) {
g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1) g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1)
} }
// extractorsMiddleware extracts data from loaders conf and exports it to exporters // extractorsMiddleware extracts data from loaders conf and exports it to exporters
func extractorsMiddleware(g *Geziyor, r *http.Response) { func extractorsMiddleware(g *Geziyor, r *client.Response) {
// Check if we have extractors and exporters // Check if we have extractors and exporters
if len(g.Opt.Extractors) != 0 && len(g.Opt.Exporters) != 0 { if len(g.Opt.Extractors) != 0 && len(g.Opt.Exporters) != 0 {

View File

@ -2,7 +2,7 @@ package geziyor
import ( import (
"github.com/fpfeng/httpcache" "github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/http" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
"time" "time"
) )
@ -20,7 +20,7 @@ type Options struct {
StartRequestsFunc func(g *Geziyor) StartRequestsFunc func(g *Geziyor)
// ParseFunc is callback of StartURLs response. // ParseFunc is callback of StartURLs response.
ParseFunc func(g *Geziyor, r *http.Response) ParseFunc func(g *Geziyor, r *client.Response)
// Extractors extracts items from pages // Extractors extracts items from pages
Extractors []Extractor Extractors []Extractor