The HTTP client can now be changed. Docs updated.

Musab Gültekin 2019-06-22 13:12:05 +03:00
parent 7bc782400c
commit a64a262554
6 changed files with 45 additions and 26 deletions
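
The headline change: the HTTP client moves out of the unexported internal package into the public github.com/geziyor/geziyor/http package, so the Client field on Geziyor can now be replaced or tuned by callers. A minimal sketch of what that enables; the Timeout access is an assumption that http.Client embeds the standard *net/http.Client, which this diff does not show:

```go
package main

import (
	"time"

	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/http"
)

func main() {
	g := geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: []string{"http://quotes.toscrape.com/"},
		ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {},
	})

	// Swap in a fresh client before starting. Tuning Timeout directly
	// assumes http.Client embeds *net/http.Client (not shown in this diff).
	g.Client = http.NewClient()
	g.Client.Timeout = 60 * time.Second

	g.Start()
}
```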


@@ -58,7 +58,6 @@ func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) {
See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for more usage examples.
## Documentation
### Installation
@@ -101,7 +100,25 @@ geziyor.NewGeziyor(&geziyor.Options{
}).Start()
```
+### Exporting Data
+You can export data automatically using exporters. Just send data to the `Geziyor.Exports` channel.
+[Available exporters](https://godoc.org/github.com/geziyor/geziyor/exporter)
+```go
+geziyor.NewGeziyor(&geziyor.Options{
+    StartURLs: []string{"http://quotes.toscrape.com/"},
+    ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
+        r.HTMLDoc.Find("div.quote").Each(func(_ int, s *goquery.Selection) {
+            g.Exports <- map[string]interface{}{
+                "text":   s.Find("span.text").Text(),
+                "author": s.Find("small.author").Text(),
+            }
+        })
+    },
+    Exporters: []geziyor.Exporter{&exporter.JSONExporter{}},
+}).Start()
+```
## Roadmap
@@ -112,6 +129,6 @@ If you're interested in helping this project, please consider these features:
- Deploying Scrapers to Cloud
- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~
- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html))
-- Realtime metrics (Prometheus etc.)
+- ~~Realtime metrics (Prometheus etc.)~~
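
The Exporting Data section added above wires in the built-in JSON exporter; since the Exporters option takes the Exporter interface (declared in geziyor.go below), custom sinks can be plugged in too. A hypothetical sketch, assuming the interface's single method is Export(exports chan interface{}) — its body is not shown in this diff:

```go
// Package exporters sketches a custom geziyor exporter.
package exporters

import "fmt"

// StdoutExporter prints every exported item. The method signature is an
// assumption based on how Geziyor.Exports is used; the Exporter interface
// body does not appear in this diff.
type StdoutExporter struct{}

// Export drains the exports channel until the scraper closes it.
func (e *StdoutExporter) Export(exports chan interface{}) {
	for item := range exports {
		fmt.Println(item)
	}
}
```

If the assumed signature matches, it would be wired in as Exporters: []geziyor.Exporter{&StdoutExporter{}}, next to the built-in JSON and CSV exporters used elsewhere in this commit.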


@@ -6,7 +6,7 @@ import (
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
"github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor/internal"
"github.com/geziyor/geziyor/http"
"github.com/geziyor/geziyor/metrics"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus/promhttp"
@@ -14,7 +14,7 @@ import (
"io"
"io/ioutil"
"log"
"net/http"
stdhttp "net/http"
"net/http/cookiejar"
"net/url"
"sync"
@@ -28,7 +28,7 @@ type Exporter interface {
// Geziyor is our main scraper type
type Geziyor struct {
Opt *Options
-Client *internal.Client
+Client *http.Client
Exports chan interface{}
metrics *metrics.Metrics
@@ -47,7 +47,7 @@ type Geziyor struct {
// If options are provided, they are used.
func NewGeziyor(opt *Options) *Geziyor {
geziyor := &Geziyor{
-Client: internal.NewClient(),
+Client: http.NewClient(),
Opt: opt,
Exports: make(chan interface{}),
requestMiddlewares: []RequestMiddleware{
@@ -104,10 +104,10 @@ func (g *Geziyor) Start() {
log.Println("Scraping Started")
// Metrics
-metricsServer := &http.Server{Addr: ":2112"}
+metricsServer := &stdhttp.Server{Addr: ":2112"}
if g.Opt.MetricsType == metrics.Prometheus {
go func() {
-http.Handle("/metrics", promhttp.Handler())
+stdhttp.Handle("/metrics", promhttp.Handler())
metricsServer.ListenAndServe()
}()
}
@@ -141,7 +141,7 @@ func (g *Geziyor) Start() {
// Get issues a GET to the specified URL.
func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *Response)) {
req, err := http.NewRequest("GET", url, nil)
req, err := stdhttp.NewRequest("GET", url, nil)
if err != nil {
log.Printf("Request creating error %v\n", err)
return
@@ -153,7 +153,7 @@ func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *Response)) {
// Opens a new Chrome instance, makes the request, waits 1 second for the HTML DOM to render, and then closes the instance.
// Rendered requests are only supported for GET requests.
func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *Response)) {
req, err := http.NewRequest("GET", url, nil)
req, err := stdhttp.NewRequest("GET", url, nil)
if err != nil {
log.Printf("Request creating error %v\n", err)
return
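
GetRendered, per the comment above, drives a headless Chrome via chromedp (see doRequestChrome below). A usage sketch; the JavaScript-rendered quotes page is an assumed target, and Chrome must be installed locally:

```go
package main

import (
	"log"

	"github.com/geziyor/geziyor"
)

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartRequestsFunc: func(g *geziyor.Geziyor) {
			// Fetch through Chrome so JavaScript-rendered markup
			// ends up in r.HTMLDoc.
			g.GetRendered("http://quotes.toscrape.com/js/", g.Opt.ParseFunc)
		},
		ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
			log.Println(r.HTMLDoc.Find("span.text").First().Text())
		},
	}).Start()
}
```
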
@@ -163,7 +163,7 @@ func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *Response)
// Head issues a HEAD to the specified URL
func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *Response)) {
req, err := http.NewRequest("HEAD", url, nil)
req, err := stdhttp.NewRequest("HEAD", url, nil)
if err != nil {
log.Printf("Request creating error %v\n", err)
return
@@ -265,7 +265,7 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
if err := chromedp.Run(ctx,
network.Enable(),
-network.SetExtraHTTPHeaders(network.Headers(internal.ConvertHeaderToMap(req.Header))),
+network.SetExtraHTTPHeaders(network.Headers(http.ConvertHeaderToMap(req.Header))),
chromedp.ActionFunc(func(ctx context.Context) error {
chromedp.ListenTarget(ctx, func(ev interface{}) {
switch ev.(type) {
@@ -299,10 +299,10 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
req.URL, _ = url.Parse(res.URL)
response := Response{
-Response: &http.Response{
+Response: &stdhttp.Response{
Request: req.Request,
StatusCode: int(res.Status),
-Header: internal.ConvertMapToHeader(res.Headers),
+Header: http.ConvertMapToHeader(res.Headers),
},
Body: []byte(body),
Meta: req.Meta,
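
The stdhttp alias running through this file is the standard Go fix for a name collision: the project now ships its own package called http, so the standard library can no longer be imported under its default name. The pattern in isolation, with a hypothetical helper just to show the two packages coexisting:

```go
package geziyor

import (
	stdhttp "net/http"

	"github.com/geziyor/geziyor/http"
)

// newGetRequest is a hypothetical helper: the request type comes from the
// aliased standard library, the client from geziyor's own http package.
func newGetRequest(url string) (*stdhttp.Request, *http.Client, error) {
	req, err := stdhttp.NewRequest("GET", url, nil)
	return req, http.NewClient(), err
}
```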


@@ -46,7 +46,7 @@ func TestQuotes(t *testing.T) {
}
func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) {
r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) {
r.HTMLDoc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
// Export Data
g.Exports <- map[string]interface{}{
"number": i,
@@ -59,7 +59,7 @@ func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) {
})
// Next Page
-if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok {
+if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok {
g.Get(r.JoinURL(href), quotesParse)
}
}
@@ -72,13 +72,14 @@ func TestAllLinks(t *testing.T) {
StartURLs: []string{"http://books.toscrape.com/"},
ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
g.Exports <- []string{r.Request.URL.String()}
r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) {
r.HTMLDoc.Find("a").Each(func(i int, s *goquery.Selection) {
if href, ok := s.Attr("href"); ok {
g.Get(r.JoinURL(href), g.Opt.ParseFunc)
}
})
},
Exporters: []geziyor.Exporter{&exporter.CSVExporter{}},
+MetricsType: metrics.Prometheus,
}).Start()
}
@@ -97,7 +98,7 @@ func TestStartRequestsFunc(t *testing.T) {
g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc)
},
ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {
r.DocHTML.Find("a").Each(func(_ int, s *goquery.Selection) {
r.HTMLDoc.Find("a").Each(func(_ int, s *goquery.Selection) {
g.Exports <- s.AttrOr("href", "")
})
},
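
TestAllLinks now opts in to Prometheus metrics. Combined with the geziyor.go change above, setting MetricsType starts a metrics server on :2112 serving /metrics while the scraper runs. Enabling it in your own scraper, using only options shown in this diff:

```go
package main

import (
	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/metrics"
)

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: []string{"http://quotes.toscrape.com/"},
		ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) {},
		// Serves Prometheus metrics on :2112/metrics (see the
		// metricsServer setup in geziyor.go above).
		MetricsType: metrics.Prometheus,
	}).Start()
}
```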


@@ -1,4 +1,4 @@
-package internal
+package http
import (
"errors"


@@ -3,6 +3,7 @@ package geziyor
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/http"
"github.com/geziyor/geziyor/internal"
"log"
"math/rand"
@@ -53,10 +54,10 @@ func duplicateRequestsMiddleware(g *Geziyor, r *Request) {
// defaultHeadersMiddleware sets default request headers
func defaultHeadersMiddleware(g *Geziyor, r *Request) {
r.Header = internal.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
r.Header = internal.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en")
r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
r.Header = http.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
r.Header = http.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8")
r.Header = http.SetDefaultHeader(r.Header, "Accept-Language", "en")
r.Header = http.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent)
}
// delayMiddleware delays requests
@@ -83,7 +84,7 @@ func metricsRequestMiddleware(g *Geziyor, r *Request) {
// parseHTMLMiddleware parses response if response is HTML
func parseHTMLMiddleware(g *Geziyor, r *Response) {
if !g.Opt.ParseHTMLDisabled && r.isHTML() {
-r.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
+r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
}
}
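
defaultHeadersMiddleware now reaches SetDefaultHeader through the http package, where it moved along with the client. Its implementation is not part of this diff; judging from the call sites, a plausible set-if-absent shape would be:

```go
package http

import stdhttp "net/http"

// SetDefaultHeader is a sketch of the assumed behaviour: set key to value
// only when the header is not already present on the request.
func SetDefaultHeader(header stdhttp.Header, key string, value string) stdhttp.Header {
	if header.Get(key) == "" {
		header.Set(key, value)
	}
	return header
}
```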


@@ -12,7 +12,7 @@ import (
type Response struct {
*http.Response
Body []byte
-DocHTML *goquery.Document
+HTMLDoc *goquery.Document
Meta map[string]interface{}
Request *Request
}
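
The DocHTML-to-HTMLDoc rename is mechanical but touches every parse function, as the hunks above show. After this commit a parse callback reads the parsed document from the new field name; a small example grounded in this diff, reusing the JSON exporter from the README:

```go
package main

import (
	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/exporter"
)

// parseTitle reads the parsed document from the renamed field. HTMLDoc is
// only set when parseHTMLMiddleware sees an HTML response and parsing is
// not disabled, so guard against nil before using it.
func parseTitle(g *geziyor.Geziyor, r *geziyor.Response) {
	if r.HTMLDoc == nil {
		return
	}
	g.Exports <- map[string]interface{}{"title": r.HTMLDoc.Find("title").Text()}
}

func main() {
	geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: []string{"http://quotes.toscrape.com/"},
		ParseFunc: parseTitle,
		Exporters: []geziyor.Exporter{&exporter.JSONExporter{}},
	}).Start()
}
```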