diff --git a/.travis.yml b/.travis.yml index 07b80a8..31eb28d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,6 @@ language: go go: - - 1.5.x - - 1.6.x - - 1.7.x - - 1.8.x - - 1.9.x - 1.10.x - 1.11.x - tip diff --git a/README.md b/README.md index 96aec71..8043905 100644 --- a/README.md +++ b/README.md @@ -19,13 +19,13 @@ See scraper [Options](https://godoc.org/github.com/geziyor/geziyor#Options) for ## Status Since the project is in **development phase**, **API may change in time**. Thus, we highly recommend you to use Geziyor with go modules. -## Usage +## Examples Simple usage ```go geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://api.ipify.org"}, - ParseFunc: func(r *geziyor.Response) { + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { fmt.Println(string(r.Body)) }, }).Start() @@ -42,21 +42,76 @@ func main() { }).Start() } -func quotesParse(r *geziyor.Response) { +func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) { r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) { - r.Geziyor.Exports <- map[string]interface{}{ + g.Exports <- map[string]interface{}{ "text": s.Find("span.text").Text(), "author": s.Find("small.author").Text(), } }) if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { - r.Geziyor.Get(r.JoinURL(href), quotesParse) + g.Get(r.JoinURL(href), quotesParse) } } ``` See [tests](https://github.com/geziyor/geziyor/blob/master/geziyor_test.go) for more usage examples. -## Installation + +## Documentation + +### Installation go get github.com/geziyor/geziyor + +### Making Requests + +Initial requests start with the ```StartURLs []string``` field in ```Options```. +Geziyor makes concurrent requests to those URLs. +After reading the response, ```ParseFunc func(g *Geziyor, r *Response)``` is called. 
+ +```go +geziyor.NewGeziyor(geziyor.Options{ + StartURLs: []string{"http://api.ipify.org"}, + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { + fmt.Println(string(r.Body)) + }, +}).Start() +``` + +If you want to create the first requests manually, set ```StartRequestsFunc```. +```StartURLs``` won't be used if you create requests manually. +You can make the following requests using ```Geziyor``` methods: +- ```Get```: Make a GET request +- ```GetRendered```: Make a GET request and render JavaScript using a headless browser. +As it opens up a real browser, it takes a couple of seconds to make requests. +- ```Head```: Make a HEAD request +- ```Do```: Make a custom request by providing a *geziyor.Request + + +```go +geziyor.NewGeziyor(geziyor.Options{ + StartRequestsFunc: func(g *geziyor.Geziyor) { + g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc) + g.Head("https://httpbin.org/anything", g.Opt.ParseFunc) + }, + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { + fmt.Println(string(r.Body)) + }, +}).Start() +``` + + + +## Roadmap + +If you're interested in helping this project, please consider these features: + +- Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html)) +- Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape)) +- Deploying Scrapers to Cloud +- ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~ +- Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html)) +- Realtime metrics (Prometheus etc.) 
+ + \ No newline at end of file diff --git a/exporter/csv.go b/exporter/csv.go index fb4b439..3b37530 100644 --- a/exporter/csv.go +++ b/exporter/csv.go @@ -3,37 +3,28 @@ package exporter import ( "encoding/csv" "fmt" + "github.com/geziyor/geziyor/internal" "log" "os" "reflect" - "sync" ) // CSVExporter exports response data as CSV streaming file type CSVExporter struct { FileName string - - once sync.Once - writer *csv.Writer } // Export exports response data as CSV streaming file func (e *CSVExporter) Export(exports chan interface{}) { - // Default filename - if e.FileName == "" { - e.FileName = "out.csv" + // Create file + newFile, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + if err != nil { + log.Printf("output file creation error: %v", err) + return } - // Create file - e.once.Do(func() { - newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) - if err != nil { - fmt.Fprintf(os.Stderr, "output file creation error: %v", err) - return - } - e.writer = csv.NewWriter(newFile) - }) + writer := csv.NewWriter(newFile) // Export data as responses came for res := range exports { @@ -55,10 +46,9 @@ func (e *CSVExporter) Export(exports chan interface{}) { // } } - // Write to file - if err := e.writer.Write(values); err != nil { + if err := writer.Write(values); err != nil { log.Printf("CSV writing error on exporter: %v\n", err) } } - e.writer.Flush() + writer.Flush() } diff --git a/exporter/json.go b/exporter/json.go index ff41eca..03c4352 100644 --- a/exporter/json.go +++ b/exporter/json.go @@ -2,43 +2,33 @@ package exporter import ( "encoding/json" - "fmt" + "github.com/geziyor/geziyor/internal" "log" "os" - "sync" ) // JSONExporter exports response data as JSON streaming file type JSONExporter struct { FileName string EscapeHTML bool - - once sync.Once - encoder *json.Encoder } // Export exports response data as JSON streaming file func (e *JSONExporter) Export(exports chan 
interface{}) { - // Default filename - if e.FileName == "" { - e.FileName = "out.json" + // Create file + newFile, err := os.OpenFile(internal.PreferFirst(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + if err != nil { + log.Printf("output file creation error: %v", err) + return } - // Create file - e.once.Do(func() { - newFile, err := os.OpenFile(e.FileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) - if err != nil { - fmt.Fprintf(os.Stderr, "output file creation error: %v", err) - return - } - e.encoder = json.NewEncoder(newFile) - e.encoder.SetEscapeHTML(e.EscapeHTML) - }) + encoder := json.NewEncoder(newFile) + encoder.SetEscapeHTML(e.EscapeHTML) // Export data as responses came for res := range exports { - if err := e.encoder.Encode(res); err != nil { + if err := encoder.Encode(res); err != nil { log.Printf("JSON encoding error on exporter: %v\n", err) } } diff --git a/geziyor.go b/geziyor.go index 795fbdf..e44140c 100644 --- a/geziyor.go +++ b/geziyor.go @@ -7,12 +7,12 @@ import ( "github.com/chromedp/cdproto/dom" "github.com/chromedp/chromedp" "github.com/fpfeng/httpcache" + "github.com/geziyor/geziyor/internal" "golang.org/x/net/html/charset" "io" "io/ioutil" "log" "math/rand" - "net" "net/http" "os" "runtime/debug" @@ -25,10 +25,6 @@ type Exporter interface { Export(exports chan interface{}) } -// RequestMiddleware called before requests made. 
-// Set request.Cancelled = true to cancel request -type RequestMiddleware func(g *Geziyor, r *Request) - // Geziyor is our main scraper type type Geziyor struct { Opt Options @@ -54,22 +50,7 @@ func init() { // If options provided, options func NewGeziyor(opt Options) *Geziyor { geziyor := &Geziyor{ - client: &http.Client{ - Transport: &http.Transport{ - Proxy: http.ProxyFromEnvironment, - DialContext: (&net.Dialer{ - Timeout: 30 * time.Second, - KeepAlive: 30 * time.Second, - DualStack: true, - }).DialContext, - MaxIdleConns: 0, // Default: 100 - MaxIdleConnsPerHost: 1000, // Default: 2 - IdleConnTimeout: 90 * time.Second, - TLSHandshakeTimeout: 10 * time.Second, - ExpectContinueTimeout: 1 * time.Second, - }, - Timeout: time.Second * 180, // Google's timeout - }, + client: internal.NewClient(), Opt: opt, Exports: make(chan interface{}), requestMiddlewares: []RequestMiddleware{ @@ -79,6 +60,12 @@ func NewGeziyor(opt Options) *Geziyor { }, } + if opt.UserAgent == "" { + geziyor.Opt.UserAgent = "Geziyor 1.0" + } + if opt.MaxBodySize == 0 { + geziyor.Opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB + } if opt.Cache != nil { geziyor.client.Transport = &httpcache.Transport{ Transport: geziyor.client.Transport, Cache: opt.Cache, MarkCachedResponses: true} @@ -95,15 +82,9 @@ func NewGeziyor(opt Options) *Geziyor { hostSems map[string]chan struct{} }{hostSems: make(map[string]chan struct{})} } - if opt.UserAgent == "" { - geziyor.Opt.UserAgent = "Geziyor 1.0" - } if opt.LogDisabled { log.SetOutput(ioutil.Discard) } - if opt.MaxBodySize == 0 { - geziyor.Opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB - } geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...) 
return geziyor @@ -113,6 +94,7 @@ func NewGeziyor(opt Options) *Geziyor { func (g *Geziyor) Start() { log.Println("Scraping Started") + // Start Exporters if len(g.Opt.Exporters) != 0 { for _, exp := range g.Opt.Exporters { go exp.Export(g.Exports) @@ -124,6 +106,7 @@ func (g *Geziyor) Start() { }() } + // Start Requests if g.Opt.StartRequestsFunc == nil { for _, startURL := range g.Opt.StartURLs { g.Get(startURL, g.Opt.ParseFunc) @@ -138,7 +121,7 @@ func (g *Geziyor) Start() { } // Get issues a GET to the specified URL. -func (g *Geziyor) Get(url string, callback func(resp *Response)) { +func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *Response)) { req, err := http.NewRequest("GET", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) @@ -150,7 +133,7 @@ func (g *Geziyor) Get(url string, callback func(resp *Response)) { // GetRendered issues GET request using headless browser // Opens up a new Chrome instance, makes request, waits for 1 second to render HTML DOM and closed. // Rendered requests only supported for GET requests. 
-func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) { +func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *Response)) { req, err := http.NewRequest("GET", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) @@ -160,7 +143,7 @@ func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) { } // Head issues a HEAD to the specified URL -func (g *Geziyor) Head(url string, callback func(resp *Response)) { +func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *Response)) { req, err := http.NewRequest("HEAD", url, nil) if err != nil { log.Printf("Request creating error %v\n", err) @@ -170,13 +153,13 @@ func (g *Geziyor) Head(url string, callback func(resp *Response)) { } // Do sends an HTTP request -func (g *Geziyor) Do(req *Request, callback func(resp *Response)) { +func (g *Geziyor) Do(req *Request, callback func(g *Geziyor, r *Response)) { g.wg.Add(1) go g.do(req, callback) } // Do sends an HTTP request -func (g *Geziyor) do(req *Request, callback func(resp *Response)) { +func (g *Geziyor) do(req *Request, callback func(g *Geziyor, r *Response)) { defer g.wg.Done() defer func() { if r := recover(); r != nil { @@ -209,10 +192,10 @@ func (g *Geziyor) do(req *Request, callback func(resp *Response)) { // Callbacks if callback != nil { - callback(response) + callback(g, response) } else { if g.Opt.ParseFunc != nil { - g.Opt.ParseFunc(response) + g.Opt.ParseFunc(g, response) } } } @@ -239,7 +222,7 @@ func (g *Geziyor) doRequestClient(req *Request) (*Response, error) { bodyReader := io.LimitReader(resp.Body, g.Opt.MaxBodySize) // Start reading body and determine encoding - if !g.Opt.CharsetDetectDisabled { + if !g.Opt.CharsetDetectDisabled && resp.Request.Method != "HEAD" { bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type")) if err != nil { log.Printf("Determine encoding error: %v\n", err) @@ -257,7 +240,6 @@ func (g *Geziyor) doRequestClient(req *Request) 
(*Response, error) { Response: resp, Body: body, Meta: req.Meta, - Geziyor: g, } return &response, nil @@ -290,14 +272,13 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) { return nil, err } - response := &Response{ + response := Response{ //Response: resp, - Body: []byte(res), - Meta: req.Meta, - Geziyor: g, + Body: []byte(res), + Meta: req.Meta, } - return response, nil + return &response, nil } func (g *Geziyor) acquireSem(req *Request) { diff --git a/geziyor_test.go b/geziyor_test.go index 7f27507..7f34032 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -15,7 +15,7 @@ import ( func TestSimple(t *testing.T) { geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://api.ipify.org"}, - ParseFunc: func(r *geziyor.Response) { + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { fmt.Println(string(r.Body)) }, }).Start() @@ -23,16 +23,15 @@ func TestSimple(t *testing.T) { func TestSimpleCache(t *testing.T) { defer leaktest.Check(t)() - gez := geziyor.NewGeziyor(geziyor.Options{ + geziyor.NewGeziyor(geziyor.Options{ StartURLs: []string{"http://api.ipify.org"}, Cache: httpcache.NewMemoryCache(), - ParseFunc: func(r *geziyor.Response) { + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { fmt.Println(string(r.Body)) - r.Geziyor.Exports <- string(r.Body) - r.Geziyor.Get("http://api.ipify.org", nil) + g.Exports <- string(r.Body) + g.Get("http://api.ipify.org", nil) }, - }) - gez.Start() + }).Start() } func TestQuotes(t *testing.T) { @@ -44,10 +43,10 @@ func TestQuotes(t *testing.T) { }).Start() } -func quotesParse(r *geziyor.Response) { +func quotesParse(g *geziyor.Geziyor, r *geziyor.Response) { r.DocHTML.Find("div.quote").Each(func(i int, s *goquery.Selection) { // Export Data - r.Geziyor.Exports <- map[string]interface{}{ + g.Exports <- map[string]interface{}{ "number": i, "text": s.Find("span.text").Text(), "author": s.Find("small.author").Text(), @@ -59,7 +58,7 @@ func quotesParse(r *geziyor.Response) { // Next Page if 
href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok { - r.Geziyor.Get(r.JoinURL(href), quotesParse) + g.Get(r.JoinURL(href), quotesParse) } } @@ -69,11 +68,11 @@ func TestAllLinks(t *testing.T) { geziyor.NewGeziyor(geziyor.Options{ AllowedDomains: []string{"books.toscrape.com"}, StartURLs: []string{"http://books.toscrape.com/"}, - ParseFunc: func(r *geziyor.Response) { - r.Geziyor.Exports <- []string{r.Request.URL.String()} + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { + g.Exports <- []string{r.Request.URL.String()} r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) { if href, ok := s.Attr("href"); ok { - r.Geziyor.Get(r.JoinURL(href), r.Geziyor.Opt.ParseFunc) + g.Get(r.JoinURL(href), g.Opt.ParseFunc) } }) }, @@ -95,9 +94,9 @@ func TestStartRequestsFunc(t *testing.T) { StartRequestsFunc: func(g *geziyor.Geziyor) { g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc) }, - ParseFunc: func(r *geziyor.Response) { + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { r.DocHTML.Find("a").Each(func(_ int, s *goquery.Selection) { - r.Geziyor.Exports <- s.AttrOr("href", "") + g.Exports <- s.AttrOr("href", "") }) }, Exporters: []geziyor.Exporter{&exporter.JSONExporter{}}, @@ -109,9 +108,20 @@ func TestGetRendered(t *testing.T) { StartRequestsFunc: func(g *geziyor.Geziyor) { g.GetRendered("https://httpbin.org/anything", g.Opt.ParseFunc) }, - ParseFunc: func(r *geziyor.Response) { + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { fmt.Println(string(r.Body)) }, //URLRevisitEnabled: true, }).Start() } + +func TestHEADRequest(t *testing.T) { + geziyor.NewGeziyor(geziyor.Options{ + StartRequestsFunc: func(g *geziyor.Geziyor) { + g.Head("https://httpbin.org/anything", g.Opt.ParseFunc) + }, + ParseFunc: func(g *geziyor.Geziyor, r *geziyor.Response) { + fmt.Println(string(r.Body)) + }, + }).Start() +} diff --git a/internal/http.go b/internal/http.go new file mode 100644 index 0000000..67bcc17 --- /dev/null +++ b/internal/http.go @@ 
-0,0 +1,35 @@ +package internal + +import ( + "net" + "net/http" + "time" +) + +// NewClient creates http.Client with modified values for typical web scraper +func NewClient() *http.Client { + return &http.Client{ + Transport: &http.Transport{ + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + DualStack: true, + }).DialContext, + MaxIdleConns: 0, // Default: 100 + MaxIdleConnsPerHost: 1000, // Default: 2 + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + }, + Timeout: time.Second * 180, // Google's timeout + } +} + +// SetDefaultHeader sets header if not exists before +func SetDefaultHeader(header http.Header, key string, value string) http.Header { + if header.Get(key) == "" { + header.Set(key, value) + } + return header +} diff --git a/internal/strings.go b/internal/strings.go new file mode 100644 index 0000000..2734e8e --- /dev/null +++ b/internal/strings.go @@ -0,0 +1,19 @@ +package internal + +// PreferFirst returns first non-empty string +func PreferFirst(first string, second string) string { + if first != "" { + return first + } + return second +} + +// Contains checks whether []string Contains string +func Contains(s []string, e string) bool { + for _, a := range s { + if a == e { + return true + } + } + return false +} diff --git a/middleware.go b/middleware.go new file mode 100644 index 0000000..8452b4f --- /dev/null +++ b/middleware.go @@ -0,0 +1,35 @@ +package geziyor + +import "github.com/geziyor/geziyor/internal" + +// RequestMiddleware called before requests made. 
+// Set request.Cancelled = true to cancel request +type RequestMiddleware func(g *Geziyor, r *Request) + +// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains +func allowedDomainsMiddleware(g *Geziyor, r *Request) { + if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) { + //log.Printf("Domain not allowed: %s\n", req.Host) + r.Cancelled = true + return + } +} + +// duplicateRequestsMiddleware checks for already visited URLs +func duplicateRequestsMiddleware(g *Geziyor, r *Request) { + if !g.Opt.URLRevisitEnabled { + key := r.Request.URL.String() + r.Request.Method + if _, visited := g.visitedURLs.LoadOrStore(key, struct{}{}); visited { + //log.Printf("URL already visited %s\n", rawURL) + r.Cancelled = true + } + } +} + +// defaultHeadersMiddleware sets default request headers +func defaultHeadersMiddleware(g *Geziyor, r *Request) { + r.Header = internal.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + r.Header = internal.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8") + r.Header = internal.SetDefaultHeader(r.Header, "Accept-Language", "en") + r.Header = internal.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent) +} diff --git a/options.go b/options.go index 16a1e5e..7e05593 100644 --- a/options.go +++ b/options.go @@ -18,7 +18,7 @@ type Options struct { StartRequestsFunc func(g *Geziyor) // ParseFunc is callback of StartURLs response. - ParseFunc func(r *Response) + ParseFunc func(g *Geziyor, r *Response) // Timeout is global request timeout Timeout time.Duration @@ -33,7 +33,7 @@ type Options struct { // Concurrent requests per domain limit ConcurrentRequestsPerDomain int - // User Agent + // User Agent. 
Default: "Geziyor 1.0" UserAgent string // Request delays @@ -50,7 +50,7 @@ type Options struct { // Called before requests made to manipulate requests RequestMiddlewares []RequestMiddleware - // Max body reading size in bytes + // Max body reading size in bytes. Default: 1GB MaxBodySize int64 // Charset Detection disable diff --git a/request.go b/request.go index a4362df..ec74626 100644 --- a/request.go +++ b/request.go @@ -11,44 +11,3 @@ type Request struct { Rendered bool Cancelled bool } - -func allowedDomainsMiddleware(g *Geziyor, r *Request) { - if len(g.Opt.AllowedDomains) != 0 && !contains(g.Opt.AllowedDomains, r.Host) { - //log.Printf("Domain not allowed: %s\n", req.Host) - r.Cancelled = true - return - } -} - -func duplicateRequestsMiddleware(g *Geziyor, r *Request) { - if !g.Opt.URLRevisitEnabled { - if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited { - //log.Printf("URL already visited %s\n", rawURL) - r.Cancelled = true - } - } -} - -func defaultHeadersMiddleware(g *Geziyor, r *Request) { - r.Header = headerSetDefault(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") - r.Header = headerSetDefault(r.Header, "Accept-Charset", "utf-8") - r.Header = headerSetDefault(r.Header, "Accept-Language", "en") - r.Header = headerSetDefault(r.Header, "User-Agent", g.Opt.UserAgent) -} - -func headerSetDefault(header http.Header, key string, value string) http.Header { - if header.Get(key) == "" { - header.Set(key, value) - } - return header -} - -// contains checks whether []string contains string -func contains(s []string, e string) bool { - for _, a := range s { - if a == e { - return true - } - } - return false -} diff --git a/response.go b/response.go index 71076e1..2c970a9 100644 --- a/response.go +++ b/response.go @@ -14,8 +14,6 @@ type Response struct { Body []byte DocHTML *goquery.Document Meta map[string]interface{} - - Geziyor *Geziyor } // JoinURL joins base response URL and provided 
relative URL.