diff --git a/README.md b/README.md index d50da9a..ebc09f8 100644 --- a/README.md +++ b/README.md @@ -104,28 +104,6 @@ geziyor.NewGeziyor(&geziyor.Options{ ### Extracting Data -#### Extractors -You can add [Extractor](https://godoc.org/github.com/geziyor/geziyor/extractor) to ```[]Extractors``` option to extract structured data. -```Exporters``` need to be defined in order extractors to work. - -```go -geziyor.NewGeziyor(&geziyor.Options{ - StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, - Extractors: []extract.Extractor{ - &extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"}, - &extract.Text{Name: "title", Selector: ".c-page-title"}, - &extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"}, - &extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"}, - &extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"}, - &extract.Text{Name: "summary", Selector: ".c-entry-summary"}, - &extract.Text{Name: "content", Selector: ".c-entry-content"}, - }, - Exporters: []export.Exporter{&export.JSON{}}, -}).Start() -``` - -#### HTML selectors - We can extract HTML elements using ```response.HTMLDoc```. HTMLDoc is Goquery's [Document](https://godoc.org/github.com/PuerkitoBio/goquery#Document). HTMLDoc can be accessible on Response if response is HTML and can be parsed using Go's built-in HTML [parser](https://godoc.org/golang.org/x/net/html#Parse) @@ -183,7 +161,6 @@ ok github.com/geziyor/geziyor 22.861s If you're interested in helping this project, please consider these features: - Command line tool for: pausing and resuming scraper etc. (like [this](https://docs.scrapy.org/en/latest/topics/commands.html)) -- ~~Automatic item extractors (like [this](https://github.com/andrew-d/goscrape#goscrape))~~ - Deploying Scrapers to Cloud - ~~Automatically exporting extracted data to multiple places (AWS, FTP, DB, JSON, CSV etc)~~ - Downloading media (Images, Videos etc) (like [this](https://docs.scrapy.org/en/latest/topics/media-pipeline.html)) diff --git a/export/export.go b/export/export.go index 6d8cbf8..8f50bd3 100644 --- a/export/export.go +++ b/export/export.go @@ -1,7 +1,6 @@ package export // Exporter interface is for extracting data to external resources. -// Geziyor calls every extractors Export functions before any scraping starts. // Export functions should wait for new data from exports chan. type Exporter interface { Export(exports chan interface{}) diff --git a/extract/attr.go b/extract/attr.go deleted file mode 100644 index 90c3a9e..0000000 --- a/extract/attr.go +++ /dev/null @@ -1,24 +0,0 @@ -package extract - -import ( - "errors" - "github.com/PuerkitoBio/goquery" -) - -var ErrAttrNotExists = errors.New("attribute not exist") - -// Attr returns HTML attribute value of provided selector -type Attr struct { - Name string - Selector string - Attr string -} - -// Extract returns HTML attribute value of provided selector -func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) { - attr, exists := sel.Find(e.Selector).Attr(e.Attr) - if !exists { - return nil, ErrAttrNotExists - } - return map[string]string{e.Name: attr}, nil -} diff --git a/extract/extract.go b/extract/extract.go deleted file mode 100644 index 0342193..0000000 --- a/extract/extract.go +++ /dev/null @@ -1,8 +0,0 @@ -package extract - -import "github.com/PuerkitoBio/goquery" - -// Extractor interface is for extracting data from HTML document -type Extractor interface { - Extract(sel *goquery.Selection) (interface{}, error) -} diff --git a/extract/html.go b/extract/html.go deleted file mode 100644 index 56b2d37..0000000 --- a/extract/html.go +++ /dev/null @@ -1,52 +0,0 @@ -package extract - -import ( - "bytes" - "github.com/PuerkitoBio/goquery" - "golang.org/x/net/html" -) - -// HTML extracts and returns the HTML from inside each element of the given selection. -type HTML struct { - Name string - Selector string -} - -// Extract extracts and returns the HTML from inside each element of the given selection. -func (e HTML) Extract(sel *goquery.Selection) (interface{}, error) { - var ret, h string - var err error - - sel.Find(e.Selector).EachWithBreak(func(i int, s *goquery.Selection) bool { - h, err = s.Html() - if err != nil { - return false - } - - ret += h - return true - }) - if err != nil { - return nil, err - } - - return map[string]string{e.Name: ret}, nil -} - -// OuterHTML extracts and returns the HTML of each element of the given selection. -type OuterHTML struct { - Name string - Selector string -} - -// Extract extracts and returns the HTML of each element of the given selection. -func (e OuterHTML) Extract(sel *goquery.Selection) (interface{}, error) { - output := bytes.NewBufferString("") - for _, node := range sel.Find(e.Selector).Nodes { - if err := html.Render(output, node); err != nil { - return nil, err - } - } - - return map[string]string{e.Name: output.String()}, nil -} diff --git a/extract/text.go b/extract/text.go deleted file mode 100644 index 9800ef5..0000000 --- a/extract/text.go +++ /dev/null @@ -1,22 +0,0 @@ -package extract - -import ( - "github.com/PuerkitoBio/goquery" - "strings" -) - -// Text returns the combined text contents of provided selector. -type Text struct { - Name string - Selector string - TrimSpace bool -} - -// Extract returns the combined text contents of provided selector. -func (e Text) Extract(sel *goquery.Selection) (interface{}, error) { - text := sel.Find(e.Selector).Text() - if e.TrimSpace { - text = strings.TrimSpace(text) - } - return map[string]string{e.Name: text}, nil -} diff --git a/geziyor.go b/geziyor.go index 341e21e..702e9fa 100644 --- a/geziyor.go +++ b/geziyor.go @@ -4,9 +4,12 @@ import ( "github.com/fpfeng/httpcache" "github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/metrics" + "github.com/geziyor/geziyor/middleware" "io/ioutil" "log" "net/http/cookiejar" + "os" + "runtime/debug" "sync" ) @@ -17,8 +20,8 @@ type Geziyor struct { Exports chan interface{} metrics *metrics.Metrics - requestMiddlewares []RequestMiddleware - responseMiddlewares []ResponseMiddleware + requestMiddlewares []middleware.RequestProcessor + responseMiddlewares []middleware.ResponseProcessor wgRequests sync.WaitGroup wgExporters sync.WaitGroup semGlobal chan struct{} @@ -26,7 +29,6 @@ type Geziyor struct { sync.RWMutex hostSems map[string]chan struct{} } - visitedURLs sync.Map } // NewGeziyor creates new Geziyor with default values. @@ -35,22 +37,23 @@ func NewGeziyor(opt *Options) *Geziyor { geziyor := &Geziyor{ Opt: opt, Exports: make(chan interface{}, 1), - requestMiddlewares: []RequestMiddleware{ - allowedDomainsMiddleware, - duplicateRequestsMiddleware, - defaultHeadersMiddleware, - delayMiddleware, - logMiddleware, - metricsRequestMiddleware, + requestMiddlewares: []middleware.RequestProcessor{ + &middleware.AllowedDomains{AllowedDomains: opt.AllowedDomains}, + &middleware.DuplicateRequests{RevisitEnabled: opt.URLRevisitEnabled}, + &middleware.Headers{UserAgent: opt.UserAgent}, + middleware.NewDelay(opt.RequestDelayRandomize, opt.RequestDelay), }, - responseMiddlewares: []ResponseMiddleware{ - parseHTMLMiddleware, - metricsResponseMiddleware, - extractorsMiddleware, + responseMiddlewares: []middleware.ResponseProcessor{ + &middleware.ParseHTML{ParseHTMLDisabled: opt.ParseHTMLDisabled}, + &middleware.LogStats{LogDisabled: opt.LogDisabled}, }, metrics: metrics.NewMetrics(opt.MetricsType), } + metricsMiddleware := &middleware.Metrics{Metrics: geziyor.metrics} + geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, metricsMiddleware) + geziyor.responseMiddlewares = append(geziyor.responseMiddlewares, metricsMiddleware) + // Default if opt.UserAgent == "" { opt.UserAgent = client.DefaultUserAgent @@ -95,6 +98,8 @@ func NewGeziyor(opt *Options) *Geziyor { // Logging if opt.LogDisabled { log.SetOutput(ioutil.Discard) + } else { + log.SetOutput(os.Stdout) } return geziyor @@ -193,10 +198,10 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re if !req.Synchronized { defer g.wgRequests.Done() } - defer recoverMiddleware(g, req) + defer g.recoverMe() for _, middlewareFunc := range g.requestMiddlewares { - middlewareFunc(g, req) + middlewareFunc.ProcessRequest(req) if req.Cancelled { return } @@ -209,7 +214,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re } for _, middlewareFunc := range g.responseMiddlewares { - middlewareFunc(g, res) + middlewareFunc.ProcessResponse(res) } // Callbacks @@ -248,3 +253,12 @@ func (g *Geziyor) releaseSem(req *client.Request) { <-g.semHosts.hostSems[req.Host] } } + +// recoverMe prevents scraping being crashed. +// Logs error and stack trace +func (g *Geziyor) recoverMe() { + if r := recover(); r != nil { + log.Println(r, string(debug.Stack())) + g.metrics.PanicCounter.Add(1) + } +} diff --git a/geziyor_test.go b/geziyor_test.go index e79d14b..41f8cd2 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -8,7 +8,6 @@ import ( "github.com/geziyor/geziyor" "github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/export" - "github.com/geziyor/geziyor/extract" "github.com/geziyor/geziyor/metrics" "net/http" "net/http/httptest" @@ -158,22 +157,6 @@ func TestBasicAuth(t *testing.T) { }).Start() } -func TestExtractor(t *testing.T) { - geziyor.NewGeziyor(&geziyor.Options{ - StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"}, - Extractors: []extract.Extractor{ - extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"}, - extract.Text{Name: "title", Selector: ".c-page-title"}, - extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"}, - extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"}, - extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"}, - extract.Text{Name: "summary", Selector: ".c-entry-summary"}, - extract.Text{Name: "content", Selector: ".c-entry-content"}, - }, - Exporters: []export.Exporter{&export.JSON{}}, - }).Start() -} - func TestRedirect(t *testing.T) { defer leaktest.Check(t)() geziyor.NewGeziyor(&geziyor.Options{ diff --git a/middleware.go b/middleware.go deleted file mode 100644 index 4bffb5a..0000000 --- a/middleware.go +++ /dev/null @@ -1,131 +0,0 @@ -package geziyor - -import ( - "bytes" - "fmt" - "github.com/PuerkitoBio/goquery" - "github.com/geziyor/geziyor/client" - "github.com/geziyor/geziyor/internal" - "log" - "math/rand" - "os" - "reflect" - "runtime/debug" - "time" -) - -// RequestMiddleware called before requests made. -// Set request.Cancelled = true to cancel request -type RequestMiddleware func(g *Geziyor, r *client.Request) - -// ResponseMiddleware called after request response receive -type ResponseMiddleware func(g *Geziyor, r *client.Response) - -func init() { - log.SetOutput(os.Stdout) - rand.Seed(time.Now().UnixNano()) -} - -// ---* REQUEST MIDDLEWARES *--- - -// recoverMiddleware recovers scraping being crashed. -// Logs error and stack trace -func recoverMiddleware(g *Geziyor, r *client.Request) { - if r := recover(); r != nil { - log.Println(r, string(debug.Stack())) - g.metrics.PanicCounter.Add(1) - } -} - -// allowedDomainsMiddleware checks for request host if it exists in AllowedDomains -func allowedDomainsMiddleware(g *Geziyor, r *client.Request) { - if len(g.Opt.AllowedDomains) != 0 && !internal.Contains(g.Opt.AllowedDomains, r.Host) { - //log.Printf("Domain not allowed: %s\n", req.Host) - r.Cancel() - return - } -} - -// duplicateRequestsMiddleware checks for already visited URLs -func duplicateRequestsMiddleware(g *Geziyor, r *client.Request) { - if !g.Opt.URLRevisitEnabled && r.Request.Method == "GET" { - if _, visited := g.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited { - //log.Printf("URL already visited %s\n", rawURL) - r.Cancel() - } - } -} - -// defaultHeadersMiddleware sets default request headers -func defaultHeadersMiddleware(g *Geziyor, r *client.Request) { - r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") - r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8") - r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en") - r.Header = client.SetDefaultHeader(r.Header, "User-Agent", g.Opt.UserAgent) -} - -// delayMiddleware delays requests -func delayMiddleware(g *Geziyor, r *client.Request) { - if g.Opt.RequestDelayRandomize { - min := float64(g.Opt.RequestDelay) * 0.5 - max := float64(g.Opt.RequestDelay) * 1.5 - time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min))) - } else { - time.Sleep(g.Opt.RequestDelay) - } -} - -// logMiddleware logs requests -func logMiddleware(g *Geziyor, r *client.Request) { - // LogDisabled check is not necessary, but done here for performance reasons - if !g.Opt.LogDisabled { - log.Println("Fetching: ", r.URL.String()) - } -} - -// metricsRequestMiddleware sets stats -func metricsRequestMiddleware(g *Geziyor, r *client.Request) { - g.metrics.RequestCounter.With("method", r.Method).Add(1) -} - -// ---* RESPONSE MIDDLEWARES *--- - -// parseHTMLMiddleware parses response if response is HTML -func parseHTMLMiddleware(g *Geziyor, r *client.Response) { - if !g.Opt.ParseHTMLDisabled && r.IsHTML() { - r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) - } -} - -// metricsResponseMiddleware sets stats -func metricsResponseMiddleware(g *Geziyor, r *client.Response) { - g.metrics.ResponseCounter.With("method", r.Request.Method).Add(1) -} - -// extractorsMiddleware extracts data from loaders conf and exports it to exporters -func extractorsMiddleware(g *Geziyor, r *client.Response) { - - // Check if we have extractors and exporters - if len(g.Opt.Extractors) != 0 && len(g.Opt.Exporters) != 0 { - exports := map[string]interface{}{} - - for _, extractor := range g.Opt.Extractors { - extracted, err := extractor.Extract(r.HTMLDoc.Selection) - if err != nil { - log.Println("extraction error: ", err) - continue - } - - // Check extracted data type and use it accordingly - val := reflect.ValueOf(extracted) - switch val.Kind() { - case reflect.Map: - r := val.MapRange() - for r.Next() { - exports[fmt.Sprint(r.Key())] = r.Value().Interface() - } - } - } - g.Exports <- exports - } -} diff --git a/middleware/allowed_domains.go b/middleware/allowed_domains.go new file mode 100644 index 0000000..88e1216 --- /dev/null +++ b/middleware/allowed_domains.go @@ -0,0 +1,19 @@ +package middleware + +import ( + "github.com/geziyor/geziyor/client" + "github.com/geziyor/geziyor/internal" +) + +// AllowedDomains checks for request host if it exists in AllowedDomains +type AllowedDomains struct { + AllowedDomains []string +} + +func (a *AllowedDomains) ProcessRequest(r *client.Request) { + if len(a.AllowedDomains) != 0 && !internal.Contains(a.AllowedDomains, r.Host) { + //log.Printf("Domain not allowed: %s\n", req.Host) + r.Cancel() + return + } +} diff --git a/middleware/delay.go b/middleware/delay.go new file mode 100644 index 0000000..aac57c0 --- /dev/null +++ b/middleware/delay.go @@ -0,0 +1,30 @@ +package middleware + +import ( + "github.com/geziyor/geziyor/client" + "math/rand" + "time" +) + +// delay delays requests +type delay struct { + requestDelayRandomize bool + requestDelay time.Duration +} + +func NewDelay(requestDelayRandomize bool, requestDelay time.Duration) RequestProcessor { + if requestDelayRandomize { + rand.Seed(time.Now().UnixNano()) + } + return &delay{requestDelayRandomize: requestDelayRandomize, requestDelay: requestDelay} +} + +func (a *delay) ProcessRequest(r *client.Request) { + if a.requestDelayRandomize { + min := float64(a.requestDelay) * 0.5 + max := float64(a.requestDelay) * 1.5 + time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min))) + } else { + time.Sleep(a.requestDelay) + } +} diff --git a/middleware_test.go b/middleware/delay_test.go similarity index 59% rename from middleware_test.go rename to middleware/delay_test.go index 55097d1..6913f26 100644 --- a/middleware_test.go +++ b/middleware/delay_test.go @@ -1,7 +1,7 @@ -package geziyor +package middleware import ( - "fmt" + "github.com/stretchr/testify/assert" "math/rand" "testing" "time" @@ -13,5 +13,7 @@ func TestRandomDelay(t *testing.T) { min := float64(delay) * 0.5 max := float64(delay) * 1.5 randomDelay := rand.Intn(int(max-min)) + int(min) - fmt.Println(time.Duration(randomDelay)) + + assert.True(t, time.Duration(randomDelay).Seconds() < 1.5) + assert.True(t, time.Duration(randomDelay).Seconds() > 0.5) } diff --git a/middleware/duplicate_requests.go b/middleware/duplicate_requests.go new file mode 100644 index 0000000..a120083 --- /dev/null +++ b/middleware/duplicate_requests.go @@ -0,0 +1,21 @@ +package middleware + +import ( + "github.com/geziyor/geziyor/client" + "sync" +) + +// DuplicateRequests checks for already visited URLs +type DuplicateRequests struct { + RevisitEnabled bool + visitedURLs sync.Map +} + +func (a *DuplicateRequests) ProcessRequest(r *client.Request) { + if !a.RevisitEnabled && r.Request.Method == "GET" { + if _, visited := a.visitedURLs.LoadOrStore(r.Request.URL.String(), struct{}{}); visited { + //log.Printf("URL already visited %s\n") + r.Cancel() + } + } +} diff --git a/middleware/headers.go b/middleware/headers.go new file mode 100644 index 0000000..40ec225 --- /dev/null +++ b/middleware/headers.go @@ -0,0 +1,17 @@ +package middleware + +import ( + "github.com/geziyor/geziyor/client" +) + +// Headers sets default request headers +type Headers struct { + UserAgent string +} + +func (a *Headers) ProcessRequest(r *client.Request) { + r.Header = client.SetDefaultHeader(r.Header, "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + r.Header = client.SetDefaultHeader(r.Header, "Accept-Charset", "utf-8") + r.Header = client.SetDefaultHeader(r.Header, "Accept-Language", "en") + r.Header = client.SetDefaultHeader(r.Header, "User-Agent", a.UserAgent) +} diff --git a/middleware/log_stats.go b/middleware/log_stats.go new file mode 100644 index 0000000..aa1ac3e --- /dev/null +++ b/middleware/log_stats.go @@ -0,0 +1,18 @@ +package middleware + +import ( + "github.com/geziyor/geziyor/client" + "log" +) + +// LogStats logs responses +type LogStats struct { + LogDisabled bool +} + +func (p *LogStats) ProcessResponse(r *client.Response) { + // LogDisabled check is not necessary, but done here for performance reasons + if !p.LogDisabled { + log.Printf("Crawled: (%d) <%s %s>", r.StatusCode, r.Request.Method, r.Request.URL.String()) + } +} diff --git a/middleware/metrics.go b/middleware/metrics.go new file mode 100644 index 0000000..b775850 --- /dev/null +++ b/middleware/metrics.go @@ -0,0 +1,19 @@ +package middleware + +import ( + "github.com/geziyor/geziyor/client" + "github.com/geziyor/geziyor/metrics" +) + +// Metrics sets stats for request and responses +type Metrics struct { + Metrics *metrics.Metrics +} + +func (a *Metrics) ProcessRequest(r *client.Request) { + a.Metrics.RequestCounter.With("method", r.Method).Add(1) +} + +func (a *Metrics) ProcessResponse(r *client.Response) { + a.Metrics.ResponseCounter.With("method", r.Request.Method).Add(1) +} diff --git a/middleware/middleware.go b/middleware/middleware.go new file mode 100644 index 0000000..04e27eb --- /dev/null +++ b/middleware/middleware.go @@ -0,0 +1,21 @@ +package middleware + +import ( + "github.com/geziyor/geziyor/client" +) + +type RequestResponseProcessor interface { + RequestProcessor + ResponseProcessor +} + +// RequestProcessor called before requests made. +// Set request.Cancelled = true to cancel request +type RequestProcessor interface { + ProcessRequest(r *client.Request) +} + +// ResponseProcessor called after request response receive +type ResponseProcessor interface { + ProcessResponse(r *client.Response) +} diff --git a/middleware/parse_html.go b/middleware/parse_html.go new file mode 100644 index 0000000..4d74184 --- /dev/null +++ b/middleware/parse_html.go @@ -0,0 +1,18 @@ +package middleware + +import ( + "bytes" + "github.com/PuerkitoBio/goquery" + "github.com/geziyor/geziyor/client" +) + +// ParseHTML parses response if response is HTML +type ParseHTML struct { + ParseHTMLDisabled bool +} + +func (p *ParseHTML) ProcessResponse(r *client.Response) { + if !p.ParseHTMLDisabled && r.IsHTML() { + r.HTMLDoc, _ = goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) + } +} diff --git a/options.go b/options.go index d56cdf9..8dd8fe6 100644 --- a/options.go +++ b/options.go @@ -4,8 +4,8 @@ import ( "github.com/fpfeng/httpcache" "github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/export" - "github.com/geziyor/geziyor/extract" "github.com/geziyor/geziyor/metrics" + "github.com/geziyor/geziyor/middleware" "time" ) @@ -24,9 +24,6 @@ type Options struct { // ParseFunc is callback of StartURLs response. ParseFunc func(g *Geziyor, r *client.Response) - // Extractors extracts items from pages - Extractors []extract.Extractor - // Timeout is global request timeout Timeout time.Duration @@ -56,10 +53,10 @@ type Options struct { Exporters []export.Exporter // Called before requests made to manipulate requests - RequestMiddlewares []RequestMiddleware + RequestMiddlewares []middleware.RequestProcessor // Called after response received - ResponseMiddlewares []ResponseMiddleware + ResponseMiddlewares []middleware.ResponseProcessor // Max body reading size in bytes. Default: 1GB MaxBodySize int64