From d19465c44a4ad318f5274aa8916ec941eb98789e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Mon, 8 Jul 2019 14:51:54 +0300 Subject: [PATCH] Robotstxt metrics added. --- README.md | 2 +- geziyor.go | 2 +- metrics/metrics.go | 44 +++++++++++++++++++++++++++++++---------- middleware/metrics.go | 3 ++- middleware/robotstxt.go | 12 ++++++++--- 5 files changed, 47 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b64e715..147addd 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ geziyor.NewGeziyor(&geziyor.Options{ ### Exporting Data You can export data automatically using exporters. Just send data to ```Geziyor.Exports``` chan. -[Available exporters](https://godoc.org/github.com/geziyor/geziyor/exporter) +[Available exporters](https://godoc.org/github.com/geziyor/geziyor/export) ```go geziyor.NewGeziyor(&geziyor.Options{ diff --git a/geziyor.go b/geziyor.go index fb51d7b..01f68cb 100644 --- a/geziyor.go +++ b/geziyor.go @@ -102,7 +102,7 @@ func NewGeziyor(opt *Options) *Geziyor { geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, metricsMiddleware) geziyor.resMiddlewares = append(geziyor.resMiddlewares, metricsMiddleware) - robotsMiddleware := middleware.NewRobotsTxt(geziyor.Client, opt.RobotsTxtDisabled) + robotsMiddleware := middleware.NewRobotsTxt(geziyor.Client, geziyor.metrics, opt.RobotsTxtDisabled) geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, robotsMiddleware) // Custom Middlewares diff --git a/metrics/metrics.go b/metrics/metrics.go index d864fb2..07687b6 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -26,9 +26,12 @@ const ( // Metrics type stores metrics type Metrics struct { - RequestCounter metrics.Counter - ResponseCounter metrics.Counter - PanicCounter metrics.Counter + RequestCounter metrics.Counter + ResponseCounter metrics.Counter + PanicCounter metrics.Counter + RobotsTxtRequestCounter metrics.Counter + RobotsTxtResponseCounter metrics.Counter + RobotsTxtForbiddenCounter metrics.Counter } // NewMetrics creates new metrics with given metrics.Type @@ -36,15 +39,21 @@ func NewMetrics(metricsType Type) *Metrics { switch metricsType { case Discard: return &Metrics{ - RequestCounter: discard.NewCounter(), - ResponseCounter: discard.NewCounter(), - PanicCounter: discard.NewCounter(), + RequestCounter: discard.NewCounter(), + ResponseCounter: discard.NewCounter(), + PanicCounter: discard.NewCounter(), + RobotsTxtRequestCounter: discard.NewCounter(), + RobotsTxtResponseCounter: discard.NewCounter(), + RobotsTxtForbiddenCounter: discard.NewCounter(), } case ExpVar: return &Metrics{ - RequestCounter: expvar.NewCounter("request_count"), - ResponseCounter: expvar.NewCounter("response_count"), - PanicCounter: expvar.NewCounter("panic_count"), + RequestCounter: expvar.NewCounter("request_count"), + ResponseCounter: expvar.NewCounter("response_count"), + PanicCounter: expvar.NewCounter("panic_count"), + RobotsTxtRequestCounter: expvar.NewCounter("robotstxt_request_count"), + RobotsTxtResponseCounter: expvar.NewCounter("robotstxt_response_count"), + RobotsTxtForbiddenCounter: expvar.NewCounter("robotstxt_forbidden_count"), } case Prometheus: return &Metrics{ @@ -57,12 +66,27 @@ func NewMetrics(metricsType Type) *Metrics { Namespace: "geziyor", Name: "response_count", Help: "Response count", - }, []string{"method"}), + }, []string{"status"}), PanicCounter: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ Namespace: "geziyor", Name: "panic_count", Help: "Panic count", }, []string{}), + RobotsTxtRequestCounter: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: "geziyor", + Name: "robotstxt_request_count", + Help: "Robotstxt request count", + }, []string{}), + RobotsTxtResponseCounter: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: "geziyor", + Name: "robotstxt_response_count", + Help: "Robotstxt response count", + }, []string{"status"}), + RobotsTxtForbiddenCounter: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: "geziyor", + Name: "robotstxt_forbidden_count", + Help: "Robotstxt forbidden count", + }, []string{"method"}), } default: return nil diff --git a/middleware/metrics.go b/middleware/metrics.go index b775850..047bc7a 100644 --- a/middleware/metrics.go +++ b/middleware/metrics.go @@ -3,6 +3,7 @@ package middleware import ( "github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/metrics" + "strconv" ) // Metrics sets stats for request and responses @@ -15,5 +16,5 @@ func (a *Metrics) ProcessRequest(r *client.Request) { } func (a *Metrics) ProcessResponse(r *client.Response) { - a.Metrics.ResponseCounter.With("method", r.Request.Method).Add(1) + a.Metrics.ResponseCounter.With("status", strconv.Itoa(r.StatusCode)).Add(1) } diff --git a/middleware/robotstxt.go b/middleware/robotstxt.go index 3c8339b..d6bb4c6 100644 --- a/middleware/robotstxt.go +++ b/middleware/robotstxt.go @@ -2,21 +2,25 @@ package middleware import ( "github.com/geziyor/geziyor/client" + "github.com/geziyor/geziyor/metrics" "github.com/temoto/robotstxt" "log" + "strconv" "sync" ) // RobotsTxt middleware filters out requests forbidden by the robots.txt exclusion standard. type RobotsTxt struct { + metrics *metrics.Metrics robotsDisabled bool client *client.Client mut sync.RWMutex robotsMap map[string]*robotstxt.RobotsData } -func NewRobotsTxt(client *client.Client, robotsDisabled bool) RequestProcessor { +func NewRobotsTxt(client *client.Client, metrics *metrics.Metrics, robotsDisabled bool) RequestProcessor { return &RobotsTxt{ + metrics: metrics, robotsDisabled: robotsDisabled, client: client, robotsMap: make(map[string]*robotstxt.RobotsData), @@ -28,7 +32,7 @@ func (m *RobotsTxt) ProcessRequest(r *client.Request) { return } - // TODO: Locking like this improves performance but causes duplicate requests to robots.txt, + // TODO: Locking like this improves performance but sometimes it causes duplicate requests to robots.txt m.mut.RLock() robotsData, exists := m.robotsMap[r.Host] m.mut.RUnlock() @@ -39,10 +43,12 @@ func (m *RobotsTxt) ProcessRequest(r *client.Request) { return // Don't Do anything } + m.metrics.RobotsTxtRequestCounter.Add(1) robotsResp, err := m.client.DoRequestClient(robotsReq) if err != nil { return // Don't Do anything } + m.metrics.RobotsTxtResponseCounter.With("status", strconv.Itoa(robotsResp.StatusCode)).Add(1) robotsData, err = robotstxt.FromStatusAndBytes(robotsResp.StatusCode, robotsResp.Body) if err != nil { @@ -55,7 +61,7 @@ func (m *RobotsTxt) ProcessRequest(r *client.Request) { } if !robotsData.TestAgent(r.URL.Path, r.UserAgent()) { - // TODO: Forbidden requests metrics + m.metrics.RobotsTxtForbiddenCounter.With("method", r.Method).Add(1) log.Println("Forbidden by robots.txt:", r.URL.String()) r.Cancel() }