Robots.txt metrics added.

This commit is contained in:
Musab Gültekin 2019-07-08 14:51:54 +03:00
parent d3c4389c46
commit d19465c44a
5 changed files with 47 additions and 16 deletions

View File

@ -124,7 +124,7 @@ geziyor.NewGeziyor(&geziyor.Options{
### Exporting Data ### Exporting Data
You can export data automatically using exporters. Just send data to ```Geziyor.Exports``` chan. You can export data automatically using exporters. Just send data to ```Geziyor.Exports``` chan.
[Available exporters](https://godoc.org/github.com/geziyor/geziyor/exporter) [Available exporters](https://godoc.org/github.com/geziyor/geziyor/export)
```go ```go
geziyor.NewGeziyor(&geziyor.Options{ geziyor.NewGeziyor(&geziyor.Options{

View File

@ -102,7 +102,7 @@ func NewGeziyor(opt *Options) *Geziyor {
geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, metricsMiddleware) geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, metricsMiddleware)
geziyor.resMiddlewares = append(geziyor.resMiddlewares, metricsMiddleware) geziyor.resMiddlewares = append(geziyor.resMiddlewares, metricsMiddleware)
robotsMiddleware := middleware.NewRobotsTxt(geziyor.Client, opt.RobotsTxtDisabled) robotsMiddleware := middleware.NewRobotsTxt(geziyor.Client, geziyor.metrics, opt.RobotsTxtDisabled)
geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, robotsMiddleware) geziyor.reqMiddlewares = append(geziyor.reqMiddlewares, robotsMiddleware)
// Custom Middlewares // Custom Middlewares

View File

@ -26,9 +26,12 @@ const (
// Metrics type stores metrics // Metrics type stores metrics
type Metrics struct { type Metrics struct {
RequestCounter metrics.Counter RequestCounter metrics.Counter
ResponseCounter metrics.Counter ResponseCounter metrics.Counter
PanicCounter metrics.Counter PanicCounter metrics.Counter
RobotsTxtRequestCounter metrics.Counter
RobotsTxtResponseCounter metrics.Counter
RobotsTxtForbiddenCounter metrics.Counter
} }
// NewMetrics creates new metrics with given metrics.Type // NewMetrics creates new metrics with given metrics.Type
@ -36,15 +39,21 @@ func NewMetrics(metricsType Type) *Metrics {
switch metricsType { switch metricsType {
case Discard: case Discard:
return &Metrics{ return &Metrics{
RequestCounter: discard.NewCounter(), RequestCounter: discard.NewCounter(),
ResponseCounter: discard.NewCounter(), ResponseCounter: discard.NewCounter(),
PanicCounter: discard.NewCounter(), PanicCounter: discard.NewCounter(),
RobotsTxtRequestCounter: discard.NewCounter(),
RobotsTxtResponseCounter: discard.NewCounter(),
RobotsTxtForbiddenCounter: discard.NewCounter(),
} }
case ExpVar: case ExpVar:
return &Metrics{ return &Metrics{
RequestCounter: expvar.NewCounter("request_count"), RequestCounter: expvar.NewCounter("request_count"),
ResponseCounter: expvar.NewCounter("response_count"), ResponseCounter: expvar.NewCounter("response_count"),
PanicCounter: expvar.NewCounter("panic_count"), PanicCounter: expvar.NewCounter("panic_count"),
RobotsTxtRequestCounter: expvar.NewCounter("robotstxt_request_count"),
RobotsTxtResponseCounter: expvar.NewCounter("robotstxt_response_count"),
RobotsTxtForbiddenCounter: expvar.NewCounter("robotstxt_forbidden_count"),
} }
case Prometheus: case Prometheus:
return &Metrics{ return &Metrics{
@ -57,12 +66,27 @@ func NewMetrics(metricsType Type) *Metrics {
Namespace: "geziyor", Namespace: "geziyor",
Name: "response_count", Name: "response_count",
Help: "Response count", Help: "Response count",
}, []string{"method"}), }, []string{"status"}),
PanicCounter: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ PanicCounter: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: "geziyor", Namespace: "geziyor",
Name: "panic_count", Name: "panic_count",
Help: "Panic count", Help: "Panic count",
}, []string{}), }, []string{}),
RobotsTxtRequestCounter: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: "geziyor",
Name: "robotstxt_request_count",
Help: "Robotstxt request count",
}, []string{}),
RobotsTxtResponseCounter: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: "geziyor",
Name: "robotstxt_response_count",
Help: "Robotstxt response count",
}, []string{"status"}),
RobotsTxtForbiddenCounter: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: "geziyor",
Name: "robotstxt_forbidden_count",
Help: "Robotstxt forbidden count",
}, []string{"method"}),
} }
default: default:
return nil return nil

View File

@ -3,6 +3,7 @@ package middleware
import ( import (
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
"strconv"
) )
// Metrics sets stats for request and responses // Metrics sets stats for request and responses
@ -15,5 +16,5 @@ func (a *Metrics) ProcessRequest(r *client.Request) {
} }
func (a *Metrics) ProcessResponse(r *client.Response) { func (a *Metrics) ProcessResponse(r *client.Response) {
a.Metrics.ResponseCounter.With("method", r.Request.Method).Add(1) a.Metrics.ResponseCounter.With("status", strconv.Itoa(r.StatusCode)).Add(1)
} }

View File

@ -2,21 +2,25 @@ package middleware
import ( import (
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/metrics"
"github.com/temoto/robotstxt" "github.com/temoto/robotstxt"
"log" "log"
"strconv"
"sync" "sync"
) )
// RobotsTxt middleware filters out requests forbidden by the robots.txt exclusion standard. // RobotsTxt middleware filters out requests forbidden by the robots.txt exclusion standard.
type RobotsTxt struct { type RobotsTxt struct {
metrics *metrics.Metrics
robotsDisabled bool robotsDisabled bool
client *client.Client client *client.Client
mut sync.RWMutex mut sync.RWMutex
robotsMap map[string]*robotstxt.RobotsData robotsMap map[string]*robotstxt.RobotsData
} }
func NewRobotsTxt(client *client.Client, robotsDisabled bool) RequestProcessor { func NewRobotsTxt(client *client.Client, metrics *metrics.Metrics, robotsDisabled bool) RequestProcessor {
return &RobotsTxt{ return &RobotsTxt{
metrics: metrics,
robotsDisabled: robotsDisabled, robotsDisabled: robotsDisabled,
client: client, client: client,
robotsMap: make(map[string]*robotstxt.RobotsData), robotsMap: make(map[string]*robotstxt.RobotsData),
@ -28,7 +32,7 @@ func (m *RobotsTxt) ProcessRequest(r *client.Request) {
return return
} }
// TODO: Locking like this improves performance but causes duplicate requests to robots.txt, // TODO: Locking like this improves performance but sometimes it causes duplicate requests to robots.txt
m.mut.RLock() m.mut.RLock()
robotsData, exists := m.robotsMap[r.Host] robotsData, exists := m.robotsMap[r.Host]
m.mut.RUnlock() m.mut.RUnlock()
@ -39,10 +43,12 @@ func (m *RobotsTxt) ProcessRequest(r *client.Request) {
return // Don't Do anything return // Don't Do anything
} }
m.metrics.RobotsTxtRequestCounter.Add(1)
robotsResp, err := m.client.DoRequestClient(robotsReq) robotsResp, err := m.client.DoRequestClient(robotsReq)
if err != nil { if err != nil {
return // Don't Do anything return // Don't Do anything
} }
m.metrics.RobotsTxtResponseCounter.With("status", strconv.Itoa(robotsResp.StatusCode)).Add(1)
robotsData, err = robotstxt.FromStatusAndBytes(robotsResp.StatusCode, robotsResp.Body) robotsData, err = robotstxt.FromStatusAndBytes(robotsResp.StatusCode, robotsResp.Body)
if err != nil { if err != nil {
@ -55,7 +61,7 @@ func (m *RobotsTxt) ProcessRequest(r *client.Request) {
} }
if !robotsData.TestAgent(r.URL.Path, r.UserAgent()) { if !robotsData.TestAgent(r.URL.Path, r.UserAgent()) {
// TODO: Forbidden requests metrics m.metrics.RobotsTxtForbiddenCounter.With("method", r.Method).Add(1)
log.Println("Forbidden by robots.txt:", r.URL.String()) log.Println("Forbidden by robots.txt:", r.URL.String())
r.Cancel() r.Cancel()
} }