Added a custom logger. It is not configurable yet.

This commit is contained in:
Musab Gültekin 2021-04-13 23:36:42 +03:00
parent 129402d754
commit e3d79e2574
10 changed files with 39 additions and 32 deletions

View File

@ -5,12 +5,12 @@ import (
"github.com/chromedp/cdproto/dom" "github.com/chromedp/cdproto/dom"
"github.com/chromedp/cdproto/network" "github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp" "github.com/chromedp/chromedp"
"github.com/geziyor/geziyor/internal"
"github.com/pkg/errors" "github.com/pkg/errors"
"golang.org/x/net/html/charset" "golang.org/x/net/html/charset"
"golang.org/x/text/transform" "golang.org/x/text/transform"
"io" "io"
"io/ioutil" "io/ioutil"
"log"
"net" "net"
"net/http" "net/http"
"net/url" "net/url"
@ -98,7 +98,7 @@ func (c *Client) DoRequest(req *Request) (resp *Response, err error) {
if err != nil { if err != nil {
if req.retryCounter < c.opt.RetryTimes { if req.retryCounter < c.opt.RetryTimes {
req.retryCounter++ req.retryCounter++
log.Println("Retrying:", req.URL.String()) internal.Logger.Println("Retrying:", req.URL.String())
return c.DoRequest(req) return c.DoRequest(req)
} }
return resp, errors.Wrap(err, "Response error") return resp, errors.Wrap(err, "Response error")
@ -109,7 +109,7 @@ func (c *Client) DoRequest(req *Request) (resp *Response, err error) {
if resp.StatusCode == statusCode { if resp.StatusCode == statusCode {
if req.retryCounter < c.opt.RetryTimes { if req.retryCounter < c.opt.RetryTimes {
req.retryCounter++ req.retryCounter++
log.Println("Retrying:", req.URL.String(), resp.StatusCode) internal.Logger.Println("Retrying:", req.URL.String(), resp.StatusCode)
return c.DoRequest(req) return c.DoRequest(req)
} }
} }

View File

@ -4,7 +4,6 @@ import (
"encoding/csv" "encoding/csv"
"fmt" "fmt"
"github.com/geziyor/geziyor/internal" "github.com/geziyor/geziyor/internal"
"log"
"os" "os"
"reflect" "reflect"
"sort" "sort"
@ -23,7 +22,7 @@ func (e *CSV) Export(exports chan interface{}) {
// Create or append file // Create or append file
file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.csv"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil { if err != nil {
log.Printf("Output file creation error: %v\n", err) internal.Logger.Printf("Output file creation error: %v\n", err)
return return
} }
defer file.Close() defer file.Close()
@ -50,7 +49,7 @@ func (e *CSV) Export(exports chan interface{}) {
sort.Strings(values) sort.Strings(values)
} }
if err := writer.Write(values); err != nil { if err := writer.Write(values); err != nil {
log.Printf("CSV writing error on exporter: %v\n", err) internal.Logger.Printf("CSV writing error on exporter: %v\n", err)
} }
} }
writer.Flush() writer.Flush()

View File

@ -4,7 +4,6 @@ import (
"bytes" "bytes"
"encoding/json" "encoding/json"
"github.com/geziyor/geziyor/internal" "github.com/geziyor/geziyor/internal"
"log"
"os" "os"
) )
@ -22,7 +21,7 @@ func (e *JSONLine) Export(exports chan interface{}) {
// Create or append file // Create or append file
file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil { if err != nil {
log.Printf("Output file creation error: %v\n", err) internal.Logger.Printf("Output file creation error: %v\n", err)
return return
} }
defer file.Close() defer file.Close()
@ -34,7 +33,7 @@ func (e *JSONLine) Export(exports chan interface{}) {
// Export data as responses came // Export data as responses came
for res := range exports { for res := range exports {
if err := encoder.Encode(res); err != nil { if err := encoder.Encode(res); err != nil {
log.Printf("JSON encoding error on exporter: %v\n", err) internal.Logger.Printf("JSON encoding error on exporter: %v\n", err)
} }
} }
} }
@ -51,7 +50,7 @@ func (e *JSON) Export(exports chan interface{}) {
// Create or append file // Create or append file
file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) file, err := os.OpenFile(internal.DefaultString(e.FileName, "out.json"), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil { if err != nil {
log.Printf("Output file creation error: %v\n", err) internal.Logger.Printf("Output file creation error: %v\n", err)
return return
} }
defer file.Close() defer file.Close()
@ -62,7 +61,7 @@ func (e *JSON) Export(exports chan interface{}) {
for res := range exports { for res := range exports {
data, err := jsonMarshalLine(res, e.EscapeHTML) data, err := jsonMarshalLine(res, e.EscapeHTML)
if err != nil { if err != nil {
log.Printf("JSON encoding error on exporter: %v\n", err) internal.Logger.Printf("JSON encoding error on exporter: %v\n", err)
continue continue
} }
file.Write(data) file.Write(data)

View File

@ -3,10 +3,10 @@ package geziyor
import ( import (
"github.com/geziyor/geziyor/cache" "github.com/geziyor/geziyor/cache"
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
"github.com/geziyor/geziyor/middleware" "github.com/geziyor/geziyor/middleware"
"io/ioutil" "io/ioutil"
"log"
"net/http/cookiejar" "net/http/cookiejar"
"os" "os"
"os/signal" "os/signal"
@ -118,9 +118,9 @@ func NewGeziyor(opt *Options) *Geziyor {
// Logging // Logging
if opt.LogDisabled { if opt.LogDisabled {
log.SetOutput(ioutil.Discard) internal.Logger.SetOutput(ioutil.Discard)
} else { } else {
log.SetOutput(os.Stdout) internal.Logger.SetOutput(os.Stdout)
} }
return geziyor return geziyor
@ -128,7 +128,7 @@ func NewGeziyor(opt *Options) *Geziyor {
// Start starts scraping // Start starts scraping
func (g *Geziyor) Start() { func (g *Geziyor) Start() {
log.Println("Scraping Started") internal.Logger.Println("Scraping Started")
// Metrics // Metrics
if g.Opt.MetricsType == metrics.Prometheus || g.Opt.MetricsType == metrics.ExpVar { if g.Opt.MetricsType == metrics.Prometheus || g.Opt.MetricsType == metrics.ExpVar {
@ -171,7 +171,7 @@ func (g *Geziyor) Start() {
for { for {
select { select {
case <-shutdownChan: case <-shutdownChan:
log.Println("Received SIGINT, shutting down gracefully. Send again to force") internal.Logger.Println("Received SIGINT, shutting down gracefully. Send again to force")
g.shutdown = true g.shutdown = true
signal.Stop(shutdownChan) signal.Stop(shutdownChan)
case <-shutdownDoneChan: case <-shutdownDoneChan:
@ -184,14 +184,14 @@ func (g *Geziyor) Start() {
close(g.Exports) close(g.Exports)
g.wgExporters.Wait() g.wgExporters.Wait()
shutdownDoneChan <- struct{}{} shutdownDoneChan <- struct{}{}
log.Println("Scraping Finished") internal.Logger.Println("Scraping Finished")
} }
// Get issues a GET to the specified URL. // Get issues a GET to the specified URL.
func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, nil) req, err := client.NewRequest("GET", url, nil)
if err != nil { if err != nil {
log.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return
} }
g.Do(req, callback) g.Do(req, callback)
@ -203,7 +203,7 @@ func (g *Geziyor) Get(url string, callback func(g *Geziyor, r *client.Response))
func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("GET", url, nil) req, err := client.NewRequest("GET", url, nil)
if err != nil { if err != nil {
log.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return
} }
req.Rendered = true req.Rendered = true
@ -214,7 +214,7 @@ func (g *Geziyor) GetRendered(url string, callback func(g *Geziyor, r *client.Re
func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) { func (g *Geziyor) Head(url string, callback func(g *Geziyor, r *client.Response)) {
req, err := client.NewRequest("HEAD", url, nil) req, err := client.NewRequest("HEAD", url, nil)
if err != nil { if err != nil {
log.Printf("Request creating error %v\n", err) internal.Logger.Printf("Request creating error %v\n", err)
return return
} }
g.Do(req, callback) g.Do(req, callback)
@ -254,7 +254,7 @@ func (g *Geziyor) do(req *client.Request, callback func(g *Geziyor, r *client.Re
if g.Opt.ErrorFunc != nil { if g.Opt.ErrorFunc != nil {
g.Opt.ErrorFunc(g, req, err) g.Opt.ErrorFunc(g, req, err)
} else { } else {
log.Println(err) internal.Logger.Println(err)
} }
return return
} }
@ -304,7 +304,7 @@ func (g *Geziyor) releaseSem(req *client.Request) {
// Logs error and stack trace // Logs error and stack trace
func (g *Geziyor) recoverMe() { func (g *Geziyor) recoverMe() {
if r := recover(); r != nil { if r := recover(); r != nil {
log.Println(r, string(debug.Stack())) internal.Logger.Println(r, string(debug.Stack()))
g.metrics.PanicCounter.Add(1) g.metrics.PanicCounter.Add(1)
} }
} }

10
internal/logger.go Normal file
View File

@ -0,0 +1,10 @@
package internal
import (
"log"
"os"
)
// Logger is the logger shared by the geziyor packages in place of the
// standard library's default logger. It writes to os.Stdout with an empty
// prefix and no flags, so each entry is emitted without a timestamp or any
// other header. Its destination can be changed with SetOutput.
var Logger = log.New(os.Stdout, "", 0)

View File

@ -3,7 +3,6 @@ package middleware
import ( import (
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal" "github.com/geziyor/geziyor/internal"
"log"
"sync" "sync"
) )
@ -16,7 +15,7 @@ type AllowedDomains struct {
func (a *AllowedDomains) ProcessRequest(r *client.Request) { func (a *AllowedDomains) ProcessRequest(r *client.Request) {
if len(a.AllowedDomains) != 0 && !internal.Contains(a.AllowedDomains, r.Host) { if len(a.AllowedDomains) != 0 && !internal.Contains(a.AllowedDomains, r.Host) {
if _, logged := a.logOnlyOnce.LoadOrStore(r.Host, struct{}{}); !logged { if _, logged := a.logOnlyOnce.LoadOrStore(r.Host, struct{}{}); !logged {
log.Printf("Domain not allowed: %s\n", r.Host) internal.Logger.Printf("Domain not allowed: %s\n", r.Host)
} }
r.Cancel() r.Cancel()
return return

View File

@ -2,7 +2,7 @@ package middleware
import ( import (
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"log" "github.com/geziyor/geziyor/internal"
"sync" "sync"
) )
@ -18,7 +18,7 @@ func (a *DuplicateRequests) ProcessRequest(r *client.Request) {
requestURL := r.Request.URL.String() requestURL := r.Request.URL.String()
if _, visited := a.visitedURLs.LoadOrStore(requestURL, struct{}{}); visited { if _, visited := a.visitedURLs.LoadOrStore(requestURL, struct{}{}); visited {
if _, logged := a.logOnlyOnce.LoadOrStore(requestURL, struct{}{}); !logged { if _, logged := a.logOnlyOnce.LoadOrStore(requestURL, struct{}{}); !logged {
log.Printf("URL already visited %s\n", requestURL) internal.Logger.Printf("URL already visited %s\n", requestURL)
} }
r.Cancel() r.Cancel()
} }

View File

@ -2,7 +2,7 @@ package middleware
import ( import (
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"log" "github.com/geziyor/geziyor/internal"
) )
// LogStats logs responses // LogStats logs responses
@ -13,6 +13,6 @@ type LogStats struct {
func (p *LogStats) ProcessResponse(r *client.Response) { func (p *LogStats) ProcessResponse(r *client.Response) {
// LogDisabled check is not necessary, but done here for performance reasons // LogDisabled check is not necessary, but done here for performance reasons
if !p.LogDisabled { if !p.LogDisabled {
log.Printf("Crawled: (%d) <%s %s>", r.StatusCode, r.Request.Method, r.Request.URL.String()) internal.Logger.Printf("Crawled: (%d) <%s %s>", r.StatusCode, r.Request.Method, r.Request.URL.String())
} }
} }

View File

@ -4,7 +4,7 @@ import (
"bytes" "bytes"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"log" "github.com/geziyor/geziyor/internal"
) )
// ParseHTML parses response if response is HTML // ParseHTML parses response if response is HTML
@ -16,7 +16,7 @@ func (p *ParseHTML) ProcessResponse(r *client.Response) {
if !p.ParseHTMLDisabled && r.IsHTML() { if !p.ParseHTMLDisabled && r.IsHTML() {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body)) doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r.Body))
if err != nil { if err != nil {
log.Println(err.Error()) internal.Logger.Println(err.Error())
return return
} }
r.HTMLDoc = doc r.HTMLDoc = doc

View File

@ -2,9 +2,9 @@ package middleware
import ( import (
"github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/internal"
"github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/metrics"
"github.com/temoto/robotstxt" "github.com/temoto/robotstxt"
"log"
"strconv" "strconv"
"sync" "sync"
) )
@ -62,7 +62,7 @@ func (m *RobotsTxt) ProcessRequest(r *client.Request) {
if !robotsData.TestAgent(r.URL.Path, r.UserAgent()) { if !robotsData.TestAgent(r.URL.Path, r.UserAgent()) {
m.metrics.RobotsTxtForbiddenCounter.With("method", r.Method).Add(1) m.metrics.RobotsTxtForbiddenCounter.With("method", r.Method).Add(1)
log.Println("Forbidden by robots.txt:", r.URL.String()) internal.Logger.Println("Forbidden by robots.txt:", r.URL.String())
r.Cancel() r.Cancel()
} }
} }