geziyor/geziyor.go
Commit ddff3aee25 by Musab Gültekin (2019-06-15 22:27:46 +03:00):
Added request cancellation support to middlewares.
Refactored some core functions as middlewares.
Fixed a race condition in the exporting system: only one goroutine is now responsible for exporting, which fixes concurrency issues when writing.

package geziyor

import (
	"bytes"
	"context"
	"github.com/PuerkitoBio/goquery"
	"github.com/chromedp/cdproto/dom"
	"github.com/chromedp/chromedp"
	"github.com/fpfeng/httpcache"
	"golang.org/x/net/html/charset"
	"io"
	"io/ioutil"
	"log"
	"math/rand"
	"net"
	"net/http"
	"os"
	"runtime/debug"
	"sync"
	"time"
)

// Exporter interface is for extracting data to external resources
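//
// A minimal exporter sketch that just logs each exported item (illustrative
// only; logExporter is not part of this package):
//
//	type logExporter struct{}
//
//	func (logExporter) Export(exports chan interface{}) {
//		for item := range exports {
//			log.Println(item)
//		}
//	}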
type Exporter interface {
	Export(exports chan interface{})
}

// RequestMiddleware is called before each request is made.
// Set request.Cancelled = true to cancel the request.
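//
// A minimal middleware sketch that cancels every non-HTTPS request
// (illustrative only; register it via Options.RequestMiddlewares):
//
//	func httpsOnlyMiddleware(g *Geziyor, r *Request) {
//		if r.URL.Scheme != "https" {
//			r.Cancelled = true
//		}
//	}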
type RequestMiddleware func(g *Geziyor, r *Request)

// Geziyor is our main scraper type
type Geziyor struct {
	Opt                Options
	Exports            chan interface{}
	client             *http.Client
	wg                 sync.WaitGroup
	semGlobal          chan struct{}
	semHosts           struct {
		sync.RWMutex
		hostSems map[string]chan struct{}
	}
	visitedURLs        sync.Map
	requestMiddlewares []RequestMiddleware
}
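
// init routes the package logger to stdout and seeds the global random source
// used for randomized request delays.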
func init() {
	log.SetOutput(os.Stdout)
	rand.Seed(time.Now().UnixNano())
}

// NewGeziyor creates a new Geziyor with default values.
// If options are provided, they override those defaults.
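//
// A minimal usage sketch (the URL and selector are illustrative only):
//
//	NewGeziyor(Options{
//		StartURLs: []string{"http://quotes.toscrape.com/"},
//		ParseFunc: func(r *Response) {
//			r.Geziyor.Exports <- r.DocHTML.Find("title").Text()
//		},
//	}).Start()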
func NewGeziyor(opt Options) *Geziyor {
	geziyor := &Geziyor{
		client: &http.Client{
			Transport: &http.Transport{
				Proxy: http.ProxyFromEnvironment,
				DialContext: (&net.Dialer{
					Timeout:   30 * time.Second,
					KeepAlive: 30 * time.Second,
					DualStack: true,
				}).DialContext,
				MaxIdleConns:          0,    // Default: 100
				MaxIdleConnsPerHost:   1000, // Default: 2
				IdleConnTimeout:       90 * time.Second,
				TLSHandshakeTimeout:   10 * time.Second,
				ExpectContinueTimeout: 1 * time.Second,
			},
			Timeout: time.Second * 180, // Google's timeout
		},
		Opt:     opt,
		Exports: make(chan interface{}),
		requestMiddlewares: []RequestMiddleware{
			allowedDomainsMiddleware,
			duplicateRequestsMiddleware,
			defaultHeadersMiddleware,
		},
	}

	if opt.Cache != nil {
		geziyor.client.Transport = &httpcache.Transport{
			Transport: geziyor.client.Transport, Cache: opt.Cache, MarkCachedResponses: true}
	}
	if opt.Timeout != 0 {
		geziyor.client.Timeout = opt.Timeout
	}
	if opt.ConcurrentRequests != 0 {
		geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
	}
	if opt.ConcurrentRequestsPerDomain != 0 {
		geziyor.semHosts = struct {
			sync.RWMutex
			hostSems map[string]chan struct{}
		}{hostSems: make(map[string]chan struct{})}
	}
	if opt.UserAgent == "" {
		geziyor.Opt.UserAgent = "Geziyor 1.0"
	}
	if opt.LogDisabled {
		log.SetOutput(ioutil.Discard)
	}
	if opt.MaxBodySize == 0 {
		geziyor.Opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB
	}

	geziyor.requestMiddlewares = append(geziyor.requestMiddlewares, opt.RequestMiddlewares...)

	return geziyor
}

// Start starts scraping
func (g *Geziyor) Start() {
	log.Println("Scraping Started")

	if len(g.Opt.Exporters) != 0 {
		for _, exp := range g.Opt.Exporters {
			go exp.Export(g.Exports)
		}
	} else {
		go func() {
			for range g.Exports {
			}
		}()
	}

	if g.Opt.StartRequestsFunc == nil {
		for _, startURL := range g.Opt.StartURLs {
			g.Get(startURL, g.Opt.ParseFunc)
		}
	} else {
		g.Opt.StartRequestsFunc(g)
	}

	g.wg.Wait()
	close(g.Exports)
	log.Println("Scraping Finished")
}

// Get issues a GET to the specified URL.
func (g *Geziyor) Get(url string, callback func(resp *Response)) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("Request creating error %v\n", err)
		return
	}
	g.Do(&Request{Request: req}, callback)
}

// GetRendered issues a GET request using a headless browser.
// It opens a new Chrome instance, makes the request, waits 1 second for the
// HTML DOM to render, and then closes the instance.
// Rendering is only supported for GET requests.
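//
// A sketch of calling it from Options.StartRequestsFunc (the URL is
// illustrative only):
//
//	StartRequestsFunc: func(g *Geziyor) {
//		g.GetRendered("https://example.com", g.Opt.ParseFunc)
//	},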
func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("Request creating error %v\n", err)
		return
	}
	g.Do(&Request{Request: req, Rendered: true}, callback)
}

// Head issues a HEAD to the specified URL
func (g *Geziyor) Head(url string, callback func(resp *Response)) {
	req, err := http.NewRequest("HEAD", url, nil)
	if err != nil {
		log.Printf("Request creating error %v\n", err)
		return
	}
	g.Do(&Request{Request: req}, callback)
}

// Do sends an HTTP request
func (g *Geziyor) Do(req *Request, callback func(resp *Response)) {
	g.wg.Add(1)
	go g.do(req, callback)
}

// do performs the request in its own goroutine: it runs the request
// middlewares, executes the request (via the HTTP client or headless Chrome),
// parses the HTML if enabled, and invokes the callback or ParseFunc.
func (g *Geziyor) do(req *Request, callback func(resp *Response)) {
	defer g.wg.Done()
	defer func() {
		if r := recover(); r != nil {
			log.Println(r, string(debug.Stack()))
		}
	}()

	for _, middlewareFunc := range g.requestMiddlewares {
		middlewareFunc(g, req)
		if req.Cancelled {
			return
		}
	}

	// Do request normal or Chrome and read response
	var response *Response
	var err error
	if !req.Rendered {
		response, err = g.doRequestClient(req)
	} else {
		response, err = g.doRequestChrome(req)
	}
	if err != nil {
		return
	}

	if !g.Opt.ParseHTMLDisabled && response.isHTML() {
		response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body))
	}

	// Callbacks
	if callback != nil {
		callback(response)
	} else {
		if g.Opt.ParseFunc != nil {
			g.Opt.ParseFunc(response)
		}
	}
}
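
// doRequestClient performs the request with the standard HTTP client,
// respecting the concurrency limits, request delay, MaxBodySize, and charset
// detection options.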
func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
	g.acquireSem(req)
	defer g.releaseSem(req)

	g.delay()

	log.Println("Fetching: ", req.URL.String())

	// Do request
	resp, err := g.client.Do(req.Request)
	if resp != nil {
		defer resp.Body.Close()
	}
	if err != nil {
		log.Printf("Response error: %v\n", err)
		return nil, err
	}

	// Limit response body reading
	bodyReader := io.LimitReader(resp.Body, g.Opt.MaxBodySize)

	// Start reading body and determine encoding
	if !g.Opt.CharsetDetectDisabled {
		bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
		if err != nil {
			log.Printf("Determine encoding error: %v\n", err)
			return nil, err
		}
	}

	body, err := ioutil.ReadAll(bodyReader)
	if err != nil {
		log.Printf("Reading Body error: %v\n", err)
		return nil, err
	}

	response := Response{
		Response: resp,
		Body:     body,
		Meta:     req.Meta,
		Geziyor:  g,
	}
	return &response, nil
}
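
// doRequestChrome renders the page in a headless Chrome instance via chromedp
// and returns the rendered outer HTML as the response body.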
func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
	g.acquireSem(req)
	defer g.releaseSem(req)

	g.delay()

	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	var res string
	if err := chromedp.Run(ctx,
		chromedp.Navigate(req.URL.String()),
		chromedp.Sleep(1*time.Second),
		chromedp.ActionFunc(func(ctx context.Context) error {
			node, err := dom.GetDocument().Do(ctx)
			if err != nil {
				return err
			}
			res, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
			return err
		}),
	); err != nil {
		log.Printf("Request getting rendered error: %v\n", err)
		return nil, err
	}

	response := &Response{
		//Response: resp,
		Body:    []byte(res),
		Meta:    req.Meta,
		Geziyor: g,
	}
	return response, nil
}
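
// acquireSem blocks until a slot is free on the global and/or per-host
// semaphores, as configured by ConcurrentRequests and
// ConcurrentRequestsPerDomain.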
func (g *Geziyor) acquireSem(req *Request) {
	if g.Opt.ConcurrentRequests != 0 {
		g.semGlobal <- struct{}{}
	}

	if g.Opt.ConcurrentRequestsPerDomain != 0 {
		g.semHosts.RLock()
		hostSem, exists := g.semHosts.hostSems[req.Host]
		g.semHosts.RUnlock()

		if !exists {
			hostSem = make(chan struct{}, g.Opt.ConcurrentRequestsPerDomain)
			g.semHosts.Lock()
			g.semHosts.hostSems[req.Host] = hostSem
			g.semHosts.Unlock()
		}

		hostSem <- struct{}{}
	}
}
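
// releaseSem frees the semaphore slots taken by acquireSem for this request.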
func (g *Geziyor) releaseSem(req *Request) {
	if g.Opt.ConcurrentRequests != 0 {
		<-g.semGlobal
	}
	if g.Opt.ConcurrentRequestsPerDomain != 0 {
		<-g.semHosts.hostSems[req.Host]
	}
}
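
// delay sleeps for the configured RequestDelay; when RequestDelayRandomize is
// set, the sleep is randomized between 0.5x and 1.5x of that value.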
func (g *Geziyor) delay() {
	if g.Opt.RequestDelayRandomize {
		min := float64(g.Opt.RequestDelay) * 0.5
		max := float64(g.Opt.RequestDelay) * 1.5
		time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
	} else {
		time.Sleep(g.Opt.RequestDelay)
	}
}