package geziyor

import (
	"bytes"
	"context"
	"io"
	"io/ioutil"
	"log"
	"math/rand"
	"net"
	"net/http"
	"net/url"
	"os"
	"runtime/debug"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/chromedp/cdproto/dom"
	"github.com/chromedp/chromedp"
	"github.com/fpfeng/httpcache"
	"golang.org/x/net/html/charset"
)

// Exporter is the interface implemented by types that export scraped data
// to external resources.
type Exporter interface {
	Export(response *Response)
}
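
// An Exporter receives each Response and typically drains response.Exports,
// the channel that parse callbacks send extracted items into (see the drain
// goroutine in do below). A minimal sketch of a custom exporter; the
// LogExporter type is illustrative, not part of this package:
//
//	type LogExporter struct{}
//
//	func (LogExporter) Export(response *Response) {
//		for item := range response.Exports {
//			log.Printf("exported: %v", item)
//		}
//	}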

// Geziyor is our main scraper type
type Geziyor struct {
	Opt Options

	client    *http.Client
	wg        sync.WaitGroup
	semGlobal chan struct{}
	semHosts  struct {
		sync.RWMutex
		hostSems map[string]chan struct{}
	}
	visitedURLS struct {
		sync.RWMutex
		visitedURLS []string
	}
}

func init() {
	log.SetOutput(os.Stdout)
	rand.Seed(time.Now().UnixNano())
}

// NewGeziyor creates a new Geziyor with default values.
// If options are provided, they override the defaults.
func NewGeziyor(opt Options) *Geziyor {
	geziyor := &Geziyor{
		client: &http.Client{
			Transport: &http.Transport{
				Proxy: http.ProxyFromEnvironment,
				DialContext: (&net.Dialer{
					Timeout:   30 * time.Second,
					KeepAlive: 30 * time.Second,
					DualStack: true,
				}).DialContext,
				MaxIdleConns:          0,    // 0 disables the global idle-connection limit (default: 100)
				MaxIdleConnsPerHost:   1000, // raised from the default of 2 to speed up requests to the same host
				IdleConnTimeout:       90 * time.Second,
				TLSHandshakeTimeout:   10 * time.Second,
				ExpectContinueTimeout: 1 * time.Second,
			},
			Timeout: time.Second * 180, // Google's timeout
		},
		Opt: opt,
	}

	if opt.Cache != nil {
		geziyor.client.Transport = &httpcache.Transport{
			Transport: geziyor.client.Transport, Cache: opt.Cache, MarkCachedResponses: true}
	}
	if opt.Timeout != 0 {
		geziyor.client.Timeout = opt.Timeout
	}
	if opt.ConcurrentRequests != 0 {
		geziyor.semGlobal = make(chan struct{}, opt.ConcurrentRequests)
	}
	if opt.ConcurrentRequestsPerDomain != 0 {
		geziyor.semHosts = struct {
			sync.RWMutex
			hostSems map[string]chan struct{}
		}{hostSems: make(map[string]chan struct{})}
	}
	if opt.UserAgent == "" {
		geziyor.Opt.UserAgent = "Geziyor 1.0"
	}
	if opt.LogDisabled {
		log.SetOutput(ioutil.Discard)
	}
	if opt.MaxBodySize == 0 {
		geziyor.Opt.MaxBodySize = 1024 * 1024 * 1024 // 1GB
	}

	return geziyor
}
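
// A minimal usage sketch. Options fields are the ones referenced elsewhere
// in this file; the URL and parse logic are illustrative:
//
//	g := NewGeziyor(Options{
//		StartURLs: []string{"http://example.com"},
//		ParseFunc: func(r *Response) {
//			log.Println(r.DocHTML.Find("title").Text())
//		},
//	})
//	g.Start()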

// Start starts scraping
func (g *Geziyor) Start() {
	log.Println("Scraping Started")

	if g.Opt.StartRequestsFunc == nil {
		for _, startURL := range g.Opt.StartURLs {
			g.Get(startURL, g.Opt.ParseFunc)
		}
	} else {
		g.Opt.StartRequestsFunc(g)
	}

	// Yield briefly so the initial requests can register with the WaitGroup
	// before we start waiting on it.
	time.Sleep(time.Millisecond)
	g.wg.Wait()

	log.Println("Scraping Finished")
}
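
// When StartRequestsFunc is set, it replaces the StartURLs loop entirely and
// can issue arbitrary initial requests. A sketch (URLs illustrative):
//
//	StartRequestsFunc: func(g *Geziyor) {
//		g.Head("http://example.com/ping", nil)
//		g.GetRendered("http://example.com/spa", g.Opt.ParseFunc)
//	}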

// Get issues a GET to the specified URL.
func (g *Geziyor) Get(url string, callback func(resp *Response)) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("Request creating error %v\n", err)
		return
	}
	g.Do(&Request{Request: req}, callback)
}

// GetRendered issues a GET request using a headless browser.
// It opens a new Chrome instance, makes the request, waits 1 second for the
// HTML DOM to render, and then closes the browser.
// Rendered requests are only supported for GET requests.
func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("Request creating error %v\n", err)
		return
	}
	g.Do(&Request{Request: req, Rendered: true}, callback)
}

// Head issues a HEAD to the specified URL
func (g *Geziyor) Head(url string, callback func(resp *Response)) {
	req, err := http.NewRequest("HEAD", url, nil)
	if err != nil {
		log.Printf("Request creating error %v\n", err)
		return
	}
	g.Do(&Request{Request: req}, callback)
}

// Do sends an HTTP request in a new goroutine.
func (g *Geziyor) Do(req *Request, callback func(resp *Response)) {
	g.wg.Add(1)
	go g.do(req, callback)
}

// do sends an HTTP request, reads the response, runs the exporters, and
// invokes the callback (or the global ParseFunc if no callback is given).
func (g *Geziyor) do(req *Request, callback func(resp *Response)) {
	defer g.wg.Done()
	defer func() {
		if r := recover(); r != nil {
			log.Println(r, string(debug.Stack()))
		}
	}()

	if !g.checkURL(req.URL) {
		return
	}

	// Do the request, normal or Chrome, and read the response
	var response *Response
	var err error
	if !req.Rendered {
		response, err = g.doRequestClient(req)
	} else {
		response, err = g.doRequestChrome(req)
	}
	if err != nil {
		return
	}

	if response.isHTML() {
		response.DocHTML, _ = goquery.NewDocumentFromReader(bytes.NewReader(response.Body))
	}

	// Exporter functions
	for _, exp := range g.Opt.Exporters {
		go exp.Export(response)
	}

	// Drain the exports chan if no exporter functions were added, so that
	// sends to response.Exports never block.
	if len(g.Opt.Exporters) == 0 {
		go func() {
			for range response.Exports {
			}
		}()
	}

	// Callbacks
	if callback != nil {
		callback(response)
	} else if g.Opt.ParseFunc != nil {
		g.Opt.ParseFunc(response)
	}
	// Yield briefly so requests spawned by the callback register with the
	// WaitGroup before this goroutine's wg.Done runs.
	time.Sleep(time.Millisecond)
}

// doRequestClient performs the request with the standard HTTP client and
// reads the (size-limited, charset-decoded) response body.
func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
	g.acquireSem(req)
	defer g.releaseSem(req)

	g.delay()

	// Modify request headers
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept-Charset", "utf-8")
	req.Header.Set("Accept-Language", "en")
	req.Header.Set("User-Agent", g.Opt.UserAgent)

	log.Println("Fetching:", req.URL.String())

	// Do request
	resp, err := g.client.Do(req.Request)
	if resp != nil {
		defer resp.Body.Close()
	}
	if err != nil {
		log.Printf("Response error: %v\n", err)
		return nil, err
	}

	// Limit response body reading
	bodyReader := io.LimitReader(resp.Body, g.Opt.MaxBodySize)

	// Start reading the body and determine its encoding
	if !g.Opt.CharsetDetectDisabled {
		bodyReader, err = charset.NewReader(bodyReader, resp.Header.Get("Content-Type"))
		if err != nil {
			log.Printf("Determine encoding error: %v\n", err)
			return nil, err
		}
	}

	body, err := ioutil.ReadAll(bodyReader)
	if err != nil {
		log.Printf("Reading body error: %v\n", err)
		return nil, err
	}

	response := Response{
		Response: resp,
		Body:     body,
		Meta:     req.Meta,
		Geziyor:  g,
		Exports:  make(chan interface{}),
	}

	return &response, nil
}

// doRequestChrome performs the request in a headless Chrome instance via
// chromedp and captures the rendered DOM as the response body.
func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
	g.acquireSem(req)
	defer g.releaseSem(req)

	g.delay()

	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	var res string

	if err := chromedp.Run(ctx,
		chromedp.Navigate(req.URL.String()),
		chromedp.Sleep(1*time.Second),
		chromedp.ActionFunc(func(ctx context.Context) error {
			node, err := dom.GetDocument().Do(ctx)
			if err != nil {
				return err
			}
			res, err = dom.GetOuterHTML().WithNodeID(node.NodeID).Do(ctx)
			return err
		}),
	); err != nil {
		log.Printf("Request getting rendered error: %v\n", err)
		return nil, err
	}

	response := &Response{
		// Response is left nil: Chrome does not expose a raw *http.Response here.
		Body:    []byte(res),
		Meta:    req.Meta,
		Geziyor: g,
		Exports: make(chan interface{}),
	}

	return response, nil
}

// acquireSem blocks until the request may proceed under both the global and
// the per-host concurrency limits.
func (g *Geziyor) acquireSem(req *Request) {
	if g.Opt.ConcurrentRequests != 0 {
		g.semGlobal <- struct{}{}
	}

	if g.Opt.ConcurrentRequestsPerDomain != 0 {
		g.semHosts.RLock()
		hostSem, exists := g.semHosts.hostSems[req.Host]
		g.semHosts.RUnlock()

		if !exists {
			// Note: two goroutines can race to create the semaphore for a
			// new host; the later write wins.
			hostSem = make(chan struct{}, g.Opt.ConcurrentRequestsPerDomain)
			g.semHosts.Lock()
			g.semHosts.hostSems[req.Host] = hostSem
			g.semHosts.Unlock()
		}
		hostSem <- struct{}{}
	}
}
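
// The buffered channels above use the standard Go counting-semaphore idiom:
// with capacity N, at most N sends succeed before a receive frees a slot.
// The same pattern in isolation:
//
//	sem := make(chan struct{}, 3) // allow at most 3 concurrent workers
//	sem <- struct{}{}             // acquire: blocks while 3 are in flight
//	// ... do work ...
//	<-sem                         // release: frees a slot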

// releaseSem releases the slots acquired by acquireSem.
func (g *Geziyor) releaseSem(req *Request) {
	if g.Opt.ConcurrentRequests != 0 {
		<-g.semGlobal
	}
	if g.Opt.ConcurrentRequestsPerDomain != 0 {
		// Read the host semaphore under the read lock, as acquireSem may be
		// writing to the map concurrently.
		g.semHosts.RLock()
		hostSem := g.semHosts.hostSems[req.Host]
		g.semHosts.RUnlock()
		<-hostSem
	}
}

// checkURL reports whether a URL should be requested: its host must be in
// AllowedDomains (when set) and, unless URLRevisitEnabled, it must not have
// been visited before.
func (g *Geziyor) checkURL(parsedURL *url.URL) bool {
	rawURL := parsedURL.String()

	// Check for allowed domains
	if len(g.Opt.AllowedDomains) != 0 && !contains(g.Opt.AllowedDomains, parsedURL.Host) {
		//log.Printf("Domain not allowed: %s\n", parsedURL.Host)
		return false
	}

	// Check for duplicate requests
	if !g.Opt.URLRevisitEnabled {
		g.visitedURLS.RLock()
		if contains(g.visitedURLS.visitedURLS, rawURL) {
			g.visitedURLS.RUnlock()
			//log.Printf("URL already visited %s\n", rawURL)
			return false
		}
		g.visitedURLS.RUnlock()

		g.visitedURLS.Lock()
		g.visitedURLS.visitedURLS = append(g.visitedURLS.visitedURLS, rawURL)
		g.visitedURLS.Unlock()
	}

	return true
}
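
// Note: the visited-URL check is a linear scan over a slice, so deduplication
// cost grows with every request. A map keyed by URL would make the lookup
// O(1); a sketch of the alternative (not what this file does):
//
//	visited := make(map[string]struct{})
//	if _, ok := visited[rawURL]; ok {
//		return false // already visited
//	}
//	visited[rawURL] = struct{}{}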

// delay sleeps for the configured RequestDelay, optionally randomized to
// between 0.5x and 1.5x of its value.
func (g *Geziyor) delay() {
	// Guard against RequestDelay == 0: rand.Intn panics when its argument
	// is not positive.
	if g.Opt.RequestDelayRandomize && g.Opt.RequestDelay > 0 {
		min := float64(g.Opt.RequestDelay) * 0.5
		max := float64(g.Opt.RequestDelay) * 1.5
		time.Sleep(time.Duration(rand.Intn(int(max-min)) + int(min)))
	} else {
		time.Sleep(g.Opt.RequestDelay)
	}
}
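
// For example, with RequestDelay = 1 * time.Second and randomization enabled,
// min = 0.5s and max = 1.5s, so each request sleeps for a uniformly random
// duration in [0.5s, 1.5s).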

// contains checks whether the string slice s contains e.
func contains(s []string, e string) bool {
	for _, a := range s {
		if a == e {
			return true
		}
	}
	return false
}