Exported the Rendered field so that rendered requests can be issued through the Do function. Fixed data races.

This commit is contained in:
Musab Gültekin 2019-06-14 15:23:56 +03:00
parent 1a7d480b36
commit 6caf1effd6
8 changed files with 47 additions and 68 deletions

View File

@ -49,7 +49,7 @@ func quotesParse(r *geziyor.Response) {
}
})
if href, ok := r.DocHTML.Find("li.next > a").Attr("href"); ok {
go r.Geziyor.Get(r.JoinURL(href), quotesParse)
r.Geziyor.Get(r.JoinURL(href), quotesParse)
}
}
```

View File

@ -19,7 +19,7 @@ type CSVExporter struct {
writer *csv.Writer
}
func (e CSVExporter) Export(response *geziyor.Response) {
func (e *CSVExporter) Export(response *geziyor.Response) {
// Default filename
if e.FileName == "" {

View File

@ -19,7 +19,7 @@ type JSONExporter struct {
}
// Export exports response data as JSON streaming file
func (e JSONExporter) Export(response *geziyor.Response) {
func (e *JSONExporter) Export(response *geziyor.Response) {
// Default filename
if e.FileName == "" {

View File

@ -29,14 +29,17 @@ type Exporter interface {
type Geziyor struct {
Opt Options
client *http.Client
wg sync.WaitGroup
visitedURLS []string
semGlobal chan struct{}
semHosts struct {
client *http.Client
wg sync.WaitGroup
semGlobal chan struct{}
semHosts struct {
sync.RWMutex
hostSems map[string]chan struct{}
}
visitedURLS struct {
sync.RWMutex
visitedURLS []string
}
}
func init() {
@ -88,7 +91,7 @@ func (g *Geziyor) Start() {
if g.Opt.StartRequestsFunc == nil {
for _, startURL := range g.Opt.StartURLs {
go g.Get(startURL, g.Opt.ParseFunc)
g.Get(startURL, g.Opt.ParseFunc)
}
} else {
g.Opt.StartRequestsFunc(g)
@ -112,13 +115,14 @@ func (g *Geziyor) Get(url string, callback func(resp *Response)) {
// GetRendered issues GET request using headless browser
// Opens up a new Chrome instance, makes the request, waits for 1 second to render the HTML DOM, and then closes it.
// Rendered requests are only supported for GET requests.
func (g *Geziyor) GetRendered(url string, callback func(resp *Response)) {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
log.Printf("Request creating error %v\n", err)
return
}
g.Do(&Request{Request: req, rendered: true}, callback)
g.Do(&Request{Request: req, Rendered: true}, callback)
}
// Head issues a HEAD to the specified URL
@ -134,6 +138,11 @@ func (g *Geziyor) Head(url string, callback func(resp *Response)) {
// Do sends an HTTP request
func (g *Geziyor) Do(req *Request, callback func(resp *Response)) {
g.wg.Add(1)
go g.do(req, callback)
}
// do sends an HTTP request; Do starts it in its own goroutine, and do marks the WaitGroup done when it returns
func (g *Geziyor) do(req *Request, callback func(resp *Response)) {
defer g.wg.Done()
defer func() {
if r := recover(); r != nil {
@ -145,11 +154,11 @@ func (g *Geziyor) Do(req *Request, callback func(resp *Response)) {
return
}
// Do request normal or chrome and read response
// Do request normal or Chrome and read response
var response *Response
var err error
if !req.rendered {
response, err = g.doRequest(req)
if !req.Rendered {
response, err = g.doRequestClient(req)
} else {
response, err = g.doRequestChrome(req)
}
@ -185,7 +194,7 @@ func (g *Geziyor) Do(req *Request, callback func(resp *Response)) {
time.Sleep(time.Millisecond)
}
func (g *Geziyor) doRequest(req *Request) (*Response, error) {
func (g *Geziyor) doRequestClient(req *Request) (*Response, error) {
g.acquireSem(req)
defer g.releaseSem(req)
@ -267,8 +276,8 @@ func (g *Geziyor) doRequestChrome(req *Request) (*Response, error) {
response := &Response{
//Response: resp,
Body: []byte(res),
//Meta: request.Meta,
Body: []byte(res),
Meta: req.Meta,
Geziyor: g,
Exports: make(chan interface{}),
}
@ -314,11 +323,16 @@ func (g *Geziyor) checkURL(parsedURL *url.URL) bool {
// Check for duplicate requests
if !g.Opt.URLRevisitEnabled {
if contains(g.visitedURLS, rawURL) {
g.visitedURLS.RLock()
if contains(g.visitedURLS.visitedURLS, rawURL) {
g.visitedURLS.RUnlock()
//log.Printf("URL already visited %s\n", rawURL)
return false
}
g.visitedURLS = append(g.visitedURLS, rawURL)
g.visitedURLS.RUnlock()
g.visitedURLS.Lock()
g.visitedURLS.visitedURLS = append(g.visitedURLS.visitedURLS, rawURL)
g.visitedURLS.Unlock()
}
return true

View File

@ -1,14 +1,12 @@
package geziyor_test
import (
"encoding/json"
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/exporter"
"math/rand"
"net/http"
"testing"
"time"
)
@ -39,7 +37,7 @@ func TestQuotes(t *testing.T) {
geziyor.NewGeziyor(geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: quotesParse,
Exporters: []geziyor.Exporter{exporter.JSONExporter{}},
Exporters: []geziyor.Exporter{&exporter.JSONExporter{}},
}).Start()
}
@ -54,8 +52,6 @@ func quotesParse(r *geziyor.Response) {
return s.Text()
}),
}
// Or, for CSV
//r.Exports <- []string{s.Find("span.text").Text(), s.Find("small.author").Text()}
})
// Next Page
@ -72,11 +68,11 @@ func TestLinks(t *testing.T) {
r.Exports <- []string{r.Request.URL.String()}
r.DocHTML.Find("a").Each(func(i int, s *goquery.Selection) {
if href, ok := s.Attr("href"); ok {
go r.Geziyor.Get(r.JoinURL(href), r.Geziyor.Opt.ParseFunc)
r.Geziyor.Get(r.JoinURL(href), r.Geziyor.Opt.ParseFunc)
}
})
},
Exporters: []geziyor.Exporter{exporter.CSVExporter{}},
Exporters: []geziyor.Exporter{&exporter.CSVExporter{}},
}).Start()
}
@ -92,53 +88,17 @@ func TestRandomDelay(t *testing.T) {
func TestStartRequestsFunc(t *testing.T) {
geziyor.NewGeziyor(geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) {
go g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc)
g.Get("http://quotes.toscrape.com/", g.Opt.ParseFunc)
},
ParseFunc: func(r *geziyor.Response) {
r.DocHTML.Find("a").Each(func(_ int, s *goquery.Selection) {
r.Exports <- s.AttrOr("href", "")
})
},
Exporters: []geziyor.Exporter{exporter.JSONExporter{}},
Exporters: []geziyor.Exporter{&exporter.JSONExporter{}},
}).Start()
}
func TestAlmaany(t *testing.T) {
alphabet := "ab"
geziyor.NewGeziyor(geziyor.Options{
AllowedDomains: []string{"www.almaany.com"},
StartRequestsFunc: func(g *geziyor.Geziyor) {
base := "http://www.almaany.com/suggest.php?term=%c%c&lang=turkish&t=d"
for _, c1 := range alphabet {
for _, c2 := range alphabet {
req, _ := http.NewRequest("GET", fmt.Sprintf(base, c1, c2), nil)
go g.Do(&geziyor.Request{Request: req, Meta: map[string]interface{}{"word": string(c1) + string(c2)}}, parseAlmaany)
}
}
},
ConcurrentRequests: 10,
Exporters: []geziyor.Exporter{exporter.CSVExporter{}},
}).Start()
}
func parseAlmaany(r *geziyor.Response) {
var words []string
_ = json.Unmarshal(r.Body, &words)
r.Exports <- words
if len(words) == 20 {
alphabet := "ab"
base := "http://www.almaany.com/suggest.php?term=%s%c&lang=turkish&t=d"
for _, c := range alphabet {
req, _ := http.NewRequest("GET", fmt.Sprintf(base, r.Meta["word"], c), nil)
go r.Geziyor.Do(&geziyor.Request{Request: req, Meta: map[string]interface{}{"word": r.Meta["word"].(string) + string(c)}}, parseAlmaany)
}
}
}
func TestGetRendered(t *testing.T) {
geziyor.NewGeziyor(geziyor.Options{
StartRequestsFunc: func(g *geziyor.Geziyor) {

2
go.mod
View File

@ -7,6 +7,6 @@ require (
github.com/chromedp/cdproto v0.0.0-20190429085128-1aa4f57ff2a9
github.com/chromedp/chromedp v0.3.0
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a
golang.org/x/net v0.0.0-20190522155817-f3200d17e092
golang.org/x/text v0.3.2 // indirect
)

6
go.sum
View File

@ -19,10 +19,16 @@ github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW
github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983 h1:wL11wNW7dhKIcRCHSm4sHKPWz0tt4mwBsVodG7+Xyqg=
github.com/mailru/easyjson v0.0.0-20190403194419-1ea4449da983/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190522155817-f3200d17e092 h1:4QSRKanuywn15aTZvI/mIDEgPQpswuFndXpOj3rKEco=
golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862 h1:rM0ROo5vb9AdYJi1110yjWGMej9ITfKddS89P3Fkhug=
golang.org/x/sys v0.0.0-20190509141414-a5b02f93d862/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

View File

@ -4,10 +4,9 @@ import (
"net/http"
)
// Request is a small wrapper around *http.Request that contains Metadata
// Request is a small wrapper around *http.Request that contains Metadata and Rendering option
type Request struct {
*http.Request
Meta map[string]interface{}
rendered bool
Meta map[string]interface{}
Rendered bool
}