User-Agent change support added.

Musab Gültekin 2019-06-09 13:43:17 +03:00
parent 9263877339
commit 2263108838
3 changed files with 17 additions and 10 deletions

View File

@@ -6,7 +6,7 @@ Geziyor is a blazing fast web crawling and web scraping framework, used to crawl
 
 ## Features
 - 1.000+ Requests/Sec
-- Caching
+- Caching (Memory/Disk)
 - Automatic Data Exporting
 - Limit Concurrency Global/Per Domain
 - Automatic response decoding to UTF-8

View File

@@ -57,6 +57,9 @@ func NewGeziyor(opt Options) *Geziyor {
             hostSems map[string]chan struct{}
         }{hostSems: make(map[string]chan struct{})}
     }
 
+    if opt.UserAgent == "" {
+        geziyor.opt.UserAgent = "Geziyor 1.0"
+    }
     return geziyor
 }
@@ -96,15 +99,16 @@ func (g *Geziyor) Do(req *http.Request) {
     g.wg.Add(1)
     defer g.wg.Done()
 
-    if !checkURL(req.URL, g) {
+    if !g.checkURL(req.URL) {
         return
     }
 
     // Modify Request
     req.Header.Set("Accept-Charset", "utf-8")
+    req.Header.Set("User-Agent", g.opt.UserAgent)
 
     // Acquire Semaphore
-    g.acquire(req)
+    g.acquireSem(req)
 
     // Log
     log.Println("Fetching: ", req.URL.String())
@@ -116,7 +120,7 @@ func (g *Geziyor) Do(req *http.Request) {
     }
 
     if err != nil {
         log.Printf("Response error: %v\n", err)
-        g.release(req)
+        g.releaseSem(req)
         return
     }
@@ -124,7 +128,7 @@ func (g *Geziyor) Do(req *http.Request) {
     reader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
     if err != nil {
         log.Printf("Determine encoding error: %v\n", err)
-        g.release(req)
+        g.releaseSem(req)
         return
     }
@@ -132,12 +136,12 @@ func (g *Geziyor) Do(req *http.Request) {
     body, err := ioutil.ReadAll(reader)
     if err != nil {
         log.Printf("Reading Body error: %v\n", err)
-        g.release(req)
+        g.releaseSem(req)
         return
     }
 
     // Release Semaphore
-    g.release(req)
+    g.releaseSem(req)
 
     // Create Document
     doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
@@ -159,7 +163,7 @@ func (g *Geziyor) Do(req *http.Request) {
     time.Sleep(time.Millisecond)
 }
 
-func (g *Geziyor) acquire(req *http.Request) {
+func (g *Geziyor) acquireSem(req *http.Request) {
     if g.opt.ConcurrentRequests != 0 {
         g.semGlobal <- struct{}{}
     }
@@ -178,7 +182,7 @@ func (g *Geziyor) acquire(req *http.Request) {
     }
 }
 
-func (g *Geziyor) release(req *http.Request) {
+func (g *Geziyor) releaseSem(req *http.Request) {
     if g.opt.ConcurrentRequests != 0 {
         <-g.semGlobal
     }
@@ -187,7 +191,7 @@ func (g *Geziyor) release(req *http.Request) {
    }
 }
 
-func checkURL(parsedURL *url.URL, g *Geziyor) bool {
+func (g *Geziyor) checkURL(parsedURL *url.URL) bool {
     rawURL := parsedURL.String()
 
     // Check for allowed domains
     if len(g.opt.AllowedDomains) != 0 && !contains(g.opt.AllowedDomains, parsedURL.Host) {

View File

@@ -27,4 +27,7 @@ type Options struct {
     // Concurrent requests per domain limit
     ConcurrentRequestsPerDomain int
+
+    // User Agent
+    UserAgent string
 }
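
Not part of the commit, but a minimal caller-side sketch may help show how the new option is used. Options, NewGeziyor, UserAgent, AllowedDomains, and ConcurrentRequests all appear in this diff; the import path and the exact field types ([]string, int) are assumptions, and crawling entry points (Start, ParseFunc) are left out because they are not shown here.

package main

import "github.com/geziyor/geziyor" // import path assumed, not shown in this diff

func main() {
    // Leaving UserAgent empty falls back to the "Geziyor 1.0" default set in
    // NewGeziyor; setting it sends a custom User-Agent header with every
    // request made by (g *Geziyor) Do.
    g := geziyor.NewGeziyor(geziyor.Options{
        AllowedDomains:     []string{"example.com"}, // []string inferred from contains(g.opt.AllowedDomains, ...)
        ConcurrentRequests: 10,                      // int inferred from the semaphore checks
        UserAgent:          "my-crawler/0.1",        // new field added in this commit
    })
    _ = g // request scheduling is outside the scope of this diff
}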