From 9e61a96412db2475b79ab6a927cf409946eea4f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Musab=20G=C3=BCltekin?=
Date: Fri, 7 Jun 2019 21:31:18 +0300
Subject: [PATCH] Sync and Async requests support added by using go keyword.

---
 export.go     | 20 ++++++++++++++
 gezer.go      | 75 +++++++++++++++++++++++----------------------
 gezer_test.go | 17 +++++++++++-
 3 files changed, 69 insertions(+), 43 deletions(-)
 create mode 100644 export.go

diff --git a/export.go b/export.go
new file mode 100644
index 0000000..c36a48f
--- /dev/null
+++ b/export.go
@@ -0,0 +1,20 @@
+package gezer
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+)
+
+func Export(response *Response) {
+	file, err := os.Create("out.json")
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
+		return
+	}
+
+	for res := range response.Exports {
+		//fmt.Println(res)
+		_ = json.NewEncoder(file).Encode(res)
+	}
+}
diff --git a/gezer.go b/gezer.go
index 7be15bf..39cb0d4 100644
--- a/gezer.go
+++ b/gezer.go
@@ -2,7 +2,6 @@ package gezer
 
 import (
 	"bytes"
-	"encoding/json"
 	"fmt"
 	"github.com/PuerkitoBio/goquery"
 	"io/ioutil"
@@ -35,40 +34,19 @@ func NewGezer(opt Opt) *Gezer {
 }
 
 func (g *Gezer) Start() {
-	g.wg.Add(len(g.opt.StartURLs))
-
-	for _, url := range g.opt.StartURLs {
-		go g.getRequest(url)
+	for _, startURL := range g.opt.StartURLs {
+		go g.Get(startURL)
 	}
+	time.Sleep(time.Millisecond)
 
 	g.wg.Wait()
 }
 
-func (g *Gezer) Get(url string) {
+func (g *Gezer) Get(rawURL string) {
 	g.wg.Add(1)
-	go g.getRequest(url)
-}
-
-func (g *Gezer) getRequest(rawURL string) {
 	defer g.wg.Done()
 
-	// Parse URL
-	parsedURL, err := url.Parse(rawURL)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "url parsing error: %v", err)
-		return
-	}
-
-	// Check for allowed domains
-	var allowed bool
-	for _, domain := range g.opt.AllowedDomains {
-		if domain == parsedURL.Host {
-			allowed = true
-			break
-		}
-	}
-	if !allowed && len(g.opt.AllowedDomains) != 0 {
-		fmt.Fprintf(os.Stderr, "domain not allowed: %s", parsedURL.Host)
+	if !checkURL(rawURL, g.opt.AllowedDomains) {
 		return
 	}
 
@@ -87,7 +65,7 @@
 	// Read body
 	body, err := ioutil.ReadAll(resp.Body)
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "reading body error: %v", err)
+		fmt.Fprintf(os.Stderr, "reading body error: %v\n", err)
 		return
 	}
 
@@ -104,21 +82,34 @@
 	}
 
 	// Export Function
-	go func() {
-		file, err := os.Create("out.json")
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
-			return
-		}
-
-		for res := range response.Exports {
-			fmt.Println(res)
-			_ = json.NewEncoder(file).Encode(res)
-		}
-
-	}()
+	go Export(&response)
 
 	// ParseFunc response
 	g.opt.ParseFunc(&response)
-
+	time.Sleep(time.Millisecond)
+}
+
+func checkURL(rawURL string, allowedDomains []string) bool {
+
+	// Parse URL
+	parsedURL, err := url.Parse(rawURL)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "url parsing error: %v\n", err)
+		return false
+	}
+
+	// Check for allowed domains
+	var allowed bool
+	for _, domain := range allowedDomains {
+		if domain == parsedURL.Host {
+			allowed = true
+			break
+		}
+	}
+	if !allowed && len(allowedDomains) != 0 {
+		fmt.Fprintf(os.Stderr, "domain not allowed: %s\n", parsedURL.Host)
+		return false
+	}
+
+	return true
 }
diff --git a/gezer_test.go b/gezer_test.go
index 2a06ef6..4edf82c 100644
--- a/gezer_test.go
+++ b/gezer_test.go
@@ -30,9 +30,24 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
 				}
 			})
 			if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
-				r.Gezer.Get(r.JoinURL(href))
+				go r.Gezer.Get(r.JoinURL(href))
 			}
 		},
 	})
 	gezer.Start()
 }
+
+func TestGezer_Concurrent_Requests(t *testing.T) {
+	gezer := NewGezer(Opt{
+		AllowedDomains: []string{"quotes.toscrape.com"},
+		StartURLs:      []string{"http://quotes.toscrape.com/"},
+		ParseFunc: func(r *Response) {
+			r.Doc.Find("a").Each(func(i int, s *goquery.Selection) {
+				if href, ok := s.Attr("href"); ok {
+					go r.Gezer.Get(r.JoinURL(href))
+				}
+			})
+		},
+	})
+	gezer.Start()
+}
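
Usage note: a minimal sketch of the sync/async pattern this patch enables, pieced together from the Opt, Response, Get, Start, and JoinURL usage in gezer_test.go above. The ExampleGezer_Get function name is made up for illustration; the selectors and the quotes.toscrape.com URLs are the same sample values the tests use.

package gezer

import "github.com/PuerkitoBio/goquery"

// ExampleGezer_Get shows both request styles from inside ParseFunc:
// a plain Get call blocks until the linked page has been fetched and parsed,
// while "go Get" runs the request in its own goroutine, registered on the
// crawler's WaitGroup by the wg.Add(1)/wg.Done pair inside Get.
func ExampleGezer_Get() {
	gezer := NewGezer(Opt{
		AllowedDomains: []string{"quotes.toscrape.com"},
		StartURLs:      []string{"http://quotes.toscrape.com/"},
		ParseFunc: func(r *Response) {
			// Synchronous follow-up: crawl the "next" page before returning.
			if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
				r.Gezer.Get(r.JoinURL(href))
			}
			// Asynchronous follow-ups: fetch every link concurrently.
			r.Doc.Find("a").Each(func(_ int, s *goquery.Selection) {
				if href, ok := s.Attr("href"); ok {
					go r.Gezer.Get(r.JoinURL(href))
				}
			})
		},
	})
	gezer.Start() // requests StartURLs and waits for registered requests to finish
}

Start itself launches each StartURL with the go keyword, so top-level requests are always concurrent; the sync/async choice only applies to follow-up requests issued from ParseFunc.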