Sync and async request support added by using the go keyword.

Musab Gültekin 2019-06-07 21:31:18 +03:00
parent ee7d498f22
commit 9e61a96412
3 changed files with 69 additions and 43 deletions
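
The change makes both call styles available from user code: calling Get directly performs the request synchronously, blocking until the page is fetched and its ParseFunc has run, while prefixing the call with the go keyword runs the request in its own goroutine. A minimal sketch of both styles, using only the API visible in the diffs below (NewGezer, Opt, Response, Get, JoinURL); the wrapper function exampleSyncAndAsync is made up for illustration:

package gezer

import "github.com/PuerkitoBio/goquery"

// exampleSyncAndAsync is a hypothetical helper (not part of this commit)
// showing both request styles from inside a ParseFunc.
func exampleSyncAndAsync() {
	g := NewGezer(Opt{
		AllowedDomains: []string{"quotes.toscrape.com"},
		StartURLs:      []string{"http://quotes.toscrape.com/"},
		ParseFunc: func(r *Response) {
			// Sync: follow the "next" link and block until that request finishes.
			if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
				r.Gezer.Get(r.JoinURL(href))
			}
			// Async: the go keyword fires each request in its own goroutine.
			r.Doc.Find("a").Each(func(_ int, s *goquery.Selection) {
				if href, ok := s.Attr("href"); ok {
					go r.Gezer.Get(r.JoinURL(href))
				}
			})
		},
	})
	g.Start()
}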

export.go (new file):

@@ -0,0 +1,20 @@
+package gezer
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+)
+
+func Export(response *Response) {
+	file, err := os.Create("out.json")
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
+		return
+	}
+
+	for res := range response.Exports {
+		//fmt.Println(res)
+		_ = json.NewEncoder(file).Encode(res)
+	}
+}
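
Export ranges over response.Exports until that channel is closed, writing each received item to out.json as one JSON document per line; the crawler starts it per response with go Export(&response) (see the second changed file below). A hedged sketch of the producing side, assuming Exports is a channel that user code may send on and that it is closed once parsing is done (the commit only shows the receiving side); the selectors and map keys are illustrative, and exampleExportingParseFunc is not part of the commit:

package gezer

import "github.com/PuerkitoBio/goquery"

// exampleExportingParseFunc is a hypothetical ParseFunc that feeds the
// Exports channel drained by Export above.
func exampleExportingParseFunc(r *Response) {
	r.Doc.Find("div.quote").Each(func(_ int, s *goquery.Selection) {
		// Assumption: Response.Exports accepts sends from user code; the
		// commit only shows it being ranged over in Export.
		r.Exports <- map[string]string{
			"text":   s.Find("span.text").Text(),
			"author": s.Find("small.author").Text(),
		}
	})
}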

Second changed file:

@@ -2,7 +2,6 @@ package gezer
 
 import (
 	"bytes"
-	"encoding/json"
 	"fmt"
 	"github.com/PuerkitoBio/goquery"
 	"io/ioutil"
@@ -35,40 +34,19 @@ func NewGezer(opt Opt) *Gezer {
 }
 
 func (g *Gezer) Start() {
-	g.wg.Add(len(g.opt.StartURLs))
-
-	for _, url := range g.opt.StartURLs {
-		go g.getRequest(url)
+	for _, startURL := range g.opt.StartURLs {
+		go g.Get(startURL)
 	}
+	time.Sleep(time.Millisecond)
 
 	g.wg.Wait()
 }
 
-func (g *Gezer) Get(url string) {
+func (g *Gezer) Get(rawURL string) {
 	g.wg.Add(1)
-	go g.getRequest(url)
-}
-
-func (g *Gezer) getRequest(rawURL string) {
 	defer g.wg.Done()
 
-	// Parse URL
-	parsedURL, err := url.Parse(rawURL)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "url parsing error: %v", err)
-		return
-	}
-
-	// Check for allowed domains
-	var allowed bool
-	for _, domain := range g.opt.AllowedDomains {
-		if domain == parsedURL.Host {
-			allowed = true
-			break
-		}
-	}
-	if !allowed && len(g.opt.AllowedDomains) != 0 {
-		fmt.Fprintf(os.Stderr, "domain not allowed: %s", parsedURL.Host)
+	if !checkURL(rawURL, g.opt.AllowedDomains) {
 		return
 	}
 
@@ -87,7 +65,7 @@ func (g *Gezer) getRequest(rawURL string) {
 
 	// Read body
 	body, err := ioutil.ReadAll(resp.Body)
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "reading body error: %v", err)
+		fmt.Fprintf(os.Stderr, "reading body error: %v\n", err)
 		return
 	}
@@ -104,21 +82,34 @@ func (g *Gezer) getRequest(rawURL string) {
 	}
 
 	// Export Function
-	go func() {
-		file, err := os.Create("out.json")
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
-			return
-		}
-
-		for res := range response.Exports {
-			fmt.Println(res)
-			_ = json.NewEncoder(file).Encode(res)
-		}
-	}()
+	go Export(&response)
 
 	// ParseFunc response
 	g.opt.ParseFunc(&response)
+	time.Sleep(time.Millisecond)
+}
+
+func checkURL(rawURL string, allowedDomains []string) bool {
+	// Parse URL
+	parsedURL, err := url.Parse(rawURL)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "url parsing error: %v\n", err)
+		return false
+	}
+
+	// Check for allowed domains
+	var allowed bool
+	for _, domain := range allowedDomains {
+		if domain == parsedURL.Host {
+			allowed = true
+			break
+		}
+	}
+	if !allowed && len(allowedDomains) != 0 {
+		fmt.Fprintf(os.Stderr, "domain not allowed: %s\n", parsedURL.Host)
+		return false
+	}
+
+	return true
 }
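
The synchronization follows one pattern throughout the commit: each Get registers itself on the shared sync.WaitGroup (g.wg.Add(1) plus defer g.wg.Done()), and Start sleeps for a millisecond before g.wg.Wait() so that requests launched with the go keyword get a chance to call Add before the wait begins. The time.Sleep added after ParseFunc plays the same role for go Get calls made inside the parse callback, keeping the counter from reaching zero before they register. A stand-alone sketch of that pattern using only the standard library, not code from the repository:

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	var wg sync.WaitGroup

	task := func(id int) {
		wg.Add(1) // the task registers itself, as Get does
		defer wg.Done()
		fmt.Println("task", id)
	}

	for i := 0; i < 3; i++ {
		go task(i) // async, like go g.Get(url)
	}
	task(99) // sync, like a plain g.Get(url) call

	time.Sleep(time.Millisecond) // give the async tasks time to call Add, as Start does
	wg.Wait()
}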

Third changed file (tests):

@@ -30,9 +30,24 @@ func TestGezer_StartURLs_HTML(t *testing.T) {
 				}
 			})
 			if href, ok := r.Doc.Find("li.next > a").Attr("href"); ok {
-				r.Gezer.Get(r.JoinURL(href))
+				go r.Gezer.Get(r.JoinURL(href))
 			}
 		},
 	})
 	gezer.Start()
 }
+
+func TestGezer_Concurrent_Requests(t *testing.T) {
+	gezer := NewGezer(Opt{
+		AllowedDomains: []string{"quotes.toscrape.com"},
+		StartURLs:      []string{"http://quotes.toscrape.com/"},
+		ParseFunc: func(r *Response) {
+			r.Doc.Find("a").Each(func(i int, s *goquery.Selection) {
+				if href, ok := s.Attr("href"); ok {
+					go r.Gezer.Get(r.JoinURL(href))
+				}
+			})
+		},
+	})
+	gezer.Start()
+}