From 6358b87472c99c7634b0c167595382656a02ac15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Thu, 6 Jun 2019 22:48:57 +0300 Subject: [PATCH] Use parse function to parse responses, instead of channels. Parse response as HTML Document using goquery. Added simple README. --- README.md | 2 ++ gezer.go | 83 +++++++++++++++++++++++++++++++-------------------- gezer_test.go | 24 +++++++++++---- go.mod | 2 ++ go.sum | 7 +++++ 5 files changed, 80 insertions(+), 38 deletions(-) create mode 100644 README.md create mode 100644 go.sum diff --git a/README.md b/README.md new file mode 100644 index 0000000..03ccdd6 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# Gezer +Scraper and crawler framework for Golang diff --git a/gezer.go b/gezer.go index 82ba66f..ad97d6b 100644 --- a/gezer.go +++ b/gezer.go @@ -1,59 +1,78 @@ package gezer import ( + "bytes" + "github.com/PuerkitoBio/goquery" "io/ioutil" "net/http" + "sync" "time" ) type Gezer struct { - client *http.Client - Results chan *Response + client *http.Client + wg sync.WaitGroup + Parse func(response *Response) + + startURLs []string + startedProcessing int + finishedProcessing int } type Response struct { *http.Response Body []byte + Doc *goquery.Document } -func NewGezer() *Gezer { +func NewGezer(parse func(response *Response), startURLs ...string) *Gezer { return &Gezer{ client: &http.Client{ Timeout: time.Second * 10, }, - Results: make(chan *Response, 1), + Parse: parse, + startURLs: startURLs, } } -func (g *Gezer) StartURLs(urls ...string) { - for _, url := range urls { +func (g *Gezer) Start() { + g.wg.Add(len(g.startURLs)) - // Get request - resp, err := g.client.Get(url) - if err != nil { - if resp != nil { - _ = resp.Body.Close() - } - continue - } - - // Read body - body, err := ioutil.ReadAll(resp.Body) - if err != nil { - continue - } - _ = resp.Body.Close() - - // Create response - response := Response{ - Response: resp, - Body: body, - } - - // Send response - g.Results <- &response + for _, url := range g.startURLs { + go g.getRequest(url) } - // Close chan, as we finished sending all the results - close(g.Results) + g.wg.Wait() +} + +func (g *Gezer) getRequest(url string) { + defer g.wg.Done() + + // Get request + resp, err := g.client.Get(url) + if resp != nil { + defer resp.Body.Close() + } + if err != nil { + return + } + + // Read body + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return + } + + // Create Document + doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(body)) + + // Create response + response := Response{ + Response: resp, + Body: body, + Doc: doc, + } + + // Parse response + g.Parse(&response) } diff --git a/gezer_test.go b/gezer_test.go index f578253..bcc6868 100644 --- a/gezer_test.go +++ b/gezer_test.go @@ -5,10 +5,22 @@ import ( "testing" ) -func TestGezer_StartURLs(t *testing.T) { - gezer := NewGezer() - gezer.StartURLs("https://api.ipify.org") - for result := range gezer.Results { - fmt.Println(string(result.Body)) - } +func TestGezer_StartURLs_Simple(t *testing.T) { + gezer := NewGezer(parse, "https://api.ipify.org", "https://api.ipify.org") + gezer.Start() } + +func parse(response *Response) { + fmt.Println(string(response.Body)) +} + +//func TestGezer_StartURLs_HTML(t *testing.T) { +// gezer := NewGezer(parse, "http://quotes.toscrape.com/") +// gezer.Start() +// for result := range gezer.Results { +// result.Doc.Find("div.quote").Each(func(_ int, s *goquery.Selection) { +// fmt.Println(s.Find("span.text").Text()) +// fmt.Println(s.Find("small.author").Text()) +// }) +// } +//} diff --git a/go.mod b/go.mod index d0b9a6e..146b938 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/gogezer/gezer go 1.12 + +require github.com/PuerkitoBio/goquery v1.5.0 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..0327c72 --- /dev/null +++ b/go.sum @@ -0,0 +1,7 @@ +github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk= +github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= +github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= +github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= +golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=