Use a parse callback to handle responses instead of a results channel.

Parse each response body into an HTML document using goquery.
Add a simple README.
This commit is contained in:
Musab Gültekin 2019-06-06 22:48:57 +03:00
parent 1c96048082
commit 6358b87472
5 changed files with 80 additions and 38 deletions

2
README.md Normal file
View File

@ -0,0 +1,2 @@
# Gezer
A scraper and crawler framework for Go.

View File

@ -1,59 +1,78 @@
package gezer
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"io/ioutil"
"net/http"
"sync"
"time"
)
// Gezer holds the state of a scraper: the HTTP client used for fetching, the
// start URLs, and the callback and channel through which responses are handed
// to the caller.
type Gezer struct {
// client performs the GET requests.
client *http.Client
// Results receives a pointer to each Response that has been built.
Results chan *Response
// wg counts in-flight getRequest goroutines; Start blocks on it.
wg sync.WaitGroup
// Parse is called with each Response once its body has been read.
Parse func(response *Response)
// startURLs are the URLs requested by Start.
startURLs []string
// NOTE(review): startedProcessing and finishedProcessing are neither read nor
// written in the visible code; presumably progress counters, confirm.
startedProcessing int
finishedProcessing int
}
// Response bundles the raw *http.Response with the fully read body bytes and
// the goquery document built from those bytes.
type Response struct {
*http.Response
// Body holds the bytes returned by ioutil.ReadAll on the HTTP response body.
// Being declared directly on Response, it shadows the embedded
// http.Response.Body stream, so response.Body selects these bytes.
Body []byte
// Doc is the document returned by goquery.NewDocumentFromReader over Body.
// NOTE(review): the visible caller discards that call's error, so Doc may be
// nil when parsing fails; confirm before dereferencing.
Doc *goquery.Document
}
// NewGezer returns a *Gezer whose HTTP client times out after 10 seconds,
// with a Results channel of capacity 1, and with Parse and startURLs taken
// from the arguments.
// NOTE(review): two signatures appear back to back below, which reads like a
// diff rendered without +/- markers; confirm which one is current.
func NewGezer() *Gezer {
func NewGezer(parse func(response *Response), startURLs ...string) *Gezer {
return &Gezer{
client: &http.Client{
Timeout: time.Second * 10,
},
Results: make(chan *Response, 1),
Parse: parse,
startURLs: startURLs,
}
}
// NOTE(review): two method headers and two loop headers are interleaved in
// this span, which reads like a diff rendered without +/- markers; confirm
// which lines are current.
func (g *Gezer) StartURLs(urls ...string) {
for _, url := range urls {
// Start adds one WaitGroup count per start URL, launches a getRequest
// goroutine for each URL, and blocks until every goroutine has called Done.
func (g *Gezer) Start() {
g.wg.Add(len(g.startURLs))
for _, url := range g.startURLs {
go g.getRequest(url)
}
g.wg.Wait()
}
// getRequest fetches url with g.client, reads the entire body, builds a
// goquery document from the bytes, and passes the resulting Response to
// g.Parse. It returns early, without calling Parse, if the GET or the body
// read fails. g.wg.Done is deferred so the WaitGroup count is released on
// every return path.
// NOTE(review): this span interleaves statements that cannot all belong to one
// function (`continue` beside `return`, a channel send and close beside the
// Parse call); it reads like a diff rendered without +/- markers. Confirm
// against the real file which lines are current.
func (g *Gezer) getRequest(url string) {
defer g.wg.Done()
// Get request
resp, err := g.client.Get(url)
if err != nil {
if resp != nil {
_ = resp.Body.Close()
// Deferred so the body stays open for the ReadAll below and is closed when
// the function returns.
defer resp.Body.Close()
}
continue
if err != nil {
return
}
// Read body
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
continue
return
}
_ = resp.Body.Close()
// Create Document
// NOTE(review): the error from NewDocumentFromReader is discarded, so doc may
// be nil if parsing fails; confirm that Parse callbacks tolerate that.
doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
// Create response
response := Response{
Response: resp,
Body: body,
Doc: doc,
}
// Send response
g.Results <- &response
}
// Close chan, as we finished sending all the results
close(g.Results)
// Parse response
g.Parse(&response)
}

View File

@ -5,10 +5,22 @@ import (
"testing"
)
// NOTE(review): two test functions are interleaved in this span and share a
// single closing brace, which reads like a diff rendered without +/- markers;
// confirm which lines are current.
// TestGezer_StartURLs passes one URL to StartURLs and prints the body of every
// response received on gezer.Results.
func TestGezer_StartURLs(t *testing.T) {
gezer := NewGezer()
gezer.StartURLs("https://api.ipify.org")
for result := range gezer.Results {
fmt.Println(string(result.Body))
}
// TestGezer_StartURLs_Simple builds a Gezer with the parse callback and the
// same URL twice, then runs Start.
// NOTE(review): the test passes a real external URL and contains no
// assertions, so it depends on network access and only exercises the code path.
func TestGezer_StartURLs_Simple(t *testing.T) {
gezer := NewGezer(parse, "https://api.ipify.org", "https://api.ipify.org")
gezer.Start()
}
// parse is the callback used by the tests: it writes the response body, as
// text followed by a newline, to standard output.
func parse(response *Response) {
text := string(response.Body)
fmt.Println(text)
}
//func TestGezer_StartURLs_HTML(t *testing.T) {
// gezer := NewGezer(parse, "http://quotes.toscrape.com/")
// gezer.Start()
// for result := range gezer.Results {
// result.Doc.Find("div.quote").Each(func(_ int, s *goquery.Selection) {
// fmt.Println(s.Find("span.text").Text())
// fmt.Println(s.Find("small.author").Text())
// })
// }
//}

2
go.mod
View File

@ -1,3 +1,5 @@
module github.com/gogezer/gezer
go 1.12
require github.com/PuerkitoBio/goquery v1.5.0

7
go.sum Normal file
View File

@ -0,0 +1,7 @@
github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=