Use a parse function to handle responses instead of channels.

Parse response as HTML Document using goquery.
Added simple README.
This commit is contained in:
Musab Gültekin 2019-06-06 22:48:57 +03:00
parent 1c96048082
commit 6358b87472
5 changed files with 80 additions and 38 deletions

2
README.md Normal file
View File

@ -0,0 +1,2 @@
# Gezer
Scraper and crawler framework for Go

View File

@ -1,59 +1,78 @@
package gezer package gezer
import ( import (
"bytes"
"github.com/PuerkitoBio/goquery"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
"sync"
"time" "time"
) )
// Gezer is a scraper/crawler that fetches every start URL concurrently
// and hands each fetched response to the user-supplied Parse callback.
type Gezer struct {
	client *http.Client   // shared HTTP client; NewGezer sets a 10s timeout
	wg     sync.WaitGroup // tracks in-flight getRequest goroutines so Start can block
	Parse  func(response *Response) // user callback, invoked once per successfully fetched URL

	startURLs []string // URLs fetched when Start is called

	startedProcessing  int // NOTE(review): not referenced anywhere in the visible code — confirm intended use
	finishedProcessing int // NOTE(review): not referenced anywhere in the visible code — confirm intended use
}
// Response wraps the raw *http.Response together with its fully read
// body bytes and, when the payload parses as HTML, a goquery document.
type Response struct {
	*http.Response
	Body []byte            // entire response body, already read; the underlying stream is closed
	Doc  *goquery.Document // parsed HTML document; may be nil when parsing failed
}
func NewGezer() *Gezer { func NewGezer(parse func(response *Response), startURLs ...string) *Gezer {
return &Gezer{ return &Gezer{
client: &http.Client{ client: &http.Client{
Timeout: time.Second * 10, Timeout: time.Second * 10,
}, },
Results: make(chan *Response, 1), Parse: parse,
startURLs: startURLs,
} }
} }
func (g *Gezer) StartURLs(urls ...string) { func (g *Gezer) Start() {
for _, url := range urls { g.wg.Add(len(g.startURLs))
// Get request for _, url := range g.startURLs {
resp, err := g.client.Get(url) go g.getRequest(url)
if err != nil {
if resp != nil {
_ = resp.Body.Close()
}
continue
}
// Read body
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
continue
}
_ = resp.Body.Close()
// Create response
response := Response{
Response: resp,
Body: body,
}
// Send response
g.Results <- &response
} }
// Close chan, as we finished sending all the results g.wg.Wait()
close(g.Results) }
func (g *Gezer) getRequest(url string) {
defer g.wg.Done()
// Get request
resp, err := g.client.Get(url)
if resp != nil {
defer resp.Body.Close()
}
if err != nil {
return
}
// Read body
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return
}
// Create Document
doc, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
// Create response
response := Response{
Response: resp,
Body: body,
Doc: doc,
}
// Parse response
g.Parse(&response)
} }

View File

@ -5,10 +5,22 @@ import (
"testing" "testing"
) )
// TestGezer_StartURLs_Simple exercises the full fetch/parse pipeline
// against a live endpoint, fetching the same URL twice concurrently.
// NOTE(review): this depends on the network (api.ipify.org) and will be
// flaky offline — consider an httptest.Server instead.
func TestGezer_StartURLs_Simple(t *testing.T) {
	gezer := NewGezer(parse, "https://api.ipify.org", "https://api.ipify.org")
	gezer.Start()
}
// parse is the test callback: it simply echoes each response body to stdout.
func parse(response *Response) {
	fmt.Printf("%s\n", response.Body)
}
//func TestGezer_StartURLs_HTML(t *testing.T) {
// gezer := NewGezer(parse, "http://quotes.toscrape.com/")
// gezer.Start()
// for result := range gezer.Results {
// result.Doc.Find("div.quote").Each(func(_ int, s *goquery.Selection) {
// fmt.Println(s.Find("span.text").Text())
// fmt.Println(s.Find("small.author").Text())
// })
// }
//}

2
go.mod
View File

@ -1,3 +1,5 @@
module github.com/gogezer/gezer module github.com/gogezer/gezer
go 1.12 go 1.12
require github.com/PuerkitoBio/goquery v1.5.0

7
go.sum Normal file
View File

@ -0,0 +1,7 @@
github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=