Caching added.
JSON File export will append, not truncate.
This commit is contained in:
parent
9e61a96412
commit
ca197ff06a
3
.gitignore
vendored
3
.gitignore
vendored
@ -16,3 +16,6 @@
|
||||
|
||||
# IDE directories
|
||||
.idea/
|
||||
|
||||
# Output files
|
||||
out.json
|
||||
|
@ -4,14 +4,21 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
var file *os.File
|
||||
var once sync.Once
|
||||
|
||||
func Export(response *Response) {
|
||||
file, err := os.Create("out.json")
|
||||
once.Do(func() {
|
||||
newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
|
||||
return
|
||||
}
|
||||
file = newFile
|
||||
})
|
||||
|
||||
for res := range response.Exports {
|
||||
//fmt.Println(res)
|
||||
|
27
gezer.go
27
gezer.go
@ -4,7 +4,9 @@ import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
httpcacheDumb "github.com/fpfeng/httpcache"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
@ -25,9 +27,11 @@ type Opt struct {
|
||||
}
|
||||
|
||||
func NewGezer(opt Opt) *Gezer {
|
||||
log.SetOutput(os.Stdout)
|
||||
return &Gezer{
|
||||
client: &http.Client{
|
||||
Timeout: time.Second * 10,
|
||||
Transport: httpcacheDumb.NewMemoryCacheTransport(),
|
||||
},
|
||||
opt: opt,
|
||||
}
|
||||
@ -51,7 +55,7 @@ func (g *Gezer) Get(rawURL string) {
|
||||
}
|
||||
|
||||
// Log
|
||||
fmt.Println("Fetching: ", rawURL)
|
||||
log.Println("Fetching: ", rawURL)
|
||||
|
||||
// Get request
|
||||
resp, err := g.client.Get(rawURL)
|
||||
@ -99,17 +103,20 @@ func checkURL(rawURL string, allowedDomains []string) bool {
|
||||
}
|
||||
|
||||
// Check for allowed domains
|
||||
var allowed bool
|
||||
for _, domain := range allowedDomains {
|
||||
if domain == parsedURL.Host {
|
||||
allowed = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !allowed && len(allowedDomains) != 0 {
|
||||
fmt.Fprintf(os.Stderr, "domain not allowed: %s\n", parsedURL.Host)
|
||||
if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) {
|
||||
log.Printf("Domain not allowed: %s\n", parsedURL.Host)
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// Contains reports whether the string e is present in the slice s.
//
// Elements are compared with ==, so the match is exact and case-sensitive.
// A nil or empty slice contains nothing, so the result is false for it.
func Contains(s []string, e string) bool {
	for _, a := range s {
		if a == e {
			return true
		}
	}
	return false
}
|
||||
|
@ -8,9 +8,10 @@ import (
|
||||
|
||||
func TestGezer_StartURLs_Simple(t *testing.T) {
|
||||
gezer := NewGezer(Opt{
|
||||
StartURLs: []string{"https://api.ipify.org", "https://api.ipify.org"},
|
||||
StartURLs: []string{"http://api.ipify.org"},
|
||||
ParseFunc: func(r *Response) {
|
||||
fmt.Println(string(r.Body))
|
||||
r.Gezer.Get("http://api.ipify.org")
|
||||
},
|
||||
})
|
||||
gezer.Start()
|
||||
@ -42,6 +43,7 @@ func TestGezer_Concurrent_Requests(t *testing.T) {
|
||||
AllowedDomains: []string{"quotes.toscrape.com"},
|
||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||
ParseFunc: func(r *Response) {
|
||||
//r.Exports <- map[string]interface{}{"href": r.Request.URL.String()}
|
||||
r.Doc.Find("a").Each(func(i int, s *goquery.Selection) {
|
||||
if href, ok := s.Attr("href"); ok {
|
||||
go r.Gezer.Get(r.JoinURL(href))
|
||||
|
5
go.mod
5
go.mod
@ -2,4 +2,7 @@ module github.com/gogezer/gezer
|
||||
|
||||
go 1.12
|
||||
|
||||
require github.com/PuerkitoBio/goquery v1.5.0
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.5.0
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
||||
)
|
||||
|
2
go.sum
2
go.sum
@ -2,6 +2,8 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP
|
||||
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
|
||||
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
|
||||
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
|
||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
|
Loading…
x
Reference in New Issue
Block a user