Caching added.
JSON File export will append, not truncate.
This commit is contained in:
parent
9e61a96412
commit
ca197ff06a
3
.gitignore
vendored
3
.gitignore
vendored
@ -16,3 +16,6 @@
|
|||||||
|
|
||||||
# IDE directories
|
# IDE directories
|
||||||
.idea/
|
.idea/
|
||||||
|
|
||||||
|
# Output files
|
||||||
|
out.json
|
||||||
|
17
export.go
17
export.go
@ -4,14 +4,21 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var file *os.File
|
||||||
|
var once sync.Once
|
||||||
|
|
||||||
func Export(response *Response) {
|
func Export(response *Response) {
|
||||||
file, err := os.Create("out.json")
|
once.Do(func() {
|
||||||
if err != nil {
|
newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
|
||||||
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
|
if err != nil {
|
||||||
return
|
fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
|
||||||
}
|
return
|
||||||
|
}
|
||||||
|
file = newFile
|
||||||
|
})
|
||||||
|
|
||||||
for res := range response.Exports {
|
for res := range response.Exports {
|
||||||
//fmt.Println(res)
|
//fmt.Println(res)
|
||||||
|
29
gezer.go
29
gezer.go
@ -4,7 +4,9 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
|
httpcacheDumb "github.com/fpfeng/httpcache"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
@ -25,9 +27,11 @@ type Opt struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func NewGezer(opt Opt) *Gezer {
|
func NewGezer(opt Opt) *Gezer {
|
||||||
|
log.SetOutput(os.Stdout)
|
||||||
return &Gezer{
|
return &Gezer{
|
||||||
client: &http.Client{
|
client: &http.Client{
|
||||||
Timeout: time.Second * 10,
|
Timeout: time.Second * 10,
|
||||||
|
Transport: httpcacheDumb.NewMemoryCacheTransport(),
|
||||||
},
|
},
|
||||||
opt: opt,
|
opt: opt,
|
||||||
}
|
}
|
||||||
@ -51,7 +55,7 @@ func (g *Gezer) Get(rawURL string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Log
|
// Log
|
||||||
fmt.Println("Fetching: ", rawURL)
|
log.Println("Fetching: ", rawURL)
|
||||||
|
|
||||||
// Get request
|
// Get request
|
||||||
resp, err := g.client.Get(rawURL)
|
resp, err := g.client.Get(rawURL)
|
||||||
@ -99,17 +103,20 @@ func checkURL(rawURL string, allowedDomains []string) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check for allowed domains
|
// Check for allowed domains
|
||||||
var allowed bool
|
if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) {
|
||||||
for _, domain := range allowedDomains {
|
log.Printf("Domain not allowed: %s\n", parsedURL.Host)
|
||||||
if domain == parsedURL.Host {
|
|
||||||
allowed = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !allowed && len(allowedDomains) != 0 {
|
|
||||||
fmt.Fprintf(os.Stderr, "domain not allowed: %s\n", parsedURL.Host)
|
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Contains checks whether []string contains string
|
||||||
|
func Contains(s []string, e string) bool {
|
||||||
|
for _, a := range s {
|
||||||
|
if a == e {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
@ -8,9 +8,10 @@ import (
|
|||||||
|
|
||||||
func TestGezer_StartURLs_Simple(t *testing.T) {
|
func TestGezer_StartURLs_Simple(t *testing.T) {
|
||||||
gezer := NewGezer(Opt{
|
gezer := NewGezer(Opt{
|
||||||
StartURLs: []string{"https://api.ipify.org", "https://api.ipify.org"},
|
StartURLs: []string{"http://api.ipify.org"},
|
||||||
ParseFunc: func(r *Response) {
|
ParseFunc: func(r *Response) {
|
||||||
fmt.Println(string(r.Body))
|
fmt.Println(string(r.Body))
|
||||||
|
r.Gezer.Get("http://api.ipify.org")
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
gezer.Start()
|
gezer.Start()
|
||||||
@ -42,6 +43,7 @@ func TestGezer_Concurrent_Requests(t *testing.T) {
|
|||||||
AllowedDomains: []string{"quotes.toscrape.com"},
|
AllowedDomains: []string{"quotes.toscrape.com"},
|
||||||
StartURLs: []string{"http://quotes.toscrape.com/"},
|
StartURLs: []string{"http://quotes.toscrape.com/"},
|
||||||
ParseFunc: func(r *Response) {
|
ParseFunc: func(r *Response) {
|
||||||
|
//r.Exports <- map[string]interface{}{"href": r.Request.URL.String()}
|
||||||
r.Doc.Find("a").Each(func(i int, s *goquery.Selection) {
|
r.Doc.Find("a").Each(func(i int, s *goquery.Selection) {
|
||||||
if href, ok := s.Attr("href"); ok {
|
if href, ok := s.Attr("href"); ok {
|
||||||
go r.Gezer.Get(r.JoinURL(href))
|
go r.Gezer.Get(r.JoinURL(href))
|
||||||
|
5
go.mod
5
go.mod
@ -2,4 +2,7 @@ module github.com/gogezer/gezer
|
|||||||
|
|
||||||
go 1.12
|
go 1.12
|
||||||
|
|
||||||
require github.com/PuerkitoBio/goquery v1.5.0
|
require (
|
||||||
|
github.com/PuerkitoBio/goquery v1.5.0
|
||||||
|
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
||||||
|
)
|
||||||
|
2
go.sum
2
go.sum
@ -2,6 +2,8 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP
|
|||||||
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
|
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
|
||||||
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
|
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
|
||||||
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||||
|
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
|
||||||
|
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
|
||||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
|
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
|
||||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||||
|
Loading…
x
Reference in New Issue
Block a user