Caching added.

JSON file export now appends to out.json instead of truncating it.
Musab Gültekin 2019-06-08 15:29:09 +03:00
parent 9e61a96412
commit ca197ff06a
6 changed files with 42 additions and 18 deletions

.gitignore

@@ -16,3 +16,6 @@
 # IDE directories
 .idea/
+# Output files
+out.json


@@ -4,14 +4,21 @@ import (
 	"encoding/json"
 	"fmt"
 	"os"
+	"sync"
 )
+var file *os.File
+var once sync.Once
 func Export(response *Response) {
-	file, err := os.Create("out.json")
+	once.Do(func() {
+		newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
 			return
 		}
+		file = newFile
+	})
 	for res := range response.Exports {
 		//fmt.Println(res)
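The hunk above is cut off before the export loop, so here is a minimal, self-contained sketch of the pattern it introduces: the output file is opened exactly once via sync.Once with the O_APPEND flag, so repeated calls (and repeated runs) add to out.json rather than truncating it. The JSON-lines write step and the exportLine/record names are illustrative assumptions, not code from this commit.

package main

import (
	"encoding/json"
	"fmt"
	"os"
	"sync"
)

var (
	outFile *os.File
	once    sync.Once
)

// exportLine appends one record to out.json as a single JSON line.
// O_APPEND keeps earlier contents intact; O_CREATE creates the file on first use.
func exportLine(record map[string]interface{}) {
	once.Do(func() {
		f, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
		if err != nil {
			fmt.Fprintf(os.Stderr, "output file creation error: %v\n", err)
			return
		}
		outFile = f
	})
	if outFile == nil {
		return // opening the file failed earlier; nothing to write to
	}
	if err := json.NewEncoder(outFile).Encode(record); err != nil {
		fmt.Fprintf(os.Stderr, "JSON encode error: %v\n", err)
	}
}

func main() {
	exportLine(map[string]interface{}{"href": "http://quotes.toscrape.com/"})
	exportLine(map[string]interface{}{"href": "http://quotes.toscrape.com/page/2/"})
}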


@@ -4,7 +4,9 @@ import (
 	"bytes"
 	"fmt"
 	"github.com/PuerkitoBio/goquery"
+	httpcacheDumb "github.com/fpfeng/httpcache"
 	"io/ioutil"
+	"log"
 	"net/http"
 	"net/url"
 	"os"
@@ -25,9 +27,11 @@ type Opt struct {
 }
 func NewGezer(opt Opt) *Gezer {
+	log.SetOutput(os.Stdout)
 	return &Gezer{
 		client: &http.Client{
 			Timeout: time.Second * 10,
+			Transport: httpcacheDumb.NewMemoryCacheTransport(),
 		},
 		opt: opt,
 	}
@@ -51,7 +55,7 @@ func (g *Gezer) Get(rawURL string) {
 	}
 	// Log
-	fmt.Println("Fetching: ", rawURL)
+	log.Println("Fetching: ", rawURL)
 	// Get request
 	resp, err := g.client.Get(rawURL)
@@ -99,17 +103,20 @@ func checkURL(rawURL string, allowedDomains []string) bool {
 	}
 	// Check for allowed domains
-	var allowed bool
-	for _, domain := range allowedDomains {
-		if domain == parsedURL.Host {
-			allowed = true
-			break
-		}
-	}
-	if !allowed && len(allowedDomains) != 0 {
-		fmt.Fprintf(os.Stderr, "domain not allowed: %s\n", parsedURL.Host)
+	if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) {
+		log.Printf("Domain not allowed: %s\n", parsedURL.Host)
 		return false
 	}
 	return true
 }
+// Contains checks whether []string contains string
+func Contains(s []string, e string) bool {
+	for _, a := range s {
+		if a == e {
+			return true
+		}
+	}
+	return false
+}
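On the caching side, the hunk above wires httpcacheDumb.NewMemoryCacheTransport() in as the client's Transport, so responses are cached in memory by an RFC 7234-style caching round tripper and a repeated GET to a cacheable URL can be served without hitting the network. The sketch below illustrates that setup in isolation; the X-From-Cache header check is an assumption carried over from the upstream gregjones/httpcache package this fork is based on.

package main

import (
	"fmt"
	"net/http"
	"time"

	httpcacheDumb "github.com/fpfeng/httpcache"
)

func main() {
	// Same client setup as NewGezer above: a plain http.Client whose
	// Transport is the in-memory caching round tripper.
	client := &http.Client{
		Timeout:   time.Second * 10,
		Transport: httpcacheDumb.NewMemoryCacheTransport(),
	}

	for i := 0; i < 2; i++ {
		resp, err := client.Get("http://quotes.toscrape.com/")
		if err != nil {
			fmt.Println("request error:", err)
			return
		}
		// Upstream httpcache marks responses served from the cache with an
		// X-From-Cache header; an empty value means it came from the network.
		fmt.Printf("request %d: %s (X-From-Cache=%q)\n", i+1, resp.Status, resp.Header.Get("X-From-Cache"))
		resp.Body.Close()
	}
}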


@@ -8,9 +8,10 @@ import (
 func TestGezer_StartURLs_Simple(t *testing.T) {
 	gezer := NewGezer(Opt{
-		StartURLs: []string{"https://api.ipify.org", "https://api.ipify.org"},
+		StartURLs: []string{"http://api.ipify.org"},
 		ParseFunc: func(r *Response) {
 			fmt.Println(string(r.Body))
+			r.Gezer.Get("http://api.ipify.org")
 		},
 	})
 	gezer.Start()
@@ -42,6 +43,7 @@ func TestGezer_Concurrent_Requests(t *testing.T) {
 		AllowedDomains: []string{"quotes.toscrape.com"},
 		StartURLs: []string{"http://quotes.toscrape.com/"},
 		ParseFunc: func(r *Response) {
+			//r.Exports <- map[string]interface{}{"href": r.Request.URL.String()}
 			r.Doc.Find("a").Each(func(i int, s *goquery.Selection) {
 				if href, ok := s.Attr("href"); ok {
 					go r.Gezer.Get(r.JoinURL(href))

go.mod

@@ -2,4 +2,7 @@ module github.com/gogezer/gezer
 go 1.12
-require github.com/PuerkitoBio/goquery v1.5.0
+require (
+	github.com/PuerkitoBio/goquery v1.5.0
+	github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
+)

go.sum

@@ -2,6 +2,8 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP
 github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
 github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
 github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
+github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
+github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
 golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=