Caching added.

JSON file export now appends to the output file instead of truncating it.
This commit is contained in:
Musab Gültekin 2019-06-08 15:29:09 +03:00
parent 9e61a96412
commit ca197ff06a
6 changed files with 42 additions and 18 deletions

3
.gitignore vendored
View File

@ -16,3 +16,6 @@
# IDE directories # IDE directories
.idea/ .idea/
# Output files
out.json

View File

@ -4,14 +4,21 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"os" "os"
"sync"
) )
var file *os.File
var once sync.Once
func Export(response *Response) { func Export(response *Response) {
file, err := os.Create("out.json") once.Do(func() {
if err != nil { newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
fmt.Fprintf(os.Stderr, "output file creation error: %v", err) if err != nil {
return fmt.Fprintf(os.Stderr, "output file creation error: %v", err)
} return
}
file = newFile
})
for res := range response.Exports { for res := range response.Exports {
//fmt.Println(res) //fmt.Println(res)

View File

@ -4,7 +4,9 @@ import (
"bytes" "bytes"
"fmt" "fmt"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
httpcacheDumb "github.com/fpfeng/httpcache"
"io/ioutil" "io/ioutil"
"log"
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
@ -25,9 +27,11 @@ type Opt struct {
} }
func NewGezer(opt Opt) *Gezer { func NewGezer(opt Opt) *Gezer {
log.SetOutput(os.Stdout)
return &Gezer{ return &Gezer{
client: &http.Client{ client: &http.Client{
Timeout: time.Second * 10, Timeout: time.Second * 10,
Transport: httpcacheDumb.NewMemoryCacheTransport(),
}, },
opt: opt, opt: opt,
} }
@ -51,7 +55,7 @@ func (g *Gezer) Get(rawURL string) {
} }
// Log // Log
fmt.Println("Fetching: ", rawURL) log.Println("Fetching: ", rawURL)
// Get request // Get request
resp, err := g.client.Get(rawURL) resp, err := g.client.Get(rawURL)
@ -99,17 +103,20 @@ func checkURL(rawURL string, allowedDomains []string) bool {
} }
// Check for allowed domains // Check for allowed domains
var allowed bool if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) {
for _, domain := range allowedDomains { log.Printf("Domain not allowed: %s\n", parsedURL.Host)
if domain == parsedURL.Host {
allowed = true
break
}
}
if !allowed && len(allowedDomains) != 0 {
fmt.Fprintf(os.Stderr, "domain not allowed: %s\n", parsedURL.Host)
return false return false
} }
return true return true
} }
// Contains reports whether any element of s is exactly equal to e.
// A nil or empty slice never contains anything, so the result is false.
func Contains(s []string, e string) bool {
	found := false
	// Stop scanning as soon as a match has been seen.
	for i := 0; i < len(s) && !found; i++ {
		found = s[i] == e
	}
	return found
}

View File

@ -8,9 +8,10 @@ import (
func TestGezer_StartURLs_Simple(t *testing.T) { func TestGezer_StartURLs_Simple(t *testing.T) {
gezer := NewGezer(Opt{ gezer := NewGezer(Opt{
StartURLs: []string{"https://api.ipify.org", "https://api.ipify.org"}, StartURLs: []string{"http://api.ipify.org"},
ParseFunc: func(r *Response) { ParseFunc: func(r *Response) {
fmt.Println(string(r.Body)) fmt.Println(string(r.Body))
r.Gezer.Get("http://api.ipify.org")
}, },
}) })
gezer.Start() gezer.Start()
@ -42,6 +43,7 @@ func TestGezer_Concurrent_Requests(t *testing.T) {
AllowedDomains: []string{"quotes.toscrape.com"}, AllowedDomains: []string{"quotes.toscrape.com"},
StartURLs: []string{"http://quotes.toscrape.com/"}, StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: func(r *Response) { ParseFunc: func(r *Response) {
//r.Exports <- map[string]interface{}{"href": r.Request.URL.String()}
r.Doc.Find("a").Each(func(i int, s *goquery.Selection) { r.Doc.Find("a").Each(func(i int, s *goquery.Selection) {
if href, ok := s.Attr("href"); ok { if href, ok := s.Attr("href"); ok {
go r.Gezer.Get(r.JoinURL(href)) go r.Gezer.Get(r.JoinURL(href))

5
go.mod
View File

@ -2,4 +2,7 @@ module github.com/gogezer/gezer
go 1.12 go 1.12
require github.com/PuerkitoBio/goquery v1.5.0 require (
github.com/PuerkitoBio/goquery v1.5.0
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
)

2
go.sum
View File

@ -2,6 +2,8 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=