diff --git a/.gitignore b/.gitignore index ec6c7f9..cb60bc9 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,6 @@ # IDE directories .idea/ + +# Output files +out.json diff --git a/export.go b/export.go index c36a48f..9cd50d6 100644 --- a/export.go +++ b/export.go @@ -4,14 +4,21 @@ import ( "encoding/json" "fmt" "os" + "sync" ) +var file *os.File +var once sync.Once + func Export(response *Response) { - file, err := os.Create("out.json") - if err != nil { - fmt.Fprintf(os.Stderr, "output file creation error: %v", err) - return - } + once.Do(func() { + newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + if err != nil { + fmt.Fprintf(os.Stderr, "output file creation error: %v", err) + return + } + file = newFile + }) for res := range response.Exports { //fmt.Println(res) diff --git a/gezer.go b/gezer.go index 39cb0d4..f5c72c8 100644 --- a/gezer.go +++ b/gezer.go @@ -4,7 +4,9 @@ import ( "bytes" "fmt" "github.com/PuerkitoBio/goquery" + httpcacheDumb "github.com/fpfeng/httpcache" "io/ioutil" + "log" "net/http" "net/url" "os" @@ -25,9 +27,11 @@ type Opt struct { } func NewGezer(opt Opt) *Gezer { + log.SetOutput(os.Stdout) return &Gezer{ client: &http.Client{ - Timeout: time.Second * 10, + Timeout: time.Second * 10, + Transport: httpcacheDumb.NewMemoryCacheTransport(), }, opt: opt, } @@ -51,7 +55,7 @@ func (g *Gezer) Get(rawURL string) { } // Log - fmt.Println("Fetching: ", rawURL) + log.Println("Fetching: ", rawURL) // Get request resp, err := g.client.Get(rawURL) @@ -99,17 +103,20 @@ func checkURL(rawURL string, allowedDomains []string) bool { } // Check for allowed domains - var allowed bool - for _, domain := range allowedDomains { - if domain == parsedURL.Host { - allowed = true - break - } - } - if !allowed && len(allowedDomains) != 0 { - fmt.Fprintf(os.Stderr, "domain not allowed: %s\n", parsedURL.Host) + if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) { + log.Printf("Domain not allowed: %s\n", parsedURL.Host) return false } return true } + +// Contains checks whether []string contains string +func Contains(s []string, e string) bool { + for _, a := range s { + if a == e { + return true + } + } + return false +} diff --git a/gezer_test.go b/gezer_test.go index 4edf82c..2c22753 100644 --- a/gezer_test.go +++ b/gezer_test.go @@ -8,9 +8,10 @@ import ( func TestGezer_StartURLs_Simple(t *testing.T) { gezer := NewGezer(Opt{ - StartURLs: []string{"https://api.ipify.org", "https://api.ipify.org"}, + StartURLs: []string{"http://api.ipify.org"}, ParseFunc: func(r *Response) { fmt.Println(string(r.Body)) + r.Gezer.Get("http://api.ipify.org") }, }) gezer.Start() @@ -42,6 +43,7 @@ func TestGezer_Concurrent_Requests(t *testing.T) { AllowedDomains: []string{"quotes.toscrape.com"}, StartURLs: []string{"http://quotes.toscrape.com/"}, ParseFunc: func(r *Response) { + //r.Exports <- map[string]interface{}{"href": r.Request.URL.String()} r.Doc.Find("a").Each(func(i int, s *goquery.Selection) { if href, ok := s.Attr("href"); ok { go r.Gezer.Get(r.JoinURL(href)) diff --git a/go.mod b/go.mod index 146b938..d264ce5 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,7 @@ module github.com/gogezer/gezer go 1.12 -require github.com/PuerkitoBio/goquery v1.5.0 +require ( + github.com/PuerkitoBio/goquery v1.5.0 + github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 +) diff --git a/go.sum b/go.sum index 0327c72..c8a6c16 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,8 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ= +github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=