Added in-memory HTTP response caching.
JSON file export now appends to out.json instead of truncating it.
This commit is contained in:
		
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -16,3 +16,6 @@ | |||||||
|  |  | ||||||
| # IDE directories | # IDE directories | ||||||
| .idea/ | .idea/ | ||||||
|  |  | ||||||
|  | # Output files | ||||||
|  | out.json | ||||||
|   | |||||||
| @@ -4,14 +4,21 @@ import ( | |||||||
| 	"encoding/json" | 	"encoding/json" | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"os" | 	"os" | ||||||
|  | 	"sync" | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  | var file *os.File | ||||||
|  | var once sync.Once | ||||||
|  |  | ||||||
| func Export(response *Response) { | func Export(response *Response) { | ||||||
| 	file, err := os.Create("out.json") | 	once.Do(func() { | ||||||
|  | 		newFile, err := os.OpenFile("out.json", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			fmt.Fprintf(os.Stderr, "output file creation error: %v", err) | 			fmt.Fprintf(os.Stderr, "output file creation error: %v", err) | ||||||
| 			return | 			return | ||||||
| 		} | 		} | ||||||
|  | 		file = newFile | ||||||
|  | 	}) | ||||||
|  |  | ||||||
| 	for res := range response.Exports { | 	for res := range response.Exports { | ||||||
| 		//fmt.Println(res) | 		//fmt.Println(res) | ||||||
|   | |||||||
							
								
								
									
										27
									
								
								gezer.go
									
									
									
									
									
								
							
							
						
						
									
										27
									
								
								gezer.go
									
									
									
									
									
								
							| @@ -4,7 +4,9 @@ import ( | |||||||
| 	"bytes" | 	"bytes" | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"github.com/PuerkitoBio/goquery" | 	"github.com/PuerkitoBio/goquery" | ||||||
|  | 	httpcacheDumb "github.com/fpfeng/httpcache" | ||||||
| 	"io/ioutil" | 	"io/ioutil" | ||||||
|  | 	"log" | ||||||
| 	"net/http" | 	"net/http" | ||||||
| 	"net/url" | 	"net/url" | ||||||
| 	"os" | 	"os" | ||||||
| @@ -25,9 +27,11 @@ type Opt struct { | |||||||
| } | } | ||||||
|  |  | ||||||
| func NewGezer(opt Opt) *Gezer { | func NewGezer(opt Opt) *Gezer { | ||||||
|  | 	log.SetOutput(os.Stdout) | ||||||
| 	return &Gezer{ | 	return &Gezer{ | ||||||
| 		client: &http.Client{ | 		client: &http.Client{ | ||||||
| 			Timeout:   time.Second * 10, | 			Timeout:   time.Second * 10, | ||||||
|  | 			Transport: httpcacheDumb.NewMemoryCacheTransport(), | ||||||
| 		}, | 		}, | ||||||
| 		opt: opt, | 		opt: opt, | ||||||
| 	} | 	} | ||||||
| @@ -51,7 +55,7 @@ func (g *Gezer) Get(rawURL string) { | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Log | 	// Log | ||||||
| 	fmt.Println("Fetching: ", rawURL) | 	log.Println("Fetching: ", rawURL) | ||||||
|  |  | ||||||
| 	// Get request | 	// Get request | ||||||
| 	resp, err := g.client.Get(rawURL) | 	resp, err := g.client.Get(rawURL) | ||||||
| @@ -99,17 +103,20 @@ func checkURL(rawURL string, allowedDomains []string) bool { | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Check for allowed domains | 	// Check for allowed domains | ||||||
| 	var allowed bool | 	if len(allowedDomains) != 0 && !Contains(allowedDomains, parsedURL.Host) { | ||||||
| 	for _, domain := range allowedDomains { | 		log.Printf("Domain not allowed: %s\n", parsedURL.Host) | ||||||
| 		if domain == parsedURL.Host { |  | ||||||
| 			allowed = true |  | ||||||
| 			break |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 	if !allowed && len(allowedDomains) != 0 { |  | ||||||
| 		fmt.Fprintf(os.Stderr, "domain not allowed: %s\n", parsedURL.Host) |  | ||||||
| 		return false | 		return false | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	return true | 	return true | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // Contains checks whether []string contains string | ||||||
|  | func Contains(s []string, e string) bool { | ||||||
|  | 	for _, a := range s { | ||||||
|  | 		if a == e { | ||||||
|  | 			return true | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	return false | ||||||
|  | } | ||||||
|   | |||||||
| @@ -8,9 +8,10 @@ import ( | |||||||
|  |  | ||||||
| func TestGezer_StartURLs_Simple(t *testing.T) { | func TestGezer_StartURLs_Simple(t *testing.T) { | ||||||
| 	gezer := NewGezer(Opt{ | 	gezer := NewGezer(Opt{ | ||||||
| 		StartURLs: []string{"https://api.ipify.org", "https://api.ipify.org"}, | 		StartURLs: []string{"http://api.ipify.org"}, | ||||||
| 		ParseFunc: func(r *Response) { | 		ParseFunc: func(r *Response) { | ||||||
| 			fmt.Println(string(r.Body)) | 			fmt.Println(string(r.Body)) | ||||||
|  | 			r.Gezer.Get("http://api.ipify.org") | ||||||
| 		}, | 		}, | ||||||
| 	}) | 	}) | ||||||
| 	gezer.Start() | 	gezer.Start() | ||||||
| @@ -42,6 +43,7 @@ func TestGezer_Concurrent_Requests(t *testing.T) { | |||||||
| 		AllowedDomains: []string{"quotes.toscrape.com"}, | 		AllowedDomains: []string{"quotes.toscrape.com"}, | ||||||
| 		StartURLs:      []string{"http://quotes.toscrape.com/"}, | 		StartURLs:      []string{"http://quotes.toscrape.com/"}, | ||||||
| 		ParseFunc: func(r *Response) { | 		ParseFunc: func(r *Response) { | ||||||
|  | 			//r.Exports <- map[string]interface{}{"href": r.Request.URL.String()} | ||||||
| 			r.Doc.Find("a").Each(func(i int, s *goquery.Selection) { | 			r.Doc.Find("a").Each(func(i int, s *goquery.Selection) { | ||||||
| 				if href, ok := s.Attr("href"); ok { | 				if href, ok := s.Attr("href"); ok { | ||||||
| 					go r.Gezer.Get(r.JoinURL(href)) | 					go r.Gezer.Get(r.JoinURL(href)) | ||||||
|   | |||||||
							
								
								
									
										5
									
								
								go.mod
									
									
									
									
									
								
							
							
						
						
									
										5
									
								
								go.mod
									
									
									
									
									
								
							| @@ -2,4 +2,7 @@ module github.com/gogezer/gezer | |||||||
|  |  | ||||||
| go 1.12 | go 1.12 | ||||||
|  |  | ||||||
| require github.com/PuerkitoBio/goquery v1.5.0 | require ( | ||||||
|  | 	github.com/PuerkitoBio/goquery v1.5.0 | ||||||
|  | 	github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 | ||||||
|  | ) | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								go.sum
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								go.sum
									
									
									
									
									
								
							| @@ -2,6 +2,8 @@ github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP | |||||||
| github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= | github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg= | ||||||
| github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= | github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= | ||||||
| github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= | github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= | ||||||
|  | github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ= | ||||||
|  | github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8= | ||||||
| golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= | ||||||
| golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= | golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= | ||||||
| golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= | golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user