From 90d2be22108c93cd75d3aaee6f6309ec29be9f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Musab=20G=C3=BCltekin?= Date: Sun, 7 Jul 2019 12:18:40 +0300 Subject: [PATCH] Caching policies added. We used httpcache library to implement this. As it was not possible to support different policies, I mostly copied and modified it. --- README.md | 2 +- cache/cache.go | 625 ++++++++++ cache/cache_test.go | 1476 +++++++++++++++++++++++ cache/diskcache/diskcache.go | 61 + cache/diskcache/diskcache_test.go | 18 + cache/leveldbcache/leveldbcache.go | 51 + cache/leveldbcache/leveldbcache_test.go | 24 + cache/memorycache/memorycache.go | 39 + geziyor.go | 10 +- geziyor_test.go | 6 +- go.mod | 4 +- go.sum | 28 +- options.go | 21 +- 13 files changed, 2349 insertions(+), 16 deletions(-) create mode 100644 cache/cache.go create mode 100644 cache/cache_test.go create mode 100644 cache/diskcache/diskcache.go create mode 100644 cache/diskcache/diskcache_test.go create mode 100644 cache/leveldbcache/leveldbcache.go create mode 100644 cache/leveldbcache/leveldbcache_test.go create mode 100644 cache/memorycache/memorycache.go diff --git a/README.md b/README.md index 026a894..b64e715 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use ## Features - 5.000+ Requests/Sec - JS Rendering -- Caching (Memory/Disk) +- Caching (Memory/Disk/LevelDB) - Automatic Data Exporting (JSON, CSV, or custom) - Metrics (Prometheus, Expvar, or custom) - Limit Concurrency (Global/Per Domain) diff --git a/cache/cache.go b/cache/cache.go new file mode 100644 index 0000000..6e77513 --- /dev/null +++ b/cache/cache.go @@ -0,0 +1,625 @@ +// Package cache provides a http.RoundTripper implementation that works as a +// mostly RFC-compliant cache for http responses. +// +// It is only suitable for use as a 'private' cache (i.e. for a web-browser or an API-client +// and not for a shared proxy). +// +// Mostly borrowed from https://github.com/gregjones/httpcache. Customized for different policies. +package cache + +import ( + "bufio" + "bytes" + "errors" + "github.com/geziyor/geziyor/cache/memorycache" + "io" + "io/ioutil" + "net/http" + "net/http/httputil" + "strings" + "testing" + "time" +) + +type Policy int + +const ( + // This policy has no awareness of any HTTP Cache-Control directives. + // Every request and its corresponding response are cached. + // When the same request is seen again, the response is returned without transferring anything from the Internet. + + // The Dummy policy is useful for testing spiders faster (without having to wait for downloads every time) + // and for trying your spider offline, when an Internet connection is not available. + // The goal is to be able to “replay” a spider run exactly as it ran before. + Dummy Policy = iota + + // This policy provides a RFC2616 compliant HTTP cache, i.e. with HTTP Cache-Control awareness, + // aimed at production and used in continuous runs to avoid downloading unmodified data + // (to save bandwidth and speed up crawls). + RFC2616 +) + +const ( + stale = iota + fresh + transparent + // XFromCache is the header added to responses that are returned from the cache + XFromCache = "X-From-Cache" +) + +// A Cache interface is used by the Transport to store and retrieve responses. 
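// A minimal sketch of a map-backed implementation (essentially what the
// cache/memorycache package in this patch does, minus its locking):
//
//	type mapCache map[string][]byte
//
//	func (m mapCache) Get(key string) ([]byte, bool) { v, ok := m[key]; return v, ok }
//	func (m mapCache) Set(key string, v []byte)      { m[key] = v }
//	func (m mapCache) Delete(key string)             { delete(m, key) }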
+type Cache interface { + // Get returns the []byte representation of a cached response and a bool + // set to true if the value isn't empty + Get(key string) (responseBytes []byte, ok bool) + // Set stores the []byte representation of a response against a key + Set(key string, responseBytes []byte) + // Delete removes the value associated with the key + Delete(key string) +} + +// cacheKey returns the cache key for req. +func cacheKey(req *http.Request) string { + if req.Method == http.MethodGet { + return req.URL.String() + } else { + return req.Method + " " + req.URL.String() + } +} + +// CachedResponse returns the cached http.Response for req if present, and nil +// otherwise. +func CachedResponse(c Cache, req *http.Request) (resp *http.Response, err error) { + cachedVal, ok := c.Get(cacheKey(req)) + if !ok { + return + } + + b := bytes.NewBuffer(cachedVal) + return http.ReadResponse(bufio.NewReader(b), req) +} + +// Transport is an implementation of http.RoundTripper that will return values from a cache +// where possible (avoiding a network request) and will additionally add validators (etag/if-modified-since) +// to repeated requests allowing servers to return 304 / Not Modified +type Transport struct { + Policy Policy + // The RoundTripper interface actually used to make requests + // If nil, http.DefaultTransport is used + Transport http.RoundTripper + Cache Cache + // If true, responses returned from the cache will be given an extra header, X-From-Cache + MarkCachedResponses bool +} + +// NewTransport returns a new Transport with the +// provided Cache implementation and MarkCachedResponses set to true +func NewTransport(c Cache) *Transport { + return &Transport{ + Policy: RFC2616, + Cache: c, + MarkCachedResponses: true, + } +} + +// Client returns an *http.Client that caches responses. +func (t *Transport) Client() *http.Client { + return &http.Client{Transport: t} +} + +// varyMatches will return false unless all of the cached values for the headers listed in Vary +// match the new request +func varyMatches(cachedResp *http.Response, req *http.Request) bool { + for _, header := range headerAllCommaSepValues(cachedResp.Header, "vary") { + header = http.CanonicalHeaderKey(header) + if header != "" && req.Header.Get(header) != cachedResp.Header.Get("X-Varied-"+header) { + return false + } + } + return true +} + +// RoundTrip is a wrapper for caching requests. +// If there is a fresh Response already in cache, then it will be returned without connecting to +// the server. +// +func (t *Transport) RoundTrip(req *http.Request) (resp *http.Response, err error) { + if t.Policy == Dummy { + return t.RoundTripDummy(req) + } + return t.RoundTripRFC2616(req) +} + +// RoundTripDummy has no awareness of any HTTP Cache-Control directives. +// Every request and its corresponding response are cached. +// When the same request is seen again, the response is returned without transferring anything from the Internet. 
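// A minimal usage sketch (any Cache backend from this patch can stand in for
// the in-memory one; the URL is illustrative):
//
//	t := NewTransport(memorycache.New())
//	t.Policy = Dummy
//	resp, err := t.Client().Get("http://example.com/")
//	// a second identical GET is now answered from the cache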
+func (t *Transport) RoundTripDummy(req *http.Request) (resp *http.Response, err error) { + cacheKey := cacheKey(req) + cacheable := (req.Method == "GET" || req.Method == "HEAD") && req.Header.Get("range") == "" + var cachedResp *http.Response + if cacheable { + cachedResp, err = CachedResponse(t.Cache, req) + } else { + // Need to invalidate an existing value + t.Cache.Delete(cacheKey) + } + + transport := t.Transport + if transport == nil { + transport = http.DefaultTransport + } + + if cacheable && cachedResp != nil && err == nil { + if t.MarkCachedResponses { + cachedResp.Header.Set(XFromCache, "1") + } + return cachedResp, nil + } else { + resp, err = transport.RoundTrip(req) + if err != nil { + return nil, err + } + } + + if cacheable { + respBytes, err := httputil.DumpResponse(resp, true) + if err == nil { + t.Cache.Set(cacheKey, respBytes) + } + } else { + t.Cache.Delete(cacheKey) + } + return resp, nil +} + +// RoundTripRFC2616 provides a RFC2616 compliant HTTP cache, i.e. with HTTP Cache-Control awareness, +// aimed at production and used in continuous runs to avoid downloading unmodified data +// (to save bandwidth and speed up crawls). +// +// If there is a stale Response, then any validators it contains will be set on the new request +// to give the server a chance to respond with NotModified. If this happens, then the cached Response +// will be returned. +func (t *Transport) RoundTripRFC2616(req *http.Request) (resp *http.Response, err error) { + cacheKey := cacheKey(req) + cacheable := (req.Method == "GET" || req.Method == "HEAD") && req.Header.Get("range") == "" + var cachedResp *http.Response + if cacheable { + cachedResp, err = CachedResponse(t.Cache, req) + } else { + // Need to invalidate an existing value + t.Cache.Delete(cacheKey) + } + + transport := t.Transport + if transport == nil { + transport = http.DefaultTransport + } + + if cacheable && cachedResp != nil && err == nil { + if t.MarkCachedResponses { + cachedResp.Header.Set(XFromCache, "1") + } + + if varyMatches(cachedResp, req) { + // Can only use cached value if the new request doesn't Vary significantly + freshness := getFreshness(cachedResp.Header, req.Header) + if freshness == fresh { + return cachedResp, nil + } + + if freshness == stale { + var req2 *http.Request + // Add validators if caller hasn't already done so + etag := cachedResp.Header.Get("etag") + if etag != "" && req.Header.Get("etag") == "" { + req2 = cloneRequest(req) + req2.Header.Set("if-none-match", etag) + } + lastModified := cachedResp.Header.Get("last-modified") + if lastModified != "" && req.Header.Get("last-modified") == "" { + if req2 == nil { + req2 = cloneRequest(req) + } + req2.Header.Set("if-modified-since", lastModified) + } + if req2 != nil { + req = req2 + } + } + } + + resp, err = transport.RoundTrip(req) + if err == nil && req.Method == "GET" && resp.StatusCode == http.StatusNotModified { + // Replace the 304 response with the one from cache, but update with some new headers + endToEndHeaders := getEndToEndHeaders(resp.Header) + for _, header := range endToEndHeaders { + cachedResp.Header[header] = resp.Header[header] + } + resp.Body.Close() + resp = cachedResp + } else if (err != nil || resp.StatusCode >= 500) && + req.Method == "GET" && canStaleOnError(cachedResp.Header, req.Header) { + // In case of transport failure and stale-if-error activated, returns cached content + // when available + if resp != nil && resp.Body != nil { + resp.Body.Close() + } + return cachedResp, nil + } else { + if err != nil || resp.StatusCode 
!= http.StatusOK { + t.Cache.Delete(cacheKey) + } + if err != nil { + return nil, err + } + } + } else { + reqCacheControl := parseCacheControl(req.Header) + if _, ok := reqCacheControl["only-if-cached"]; ok { + resp = newGatewayTimeoutResponse(req) + } else { + resp, err = transport.RoundTrip(req) + if err != nil { + return nil, err + } + } + } + + if cacheable && canStore(parseCacheControl(req.Header), parseCacheControl(resp.Header)) { + for _, varyKey := range headerAllCommaSepValues(resp.Header, "vary") { + varyKey = http.CanonicalHeaderKey(varyKey) + fakeHeader := "X-Varied-" + varyKey + reqValue := req.Header.Get(varyKey) + if reqValue != "" { + resp.Header.Set(fakeHeader, reqValue) + } + } + switch req.Method { + case "GET": + // Delay caching until EOF is reached. + resp.Body = &cachingReadCloser{ + R: resp.Body, + OnEOF: func(r io.Reader) { + resp := *resp + resp.Body = ioutil.NopCloser(r) + respBytes, err := httputil.DumpResponse(&resp, true) + if err == nil { + t.Cache.Set(cacheKey, respBytes) + } + }, + } + default: + respBytes, err := httputil.DumpResponse(resp, true) + if err == nil { + t.Cache.Set(cacheKey, respBytes) + } + } + } else { + t.Cache.Delete(cacheKey) + } + return resp, nil +} + +// ErrNoDateHeader indicates that the HTTP headers contained no Date header. +var ErrNoDateHeader = errors.New("no Date header") + +// Date parses and returns the value of the Date header. +func Date(respHeaders http.Header) (date time.Time, err error) { + dateHeader := respHeaders.Get("date") + if dateHeader == "" { + err = ErrNoDateHeader + return + } + + return time.Parse(time.RFC1123, dateHeader) +} + +type realClock struct{} + +func (c *realClock) since(d time.Time) time.Duration { + return time.Since(d) +} + +type timer interface { + since(d time.Time) time.Duration +} + +var clock timer = &realClock{} + +// getFreshness will return one of fresh/stale/transparent based on the cache-control +// values of the request and the response +// +// fresh indicates the response can be returned +// stale indicates that the response needs validating before it is returned +// transparent indicates the response should not be used to fulfil the request +// +// Because this is only a private cache, 'public' and 'private' in cache-control aren't +// signficant. Similarly, smax-age isn't used. +func getFreshness(respHeaders, reqHeaders http.Header) (freshness int) { + respCacheControl := parseCacheControl(respHeaders) + reqCacheControl := parseCacheControl(reqHeaders) + if _, ok := reqCacheControl["no-cache"]; ok { + return transparent + } + if _, ok := respCacheControl["no-cache"]; ok { + return stale + } + if _, ok := reqCacheControl["only-if-cached"]; ok { + return fresh + } + + date, err := Date(respHeaders) + if err != nil { + return stale + } + currentAge := clock.since(date) + + var lifetime time.Duration + var zeroDuration time.Duration + + // If a response includes both an Expires header and a max-age directive, + // the max-age directive overrides the Expires header, even if the Expires header is more restrictive. 
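	// For illustration (assumed values): with
	//   Date:          Mon, 02 Jan 2006 15:04:05 GMT
	//   Expires:       Mon, 02 Jan 2006 15:05:05 GMT  (60s after Date)
	//   Cache-Control: max-age=300
	// the lifetime computed below is 300s taken from max-age, not the 60s
	// implied by Expires, so the response stays fresh until its age exceeds 300s.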
+ if maxAge, ok := respCacheControl["max-age"]; ok { + lifetime, err = time.ParseDuration(maxAge + "s") + if err != nil { + lifetime = zeroDuration + } + } else { + expiresHeader := respHeaders.Get("Expires") + if expiresHeader != "" { + expires, err := time.Parse(time.RFC1123, expiresHeader) + if err != nil { + lifetime = zeroDuration + } else { + lifetime = expires.Sub(date) + } + } + } + + if maxAge, ok := reqCacheControl["max-age"]; ok { + // the client is willing to accept a response whose age is no greater than the specified time in seconds + lifetime, err = time.ParseDuration(maxAge + "s") + if err != nil { + lifetime = zeroDuration + } + } + if minfresh, ok := reqCacheControl["min-fresh"]; ok { + // the client wants a response that will still be fresh for at least the specified number of seconds. + minfreshDuration, err := time.ParseDuration(minfresh + "s") + if err == nil { + currentAge = time.Duration(currentAge + minfreshDuration) + } + } + + if maxstale, ok := reqCacheControl["max-stale"]; ok { + // Indicates that the client is willing to accept a response that has exceeded its expiration time. + // If max-stale is assigned a value, then the client is willing to accept a response that has exceeded + // its expiration time by no more than the specified number of seconds. + // If no value is assigned to max-stale, then the client is willing to accept a stale response of any age. + // + // Responses served only because of a max-stale value are supposed to have a Warning header added to them, + // but that seems like a hassle, and is it actually useful? If so, then there needs to be a different + // return-value available here. + if maxstale == "" { + return fresh + } + maxstaleDuration, err := time.ParseDuration(maxstale + "s") + if err == nil { + currentAge = time.Duration(currentAge - maxstaleDuration) + } + } + + if lifetime > currentAge { + return fresh + } + + return stale +} + +// Returns true if either the request or the response includes the stale-if-error +// cache control extension: https://tools.ietf.org/html/rfc5861 +func canStaleOnError(respHeaders, reqHeaders http.Header) bool { + respCacheControl := parseCacheControl(respHeaders) + reqCacheControl := parseCacheControl(reqHeaders) + + var err error + lifetime := time.Duration(-1) + + if staleMaxAge, ok := respCacheControl["stale-if-error"]; ok { + if staleMaxAge != "" { + lifetime, err = time.ParseDuration(staleMaxAge + "s") + if err != nil { + return false + } + } else { + return true + } + } + if staleMaxAge, ok := reqCacheControl["stale-if-error"]; ok { + if staleMaxAge != "" { + lifetime, err = time.ParseDuration(staleMaxAge + "s") + if err != nil { + return false + } + } else { + return true + } + } + + if lifetime >= 0 { + date, err := Date(respHeaders) + if err != nil { + return false + } + currentAge := clock.since(date) + if lifetime > currentAge { + return true + } + } + + return false +} + +func getEndToEndHeaders(respHeaders http.Header) []string { + // These headers are always hop-by-hop + hopByHopHeaders := map[string]struct{}{ + "Connection": {}, + "Keep-Alive": {}, + "Proxy-Authenticate": {}, + "Proxy-Authorization": {}, + "Te": {}, + "Trailers": {}, + "Transfer-Encoding": {}, + "Upgrade": {}, + } + + for _, extra := range strings.Split(respHeaders.Get("connection"), ",") { + // any header listed in connection, if present, is also considered hop-by-hop + if strings.Trim(extra, " ") != "" { + hopByHopHeaders[http.CanonicalHeaderKey(extra)] = struct{}{} + } + } + var endToEndHeaders []string + for 
respHeader := range respHeaders { + if _, ok := hopByHopHeaders[respHeader]; !ok { + endToEndHeaders = append(endToEndHeaders, respHeader) + } + } + return endToEndHeaders +} + +func canStore(reqCacheControl, respCacheControl cacheControl) (canStore bool) { + if _, ok := respCacheControl["no-store"]; ok { + return false + } + if _, ok := reqCacheControl["no-store"]; ok { + return false + } + return true +} + +func newGatewayTimeoutResponse(req *http.Request) *http.Response { + var braw bytes.Buffer + braw.WriteString("HTTP/1.1 504 Gateway Timeout\r\n\r\n") + resp, err := http.ReadResponse(bufio.NewReader(&braw), req) + if err != nil { + panic(err) + } + return resp +} + +// cloneRequest returns a clone of the provided *http.Request. +// The clone is a shallow copy of the struct and its Header map. +// (This function copyright goauth2 authors: https://code.google.com/p/goauth2) +func cloneRequest(r *http.Request) *http.Request { + // shallow copy of the struct + r2 := new(http.Request) + *r2 = *r + // deep copy of the Header + r2.Header = make(http.Header) + for k, s := range r.Header { + r2.Header[k] = s + } + return r2 +} + +type cacheControl map[string]string + +func parseCacheControl(headers http.Header) cacheControl { + cc := cacheControl{} + ccHeader := headers.Get("Cache-Control") + for _, part := range strings.Split(ccHeader, ",") { + part = strings.Trim(part, " ") + if part == "" { + continue + } + if strings.ContainsRune(part, '=') { + keyval := strings.Split(part, "=") + cc[strings.Trim(keyval[0], " ")] = strings.Trim(keyval[1], ",") + } else { + cc[part] = "" + } + } + return cc +} + +// headerAllCommaSepValues returns all comma-separated values (each +// with whitespace trimmed) for header name in headers. According to +// Section 4.2 of the HTTP/1.1 spec +// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.2), +// values from multiple occurrences of a header should be concatenated, if +// the header's value is a comma-separated list. +func headerAllCommaSepValues(headers http.Header, name string) []string { + var vals []string + for _, val := range headers[http.CanonicalHeaderKey(name)] { + fields := strings.Split(val, ",") + for i, f := range fields { + fields[i] = strings.TrimSpace(f) + } + vals = append(vals, fields...) + } + return vals +} + +// cachingReadCloser is a wrapper around ReadCloser R that calls OnEOF +// handler with a full copy of the content read from R when EOF is +// reached. +type cachingReadCloser struct { + // Underlying ReadCloser. + R io.ReadCloser + // OnEOF is called with a copy of the content of R when EOF is reached. + OnEOF func(io.Reader) + + buf bytes.Buffer // buf stores a copy of the content of R. +} + +// Read reads the next len(p) bytes from R or until R is drained. The +// return value n is the number of bytes read. If R has no data to +// return, err is io.EOF and OnEOF is called with a full copy of what +// has been read so far. +func (r *cachingReadCloser) Read(p []byte) (n int, err error) { + n, err = r.R.Read(p) + r.buf.Write(p[:n]) + if err == io.EOF { + r.OnEOF(bytes.NewReader(r.buf.Bytes())) + } + return n, err +} + +func (r *cachingReadCloser) Close() error { + return r.R.Close() +} + +// PleaseCache excercises a Cache implementation. 
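// A backend's test typically just delegates to it (adapted from
// cache/diskcache/diskcache_test.go below):
//
//	func TestDiskCache(t *testing.T) {
//		tempDir, err := ioutil.TempDir("", "cache")
//		if err != nil {
//			t.Fatalf("TempDir: %v", err)
//		}
//		defer os.RemoveAll(tempDir)
//		cache.PleaseCache(t, diskcache.New(tempDir))
//	}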
+func PleaseCache(t *testing.T, cache Cache) { + key := "testKey" + _, ok := cache.Get(key) + if ok { + t.Fatal("retrieved key before adding it") + } + + val := []byte("some bytes") + cache.Set(key, val) + + retVal, ok := cache.Get(key) + if !ok { + t.Fatal("could not retrieve an element we just added") + } + if !bytes.Equal(retVal, val) { + t.Fatal("retrieved a different value than what we put in") + } + + cache.Delete(key) + + _, ok = cache.Get(key) + if ok { + t.Fatal("deleted key still present") + } +} + +// NewMemoryCacheTransport returns a new Transport using the in-memory cache implementation +func NewMemoryCacheTransport() *Transport { + c := memorycache.New() + t := NewTransport(c) + return t +} diff --git a/cache/cache_test.go b/cache/cache_test.go new file mode 100644 index 0000000..eeb3792 --- /dev/null +++ b/cache/cache_test.go @@ -0,0 +1,1476 @@ +package cache + +import ( + "bytes" + "errors" + "flag" + "github.com/geziyor/geziyor/cache/memorycache" + "io" + "io/ioutil" + "net/http" + "net/http/httptest" + "os" + "strconv" + "testing" + "time" +) + +var s struct { + server *httptest.Server + client http.Client + transport *Transport + done chan struct{} // Closed to unlock infinite handlers. +} + +type fakeClock struct { + elapsed time.Duration +} + +func (c *fakeClock) since(t time.Time) time.Duration { + return c.elapsed +} + +func TestMain(m *testing.M) { + flag.Parse() + setup() + code := m.Run() + teardown() + os.Exit(code) +} + +func setup() { + tp := NewMemoryCacheTransport() + client := http.Client{Transport: tp} + s.transport = tp + s.client = client + s.done = make(chan struct{}) + + mux := http.NewServeMux() + s.server = httptest.NewServer(mux) + + mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Cache-Control", "max-age=3600") + })) + + mux.HandleFunc("/method", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Cache-Control", "max-age=3600") + w.Write([]byte(r.Method)) + })) + + mux.HandleFunc("/range", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + lm := "Fri, 14 Dec 2010 01:01:50 GMT" + if r.Header.Get("if-modified-since") == lm { + w.WriteHeader(http.StatusNotModified) + return + } + w.Header().Set("last-modified", lm) + if r.Header.Get("range") == "bytes=4-9" { + w.WriteHeader(http.StatusPartialContent) + w.Write([]byte(" text ")) + return + } + w.Write([]byte("Some text content")) + })) + + mux.HandleFunc("/nostore", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Cache-Control", "no-store") + })) + + mux.HandleFunc("/etag", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + etag := "124567" + if r.Header.Get("if-none-match") == etag { + w.WriteHeader(http.StatusNotModified) + return + } + w.Header().Set("etag", etag) + })) + + mux.HandleFunc("/lastmodified", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + lm := "Fri, 14 Dec 2010 01:01:50 GMT" + if r.Header.Get("if-modified-since") == lm { + w.WriteHeader(http.StatusNotModified) + return + } + w.Header().Set("last-modified", lm) + })) + + mux.HandleFunc("/varyaccept", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Cache-Control", "max-age=3600") + w.Header().Set("Content-Type", "text/plain") + w.Header().Set("Vary", "Accept") + w.Write([]byte("Some text content")) + })) + + mux.HandleFunc("/doublevary", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Cache-Control", 
"max-age=3600") + w.Header().Set("Content-Type", "text/plain") + w.Header().Set("Vary", "Accept, Accept-Language") + w.Write([]byte("Some text content")) + })) + mux.HandleFunc("/2varyheaders", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Cache-Control", "max-age=3600") + w.Header().Set("Content-Type", "text/plain") + w.Header().Add("Vary", "Accept") + w.Header().Add("Vary", "Accept-Language") + w.Write([]byte("Some text content")) + })) + mux.HandleFunc("/varyunused", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Cache-Control", "max-age=3600") + w.Header().Set("Content-Type", "text/plain") + w.Header().Set("Vary", "X-Madeup-Header") + w.Write([]byte("Some text content")) + })) + + mux.HandleFunc("/cachederror", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + etag := "abc" + if r.Header.Get("if-none-match") == etag { + w.WriteHeader(http.StatusNotModified) + return + } + w.Header().Set("etag", etag) + w.WriteHeader(http.StatusNotFound) + w.Write([]byte("Not found")) + })) + + updateFieldsCounter := 0 + mux.HandleFunc("/updatefields", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-Counter", strconv.Itoa(updateFieldsCounter)) + w.Header().Set("Etag", `"e"`) + updateFieldsCounter++ + if r.Header.Get("if-none-match") != "" { + w.WriteHeader(http.StatusNotModified) + return + } + w.Write([]byte("Some text content")) + })) + + // Take 3 seconds to return 200 OK (for testing client timeouts). + mux.HandleFunc("/3seconds", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(3 * time.Second) + })) + + mux.HandleFunc("/infinite", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + for { + select { + case <-s.done: + return + default: + w.Write([]byte{0}) + } + } + })) +} + +func teardown() { + close(s.done) + s.server.Close() +} + +func resetTest() { + s.transport.Cache = memorycache.New() + clock = &realClock{} +} + +// TestCacheableMethod ensures that uncacheable method does not get stored +// in cache and get incorrectly used for a following cacheable method request. 
+func TestCacheableMethod(t *testing.T) { + resetTest() + { + req, err := http.NewRequest("POST", s.server.URL+"/method", nil) + if err != nil { + t.Fatal(err) + } + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + var buf bytes.Buffer + _, err = io.Copy(&buf, resp.Body) + if err != nil { + t.Fatal(err) + } + err = resp.Body.Close() + if err != nil { + t.Fatal(err) + } + if got, want := buf.String(), "POST"; got != want { + t.Errorf("got %q, want %q", got, want) + } + if resp.StatusCode != http.StatusOK { + t.Errorf("response status code isn't 200 OK: %v", resp.StatusCode) + } + } + { + req, err := http.NewRequest("GET", s.server.URL+"/method", nil) + if err != nil { + t.Fatal(err) + } + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + var buf bytes.Buffer + _, err = io.Copy(&buf, resp.Body) + if err != nil { + t.Fatal(err) + } + err = resp.Body.Close() + if err != nil { + t.Fatal(err) + } + if got, want := buf.String(), "GET"; got != want { + t.Errorf("got wrong body %q, want %q", got, want) + } + if resp.StatusCode != http.StatusOK { + t.Errorf("response status code isn't 200 OK: %v", resp.StatusCode) + } + if resp.Header.Get(XFromCache) != "" { + t.Errorf("XFromCache header isn't blank") + } + } +} + +func TestDontServeHeadResponseToGetRequest(t *testing.T) { + resetTest() + url := s.server.URL + "/" + req, err := http.NewRequest(http.MethodHead, url, nil) + if err != nil { + t.Fatal(err) + } + _, err = s.client.Do(req) + if err != nil { + t.Fatal(err) + } + req, err = http.NewRequest(http.MethodGet, url, nil) + if err != nil { + t.Fatal(err) + } + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + if resp.Header.Get(XFromCache) != "" { + t.Errorf("Cache should not match") + } +} + +func TestDontStorePartialRangeInCache(t *testing.T) { + resetTest() + { + req, err := http.NewRequest("GET", s.server.URL+"/range", nil) + if err != nil { + t.Fatal(err) + } + req.Header.Set("range", "bytes=4-9") + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + var buf bytes.Buffer + _, err = io.Copy(&buf, resp.Body) + if err != nil { + t.Fatal(err) + } + err = resp.Body.Close() + if err != nil { + t.Fatal(err) + } + if got, want := buf.String(), " text "; got != want { + t.Errorf("got %q, want %q", got, want) + } + if resp.StatusCode != http.StatusPartialContent { + t.Errorf("response status code isn't 206 Partial Content: %v", resp.StatusCode) + } + } + { + req, err := http.NewRequest("GET", s.server.URL+"/range", nil) + if err != nil { + t.Fatal(err) + } + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + var buf bytes.Buffer + _, err = io.Copy(&buf, resp.Body) + if err != nil { + t.Fatal(err) + } + err = resp.Body.Close() + if err != nil { + t.Fatal(err) + } + if got, want := buf.String(), "Some text content"; got != want { + t.Errorf("got %q, want %q", got, want) + } + if resp.StatusCode != http.StatusOK { + t.Errorf("response status code isn't 200 OK: %v", resp.StatusCode) + } + if resp.Header.Get(XFromCache) != "" { + t.Error("XFromCache header isn't blank") + } + } + { + req, err := http.NewRequest("GET", s.server.URL+"/range", nil) + if err != nil { + t.Fatal(err) + } + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + var buf bytes.Buffer + _, err = io.Copy(&buf, resp.Body) + if err != nil { + t.Fatal(err) + } + err = resp.Body.Close() + if err != nil { + t.Fatal(err) + } + if got, want := buf.String(), "Some text content"; got != want { + t.Errorf("got %q, want %q", got, want) + } 
+ if resp.StatusCode != http.StatusOK { + t.Errorf("response status code isn't 200 OK: %v", resp.StatusCode) + } + if resp.Header.Get(XFromCache) != "1" { + t.Errorf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + } + { + req, err := http.NewRequest("GET", s.server.URL+"/range", nil) + if err != nil { + t.Fatal(err) + } + req.Header.Set("range", "bytes=4-9") + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + var buf bytes.Buffer + _, err = io.Copy(&buf, resp.Body) + if err != nil { + t.Fatal(err) + } + err = resp.Body.Close() + if err != nil { + t.Fatal(err) + } + if got, want := buf.String(), " text "; got != want { + t.Errorf("got %q, want %q", got, want) + } + if resp.StatusCode != http.StatusPartialContent { + t.Errorf("response status code isn't 206 Partial Content: %v", resp.StatusCode) + } + } +} + +func TestCacheOnlyIfBodyRead(t *testing.T) { + resetTest() + { + req, err := http.NewRequest("GET", s.server.URL, nil) + if err != nil { + t.Fatal(err) + } + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + // We do not read the body + resp.Body.Close() + } + { + req, err := http.NewRequest("GET", s.server.URL, nil) + if err != nil { + t.Fatal(err) + } + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatalf("XFromCache header isn't blank") + } + } +} + +func TestOnlyReadBodyOnDemand(t *testing.T) { + resetTest() + + req, err := http.NewRequest("GET", s.server.URL+"/infinite", nil) + if err != nil { + t.Fatal(err) + } + resp, err := s.client.Do(req) // This shouldn't hang forever. + if err != nil { + t.Fatal(err) + } + buf := make([]byte, 10) // Only partially read the body. 
+ _, err = resp.Body.Read(buf) + if err != nil { + t.Fatal(err) + } + resp.Body.Close() +} + +func TestGetOnlyIfCachedHit(t *testing.T) { + resetTest() + { + req, err := http.NewRequest("GET", s.server.URL, nil) + if err != nil { + t.Fatal(err) + } + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + } + { + req, err := http.NewRequest("GET", s.server.URL, nil) + if err != nil { + t.Fatal(err) + } + req.Header.Add("cache-control", "only-if-cached") + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "1" { + t.Fatalf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + if resp.StatusCode != http.StatusOK { + t.Fatalf("response status code isn't 200 OK: %v", resp.StatusCode) + } + } +} + +func TestGetOnlyIfCachedMiss(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL, nil) + if err != nil { + t.Fatal(err) + } + req.Header.Add("cache-control", "only-if-cached") + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + if resp.StatusCode != http.StatusGatewayTimeout { + t.Fatalf("response status code isn't 504 GatewayTimeout: %v", resp.StatusCode) + } +} + +func TestGetNoStoreRequest(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL, nil) + if err != nil { + t.Fatal(err) + } + req.Header.Add("Cache-Control", "no-store") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } +} + +func TestGetNoStoreResponse(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL+"/nostore", nil) + if err != nil { + t.Fatal(err) + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } +} + +func TestGetWithEtag(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL+"/etag", nil) + if err != nil { + t.Fatal(err) + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "1" { + t.Fatalf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + // additional assertions to verify that 304 response is converted properly + if resp.StatusCode != http.StatusOK { + t.Fatalf("response status code isn't 200 OK: %v", resp.StatusCode) + } + if _, ok := resp.Header["Connection"]; ok { + 
t.Fatalf("Connection header isn't absent") + } + } +} + +func TestGetWithLastModified(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL+"/lastmodified", nil) + if err != nil { + t.Fatal(err) + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "1" { + t.Fatalf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + } +} + +func TestGetWithVary(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL+"/varyaccept", nil) + if err != nil { + t.Fatal(err) + } + req.Header.Set("Accept", "text/plain") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get("Vary") != "Accept" { + t.Fatalf(`Vary header isn't "Accept": %v`, resp.Header.Get("Vary")) + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "1" { + t.Fatalf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + } + req.Header.Set("Accept", "text/html") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } + req.Header.Set("Accept", "") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } +} + +func TestGetWithDoubleVary(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL+"/doublevary", nil) + if err != nil { + t.Fatal(err) + } + req.Header.Set("Accept", "text/plain") + req.Header.Set("Accept-Language", "da, en-gb;q=0.8, en;q=0.7") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get("Vary") == "" { + t.Fatalf(`Vary header is blank`) + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "1" { + t.Fatalf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + } + req.Header.Set("Accept-Language", "") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } + req.Header.Set("Accept-Language", "da") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } +} + +func TestGetWith2VaryHeaders(t *testing.T) { + resetTest() + // Tests that multiple Vary headers' comma-separated lists are + // merged. See https://github.com/gregjones/httpcache/issues/27. 
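	// The /2varyheaders handler adds two separate Vary headers (Accept and
	// Accept-Language); headerAllCommaSepValues must treat that the same as a
	// single "Vary: Accept, Accept-Language" header.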
+ const ( + accept = "text/plain" + acceptLanguage = "da, en-gb;q=0.8, en;q=0.7" + ) + req, err := http.NewRequest("GET", s.server.URL+"/2varyheaders", nil) + if err != nil { + t.Fatal(err) + } + req.Header.Set("Accept", accept) + req.Header.Set("Accept-Language", acceptLanguage) + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get("Vary") == "" { + t.Fatalf(`Vary header is blank`) + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "1" { + t.Fatalf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + } + req.Header.Set("Accept-Language", "") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } + req.Header.Set("Accept-Language", "da") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } + req.Header.Set("Accept-Language", acceptLanguage) + req.Header.Set("Accept", "") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + } + req.Header.Set("Accept", "image/png") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "" { + t.Fatal("XFromCache header isn't blank") + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "1" { + t.Fatalf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + } +} + +func TestGetVaryUnused(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL+"/varyunused", nil) + if err != nil { + t.Fatal(err) + } + req.Header.Set("Accept", "text/plain") + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get("Vary") == "" { + t.Fatalf(`Vary header is blank`) + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "1" { + t.Fatalf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + } +} + +func TestUpdateFields(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL+"/updatefields", nil) + if err != nil { + t.Fatal(err) + } + var counter, counter2 string + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + counter = resp.Header.Get("x-counter") + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.Header.Get(XFromCache) != "1" { + t.Fatalf(`XFromCache header isn't "1": %v`, resp.Header.Get(XFromCache)) + } + counter2 = resp.Header.Get("x-counter") + } + if counter == counter2 { + t.Fatalf(`both "x-counter" values are equal: %v %v`, counter, counter2) + } +} + +// This tests the fix for 
https://github.com/gregjones/httpcache/issues/74. +// Previously, after validating a cached response, its StatusCode +// was incorrectly being replaced. +func TestCachedErrorsKeepStatus(t *testing.T) { + resetTest() + req, err := http.NewRequest("GET", s.server.URL+"/cachederror", nil) + if err != nil { + t.Fatal(err) + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + io.Copy(ioutil.Discard, resp.Body) + } + { + resp, err := s.client.Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusNotFound { + t.Fatalf("Status code isn't 404: %d", resp.StatusCode) + } + } +} + +func TestParseCacheControl(t *testing.T) { + resetTest() + h := http.Header{} + for range parseCacheControl(h) { + t.Fatal("cacheControl should be empty") + } + + h.Set("cache-control", "no-cache") + { + cc := parseCacheControl(h) + if _, ok := cc["foo"]; ok { + t.Error(`Value "foo" shouldn't exist`) + } + noCache, ok := cc["no-cache"] + if !ok { + t.Fatalf(`"no-cache" value isn't set`) + } + if noCache != "" { + t.Fatalf(`"no-cache" value isn't blank: %v`, noCache) + } + } + h.Set("cache-control", "no-cache, max-age=3600") + { + cc := parseCacheControl(h) + noCache, ok := cc["no-cache"] + if !ok { + t.Fatalf(`"no-cache" value isn't set`) + } + if noCache != "" { + t.Fatalf(`"no-cache" value isn't blank: %v`, noCache) + } + if cc["max-age"] != "3600" { + t.Fatalf(`"max-age" value isn't "3600": %v`, cc["max-age"]) + } + } +} + +func TestNoCacheRequestExpiration(t *testing.T) { + resetTest() + respHeaders := http.Header{} + respHeaders.Set("Cache-Control", "max-age=7200") + + reqHeaders := http.Header{} + reqHeaders.Set("Cache-Control", "no-cache") + if getFreshness(respHeaders, reqHeaders) != transparent { + t.Fatal("freshness isn't transparent") + } +} + +func TestNoCacheResponseExpiration(t *testing.T) { + resetTest() + respHeaders := http.Header{} + respHeaders.Set("Cache-Control", "no-cache") + respHeaders.Set("Expires", "Wed, 19 Apr 3000 11:43:00 GMT") + + reqHeaders := http.Header{} + if getFreshness(respHeaders, reqHeaders) != stale { + t.Fatal("freshness isn't stale") + } +} + +func TestReqMustRevalidate(t *testing.T) { + resetTest() + // not paying attention to request setting max-stale means never returning stale + // responses, so always acting as if must-revalidate is set + respHeaders := http.Header{} + + reqHeaders := http.Header{} + reqHeaders.Set("Cache-Control", "must-revalidate") + if getFreshness(respHeaders, reqHeaders) != stale { + t.Fatal("freshness isn't stale") + } +} + +func TestRespMustRevalidate(t *testing.T) { + resetTest() + respHeaders := http.Header{} + respHeaders.Set("Cache-Control", "must-revalidate") + + reqHeaders := http.Header{} + if getFreshness(respHeaders, reqHeaders) != stale { + t.Fatal("freshness isn't stale") + } +} + +func TestFreshExpiration(t *testing.T) { + resetTest() + now := time.Now() + respHeaders := http.Header{} + respHeaders.Set("date", now.Format(time.RFC1123)) + respHeaders.Set("expires", now.Add(time.Duration(2)*time.Second).Format(time.RFC1123)) + + reqHeaders := http.Header{} + if getFreshness(respHeaders, reqHeaders) != fresh { + t.Fatal("freshness isn't fresh") + } + + clock = &fakeClock{elapsed: 3 * time.Second} + if getFreshness(respHeaders, reqHeaders) != stale { + t.Fatal("freshness isn't stale") + } +} + +func TestMaxAge(t *testing.T) { + resetTest() + now := time.Now() + respHeaders := http.Header{} + respHeaders.Set("date", now.Format(time.RFC1123)) + 
respHeaders.Set("cache-control", "max-age=2") + + reqHeaders := http.Header{} + if getFreshness(respHeaders, reqHeaders) != fresh { + t.Fatal("freshness isn't fresh") + } + + clock = &fakeClock{elapsed: 3 * time.Second} + if getFreshness(respHeaders, reqHeaders) != stale { + t.Fatal("freshness isn't stale") + } +} + +func TestMaxAgeZero(t *testing.T) { + resetTest() + now := time.Now() + respHeaders := http.Header{} + respHeaders.Set("date", now.Format(time.RFC1123)) + respHeaders.Set("cache-control", "max-age=0") + + reqHeaders := http.Header{} + if getFreshness(respHeaders, reqHeaders) != stale { + t.Fatal("freshness isn't stale") + } +} + +func TestBothMaxAge(t *testing.T) { + resetTest() + now := time.Now() + respHeaders := http.Header{} + respHeaders.Set("date", now.Format(time.RFC1123)) + respHeaders.Set("cache-control", "max-age=2") + + reqHeaders := http.Header{} + reqHeaders.Set("cache-control", "max-age=0") + if getFreshness(respHeaders, reqHeaders) != stale { + t.Fatal("freshness isn't stale") + } +} + +func TestMinFreshWithExpires(t *testing.T) { + resetTest() + now := time.Now() + respHeaders := http.Header{} + respHeaders.Set("date", now.Format(time.RFC1123)) + respHeaders.Set("expires", now.Add(time.Duration(2)*time.Second).Format(time.RFC1123)) + + reqHeaders := http.Header{} + reqHeaders.Set("cache-control", "min-fresh=1") + if getFreshness(respHeaders, reqHeaders) != fresh { + t.Fatal("freshness isn't fresh") + } + + reqHeaders = http.Header{} + reqHeaders.Set("cache-control", "min-fresh=2") + if getFreshness(respHeaders, reqHeaders) != stale { + t.Fatal("freshness isn't stale") + } +} + +func TestEmptyMaxStale(t *testing.T) { + resetTest() + now := time.Now() + respHeaders := http.Header{} + respHeaders.Set("date", now.Format(time.RFC1123)) + respHeaders.Set("cache-control", "max-age=20") + + reqHeaders := http.Header{} + reqHeaders.Set("cache-control", "max-stale") + clock = &fakeClock{elapsed: 10 * time.Second} + if getFreshness(respHeaders, reqHeaders) != fresh { + t.Fatal("freshness isn't fresh") + } + + clock = &fakeClock{elapsed: 60 * time.Second} + if getFreshness(respHeaders, reqHeaders) != fresh { + t.Fatal("freshness isn't fresh") + } +} + +func TestMaxStaleValue(t *testing.T) { + resetTest() + now := time.Now() + respHeaders := http.Header{} + respHeaders.Set("date", now.Format(time.RFC1123)) + respHeaders.Set("cache-control", "max-age=10") + + reqHeaders := http.Header{} + reqHeaders.Set("cache-control", "max-stale=20") + clock = &fakeClock{elapsed: 5 * time.Second} + if getFreshness(respHeaders, reqHeaders) != fresh { + t.Fatal("freshness isn't fresh") + } + + clock = &fakeClock{elapsed: 15 * time.Second} + if getFreshness(respHeaders, reqHeaders) != fresh { + t.Fatal("freshness isn't fresh") + } + + clock = &fakeClock{elapsed: 30 * time.Second} + if getFreshness(respHeaders, reqHeaders) != stale { + t.Fatal("freshness isn't stale") + } +} + +func containsHeader(headers []string, header string) bool { + for _, v := range headers { + if http.CanonicalHeaderKey(v) == http.CanonicalHeaderKey(header) { + return true + } + } + return false +} + +func TestGetEndToEndHeaders(t *testing.T) { + resetTest() + var ( + headers http.Header + end2end []string + ) + + headers = http.Header{} + headers.Set("content-type", "text/html") + headers.Set("te", "deflate") + + end2end = getEndToEndHeaders(headers) + if !containsHeader(end2end, "content-type") { + t.Fatal(`doesn't contain "content-type" header`) + } + if containsHeader(end2end, "te") { + t.Fatal(`doesn't contain 
"te" header`) + } + + headers = http.Header{} + headers.Set("connection", "content-type") + headers.Set("content-type", "text/csv") + headers.Set("te", "deflate") + end2end = getEndToEndHeaders(headers) + if containsHeader(end2end, "connection") { + t.Fatal(`doesn't contain "connection" header`) + } + if containsHeader(end2end, "content-type") { + t.Fatal(`doesn't contain "content-type" header`) + } + if containsHeader(end2end, "te") { + t.Fatal(`doesn't contain "te" header`) + } + + headers = http.Header{} + end2end = getEndToEndHeaders(headers) + if len(end2end) != 0 { + t.Fatal(`non-zero end2end headers`) + } + + headers = http.Header{} + headers.Set("connection", "content-type") + end2end = getEndToEndHeaders(headers) + if len(end2end) != 0 { + t.Fatal(`non-zero end2end headers`) + } +} + +type transportMock struct { + response *http.Response + err error +} + +func (t transportMock) RoundTrip(req *http.Request) (resp *http.Response, err error) { + return t.response, t.err +} + +func TestStaleIfErrorRequest(t *testing.T) { + resetTest() + now := time.Now() + tmock := transportMock{ + response: &http.Response{ + Status: http.StatusText(http.StatusOK), + StatusCode: http.StatusOK, + Header: http.Header{ + "Date": []string{now.Format(time.RFC1123)}, + "Cache-Control": []string{"no-cache"}, + }, + Body: ioutil.NopCloser(bytes.NewBuffer([]byte("some data"))), + }, + err: nil, + } + tp := NewMemoryCacheTransport() + tp.Transport = &tmock + + // First time, response is cached on success + r, _ := http.NewRequest("GET", "http://somewhere.com/", nil) + r.Header.Set("Cache-Control", "stale-if-error") + resp, err := tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + + // On failure, response is returned from the cache + tmock.response = nil + tmock.err = errors.New("some error") + resp, err = tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } +} + +func TestStaleIfErrorRequestLifetime(t *testing.T) { + resetTest() + now := time.Now() + tmock := transportMock{ + response: &http.Response{ + Status: http.StatusText(http.StatusOK), + StatusCode: http.StatusOK, + Header: http.Header{ + "Date": []string{now.Format(time.RFC1123)}, + "Cache-Control": []string{"no-cache"}, + }, + Body: ioutil.NopCloser(bytes.NewBuffer([]byte("some data"))), + }, + err: nil, + } + tp := NewMemoryCacheTransport() + tp.Transport = &tmock + + // First time, response is cached on success + r, _ := http.NewRequest("GET", "http://somewhere.com/", nil) + r.Header.Set("Cache-Control", "stale-if-error=100") + resp, err := tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + + // On failure, response is returned from the cache + tmock.response = nil + tmock.err = errors.New("some error") + resp, err = tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } + + // Same for http errors + tmock.response = &http.Response{StatusCode: http.StatusInternalServerError} + tmock.err = nil + resp, err = tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } + + // If failure last more than max stale, error is returned + clock = &fakeClock{elapsed: 200 * time.Second} + _, err = tp.RoundTrip(r) + if err != tmock.err { + t.Fatalf("got err %v, want %v", err, 
tmock.err) + } +} + +func TestStaleIfErrorResponse(t *testing.T) { + resetTest() + now := time.Now() + tmock := transportMock{ + response: &http.Response{ + Status: http.StatusText(http.StatusOK), + StatusCode: http.StatusOK, + Header: http.Header{ + "Date": []string{now.Format(time.RFC1123)}, + "Cache-Control": []string{"no-cache, stale-if-error"}, + }, + Body: ioutil.NopCloser(bytes.NewBuffer([]byte("some data"))), + }, + err: nil, + } + tp := NewMemoryCacheTransport() + tp.Transport = &tmock + + // First time, response is cached on success + r, _ := http.NewRequest("GET", "http://somewhere.com/", nil) + resp, err := tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + + // On failure, response is returned from the cache + tmock.response = nil + tmock.err = errors.New("some error") + resp, err = tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } +} + +func TestStaleIfErrorResponseLifetime(t *testing.T) { + resetTest() + now := time.Now() + tmock := transportMock{ + response: &http.Response{ + Status: http.StatusText(http.StatusOK), + StatusCode: http.StatusOK, + Header: http.Header{ + "Date": []string{now.Format(time.RFC1123)}, + "Cache-Control": []string{"no-cache, stale-if-error=100"}, + }, + Body: ioutil.NopCloser(bytes.NewBuffer([]byte("some data"))), + }, + err: nil, + } + tp := NewMemoryCacheTransport() + tp.Transport = &tmock + + // First time, response is cached on success + r, _ := http.NewRequest("GET", "http://somewhere.com/", nil) + resp, err := tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + + // On failure, response is returned from the cache + tmock.response = nil + tmock.err = errors.New("some error") + resp, err = tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } + + // If failure last more than max stale, error is returned + clock = &fakeClock{elapsed: 200 * time.Second} + _, err = tp.RoundTrip(r) + if err != tmock.err { + t.Fatalf("got err %v, want %v", err, tmock.err) + } +} + +// This tests the fix for https://github.com/gregjones/httpcache/issues/74. +// Previously, after a stale response was used after encountering an error, +// its StatusCode was being incorrectly replaced. 
+func TestStaleIfErrorKeepsStatus(t *testing.T) { + resetTest() + now := time.Now() + tmock := transportMock{ + response: &http.Response{ + Status: http.StatusText(http.StatusNotFound), + StatusCode: http.StatusNotFound, + Header: http.Header{ + "Date": []string{now.Format(time.RFC1123)}, + "Cache-Control": []string{"no-cache"}, + }, + Body: ioutil.NopCloser(bytes.NewBuffer([]byte("some data"))), + }, + err: nil, + } + tp := NewMemoryCacheTransport() + tp.Transport = &tmock + + // First time, response is cached on success + r, _ := http.NewRequest("GET", "http://somewhere.com/", nil) + r.Header.Set("Cache-Control", "stale-if-error") + resp, err := tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } + _, err = ioutil.ReadAll(resp.Body) + if err != nil { + t.Fatal(err) + } + + // On failure, response is returned from the cache + tmock.response = nil + tmock.err = errors.New("some error") + resp, err = tp.RoundTrip(r) + if err != nil { + t.Fatal(err) + } + if resp == nil { + t.Fatal("resp is nil") + } + if resp.StatusCode != http.StatusNotFound { + t.Fatalf("Status wasn't 404: %d", resp.StatusCode) + } +} + +// Test that http.Client.Timeout is respected when cache transport is used. +// That is so as long as request cancellation is propagated correctly. +// In the past, that required CancelRequest to be implemented correctly, +// but modern http.Client uses Request.Cancel (or request context) instead, +// so we don't have to do anything. +func TestClientTimeout(t *testing.T) { + if testing.Short() { + t.Skip("skipping timeout test in short mode") // Because it takes at least 3 seconds to run. + } + resetTest() + client := &http.Client{ + Transport: NewMemoryCacheTransport(), + Timeout: time.Second, + } + started := time.Now() + resp, err := client.Get(s.server.URL + "/3seconds") + taken := time.Since(started) + if err == nil { + t.Error("got nil error, want timeout error") + } + if resp != nil { + t.Error("got non-nil resp, want nil resp") + } + if taken >= 2*time.Second { + t.Error("client.Do took 2+ seconds, want < 2 seconds") + } +} diff --git a/cache/diskcache/diskcache.go b/cache/diskcache/diskcache.go new file mode 100644 index 0000000..a8899bd --- /dev/null +++ b/cache/diskcache/diskcache.go @@ -0,0 +1,61 @@ +// Package diskcache provides an implementation of cache.Cache that uses the diskv package +// to supplement an in-memory map with persistent storage +// +package diskcache + +import ( + "bytes" + "crypto/md5" + "encoding/hex" + "github.com/peterbourgon/diskv" + "io" +) + +// Cache is an implementation of cache.Cache that supplements the in-memory map with persistent storage +type Cache struct { + d *diskv.Diskv +} + +// Get returns the response corresponding to key if present +func (c *Cache) Get(key string) (resp []byte, ok bool) { + key = keyToFilename(key) + resp, err := c.d.Read(key) + if err != nil { + return []byte{}, false + } + return resp, true +} + +// Set saves a response to the cache as key +func (c *Cache) Set(key string, resp []byte) { + key = keyToFilename(key) + _ = c.d.WriteStream(key, bytes.NewReader(resp), true) +} + +// Delete removes the response with key from the cache +func (c *Cache) Delete(key string) { + key = keyToFilename(key) + _ = c.d.Erase(key) +} + +func keyToFilename(key string) string { + h := md5.New() + _, _ = io.WriteString(h, key) + return hex.EncodeToString(h.Sum(nil)) +} + +// New returns a new Cache that will store files in basePath +func New(basePath string) *Cache { + return &Cache{ + d: 
diskv.New(diskv.Options{
+		BasePath:     basePath,
+		CacheSizeMax: 100 * 1024 * 1024, // 100MB
+	}),
+	}
+}
+
+// NewWithDiskv returns a new Cache using the provided Diskv as underlying
+// storage.
+func NewWithDiskv(d *diskv.Diskv) *Cache {
+	return &Cache{d}
+}
diff --git a/cache/diskcache/diskcache_test.go b/cache/diskcache/diskcache_test.go
new file mode 100644
index 0000000..038823e
--- /dev/null
+++ b/cache/diskcache/diskcache_test.go
@@ -0,0 +1,18 @@
+package diskcache
+
+import (
+	"github.com/geziyor/geziyor/cache"
+	"io/ioutil"
+	"os"
+	"testing"
+)
+
+func TestDiskCache(t *testing.T) {
+	tempDir, err := ioutil.TempDir("", "cache")
+	if err != nil {
+		t.Fatalf("TempDir: %v", err)
+	}
+	defer os.RemoveAll(tempDir)
+
+	cache.PleaseCache(t, New(tempDir))
+}
diff --git a/cache/leveldbcache/leveldbcache.go b/cache/leveldbcache/leveldbcache.go
new file mode 100644
index 0000000..53009d6
--- /dev/null
+++ b/cache/leveldbcache/leveldbcache.go
@@ -0,0 +1,51 @@
+// Package leveldbcache provides an implementation of cache.Cache that
+// uses github.com/syndtr/goleveldb/leveldb
+package leveldbcache
+
+import (
+	"github.com/syndtr/goleveldb/leveldb"
+)
+
+// Cache is an implementation of cache.Cache with leveldb storage
+type Cache struct {
+	Db *leveldb.DB
+}
+
+// Get returns the response corresponding to key if present
+func (c *Cache) Get(key string) (resp []byte, ok bool) {
+	var err error
+	resp, err = c.Db.Get([]byte(key), nil)
+	if err != nil {
+		return []byte{}, false
+	}
+	return resp, true
+}
+
+// Set saves a response to the cache as key
+func (c *Cache) Set(key string, resp []byte) {
+	_ = c.Db.Put([]byte(key), resp, nil)
+}
+
+// Delete removes the response with key from the cache
+func (c *Cache) Delete(key string) {
+	_ = c.Db.Delete([]byte(key), nil)
+}
+
+// New returns a new Cache that stores the leveldb database at path
+func New(path string) (*Cache, error) {
+	cache := &Cache{}
+
+	var err error
+	cache.Db, err = leveldb.OpenFile(path, nil)
+
+	if err != nil {
+		return nil, err
+	}
+	return cache, nil
+}
+
+// NewWithDB returns a new Cache using the provided leveldb as underlying
+// storage.
+func NewWithDB(db *leveldb.DB) *Cache {
+	return &Cache{db}
+}
diff --git a/cache/leveldbcache/leveldbcache_test.go b/cache/leveldbcache/leveldbcache_test.go
new file mode 100644
index 0000000..89f7008
--- /dev/null
+++ b/cache/leveldbcache/leveldbcache_test.go
@@ -0,0 +1,24 @@
+package leveldbcache
+
+import (
+	"github.com/geziyor/geziyor/cache"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func TestLevelDBCache(t *testing.T) {
+	tempDir, err := ioutil.TempDir("", "cache")
+	if err != nil {
+		t.Fatalf("TempDir: %v", err)
+	}
+	defer os.RemoveAll(tempDir)
+
+	c, err := New(filepath.Join(tempDir, "Db"))
+	if err != nil {
+		t.Fatalf("New leveldb: %v", err)
+	}
+
+	cache.PleaseCache(t, c)
+}
diff --git a/cache/memorycache/memorycache.go b/cache/memorycache/memorycache.go
new file mode 100644
index 0000000..8fa6af4
--- /dev/null
+++ b/cache/memorycache/memorycache.go
@@ -0,0 +1,39 @@
+package memorycache
+
+import (
+	"sync"
+)
+
+// Cache is an implementation of cache.Cache that stores responses in an in-memory map.
+type Cache struct { + mu sync.RWMutex + items map[string][]byte +} + +// Get returns the []byte representation of the response and true if present, false if not +func (c *Cache) Get(key string) (resp []byte, ok bool) { + c.mu.RLock() + resp, ok = c.items[key] + c.mu.RUnlock() + return resp, ok +} + +// Set saves response resp to the cache with key +func (c *Cache) Set(key string, resp []byte) { + c.mu.Lock() + c.items[key] = resp + c.mu.Unlock() +} + +// Delete removes key from the cache +func (c *Cache) Delete(key string) { + c.mu.Lock() + delete(c.items, key) + c.mu.Unlock() +} + +// New returns a new Cache that will store items in an in-memory map +func New() *Cache { + c := &Cache{items: map[string][]byte{}} + return c +} diff --git a/geziyor.go b/geziyor.go index 00998f0..fb51d7b 100644 --- a/geziyor.go +++ b/geziyor.go @@ -1,7 +1,7 @@ package geziyor import ( - "github.com/fpfeng/httpcache" + "github.com/geziyor/geziyor/cache" "github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/metrics" "github.com/geziyor/geziyor/middleware" @@ -69,8 +69,12 @@ func NewGeziyor(opt *Options) *Geziyor { // Client geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes) if opt.Cache != nil { - geziyor.Client.Transport = &httpcache.Transport{ - Transport: geziyor.Client.Transport, Cache: opt.Cache, MarkCachedResponses: true} + geziyor.Client.Transport = &cache.Transport{ + Policy: opt.CachePolicy, + Transport: geziyor.Client.Transport, + Cache: opt.Cache, + MarkCachedResponses: true, + } } if opt.Timeout != 0 { geziyor.Client.Timeout = opt.Timeout diff --git a/geziyor_test.go b/geziyor_test.go index 2cedb9b..180aaba 100644 --- a/geziyor_test.go +++ b/geziyor_test.go @@ -4,8 +4,9 @@ import ( "fmt" "github.com/PuerkitoBio/goquery" "github.com/fortytw2/leaktest" - "github.com/fpfeng/httpcache" "github.com/geziyor/geziyor" + "github.com/geziyor/geziyor/cache" + "github.com/geziyor/geziyor/cache/diskcache" "github.com/geziyor/geziyor/client" "github.com/geziyor/geziyor/export" "github.com/geziyor/geziyor/metrics" @@ -28,12 +29,13 @@ func TestCache(t *testing.T) { defer leaktest.Check(t)() geziyor.NewGeziyor(&geziyor.Options{ StartURLs: []string{"http://api.ipify.org"}, - Cache: httpcache.NewMemoryCache(), ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { fmt.Println(string(r.Body)) g.Exports <- string(r.Body) g.Get("http://api.ipify.org", nil) }, + Cache: diskcache.New(".cache"), + CachePolicy: cache.RFC2616, }).Start() } diff --git a/go.mod b/go.mod index 6681994..2cd1bf6 100644 --- a/go.mod +++ b/go.mod @@ -8,11 +8,13 @@ require ( github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54 github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05 github.com/fortytw2/leaktest v1.3.0 - github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 github.com/go-kit/kit v0.8.0 + github.com/google/btree v1.0.0 // indirect + github.com/peterbourgon/diskv v2.0.1+incompatible github.com/pkg/errors v0.8.1 github.com/prometheus/client_golang v1.0.0 github.com/stretchr/testify v1.3.0 + github.com/syndtr/goleveldb v1.0.0 github.com/temoto/robotstxt v1.1.1 golang.org/x/net v0.0.0-20190522155817-f3200d17e092 golang.org/x/text v0.3.2 diff --git a/go.sum b/go.sum index 065040a..b4f5663 100644 --- a/go.sum +++ b/go.sum @@ -18,8 +18,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fortytw2/leaktest 
v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= -github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ= -github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8= +github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/go-kit/kit v0.8.0 h1:Wz+5lgoB0kkuqLEc6NVmwRknTKP6dTGbSqvhZtBI/j0= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= @@ -34,6 +34,12 @@ github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7a github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w= +github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo= +github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls= @@ -47,6 +53,13 @@ github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs= +github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v1.4.3 h1:RE1xgDvH7imwFD45h+u2SgIfERHlS2yNG4DObb5BSKU= +github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= +github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -69,18 +82,23 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0 
h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= +github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA= github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190522155817-f3200d17e092 h1:4QSRKanuywn15aTZvI/mIDEgPQpswuFndXpOj3rKEco= golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190610081024-1e42afee0f76 h1:QSmW7Q3mFdAGjtAd0byXmFJ55inUydyZ4WQmiuItAIQ= @@ -90,5 +108,11 @@ golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.1 h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/options.go b/options.go index 83e2c41..84c1513 100644 --- a/options.go +++ b/options.go @@ -1,7 +1,7 @@ package geziyor import ( - "github.com/fpfeng/httpcache" + "github.com/geziyor/geziyor/cache" "github.com/geziyor/geziyor/client" 
"github.com/geziyor/geziyor/export" "github.com/geziyor/geziyor/metrics" @@ -15,18 +15,25 @@ type Options struct { // If empty, any domain is allowed AllowedDomains []string - // Set this to enable caching responses. - // Memory Cache: httpcache.NewMemoryCache() - // Disk Cache: diskcache.New(".cache") - Cache httpcache.Cache + // Cache storage backends. + // - Memory + // - Disk + // - LevelDB + Cache cache.Cache - // Charset Detection disable + // Policies for caching. + // - Dummy policy (default) + // - RFC2616 policy + CachePolicy cache.Policy + + // Response charset detection for decoding to UTF-8 CharsetDetectDisabled bool // Concurrent requests limit ConcurrentRequests int - // Concurrent requests per domain limit + // Concurrent requests per domain limit. Uses request.URL.Host + // Subdomains are different than top domain ConcurrentRequestsPerDomain int // If set true, cookies won't send.