Caching policies added.
The implementation is based on the httpcache library. Because httpcache cannot support different caching policies, most of its code was copied into this repository and modified.
This commit is contained in:
parent
0d6c2a6864
commit
90d2be2210
@ -8,7 +8,7 @@ Geziyor is a blazing fast web crawling and web scraping framework. It can be use
|
||||
## Features
|
||||
- 5.000+ Requests/Sec
|
||||
- JS Rendering
|
||||
- Caching (Memory/Disk)
|
||||
- Caching (Memory/Disk/LevelDB)
|
||||
- Automatic Data Exporting (JSON, CSV, or custom)
|
||||
- Metrics (Prometheus, Expvar, or custom)
|
||||
- Limit Concurrency (Global/Per Domain)
|
||||
|
625
cache/cache.go
vendored
Normal file
625
cache/cache.go
vendored
Normal file
@ -0,0 +1,625 @@
|
||||
// Package cache provides a http.RoundTripper implementation that works as a
|
||||
// mostly RFC-compliant cache for http responses.
|
||||
//
|
||||
// It is only suitable for use as a 'private' cache (i.e. for a web-browser or an API-client
|
||||
// and not for a shared proxy).
|
||||
//
|
||||
// Mostly borrowed from https://github.com/gregjones/httpcache. Customized for different policies.
|
||||
package cache
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"github.com/geziyor/geziyor/cache/memorycache"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Policy selects the caching strategy a Transport applies to requests.
type Policy int

const (
	// Dummy ignores every HTTP Cache-Control directive: each request and its
	// response are stored, and a repeated request is answered from the cache
	// without touching the network.
	//
	// It is meant for fast spider testing (no waiting for downloads), for
	// running a spider offline, and for "replaying" a run exactly as it
	// happened before.
	Dummy Policy = 0

	// RFC2616 is an RFC 2616 compliant HTTP cache, i.e. it honours HTTP
	// Cache-Control. It is aimed at production and continuous runs, where it
	// avoids downloading unmodified data (saving bandwidth and speeding up
	// crawls).
	RFC2616 Policy = 1
)
|
||||
|
||||
// Freshness states returned by getFreshness.
const (
	stale       = 0
	fresh       = 1
	transparent = 2
)

// XFromCache is the header added to responses that are returned from the cache.
const XFromCache = "X-From-Cache"
|
||||
|
||||
// Cache is the storage backend a Transport uses to keep and look up
// serialized responses.
type Cache interface {
	// Get returns the stored bytes for key; ok reports whether an entry
	// was found.
	Get(key string) (responseBytes []byte, ok bool)
	// Set stores responseBytes under key.
	Set(key string, responseBytes []byte)
	// Delete drops whatever is stored under key.
	Delete(key string)
}

// cacheKey derives the cache key for req: the URL alone for GET requests,
// and "<METHOD> <URL>" for every other method.
func cacheKey(req *http.Request) string {
	target := req.URL.String()
	if req.Method != http.MethodGet {
		return req.Method + " " + target
	}
	return target
}

// CachedResponse looks req up in c and, when an entry exists, parses it back
// into an *http.Response. It returns a nil response and a nil error when
// nothing is stored for req; a parse failure is returned as err.
func CachedResponse(c Cache, req *http.Request) (resp *http.Response, err error) {
	raw, found := c.Get(cacheKey(req))
	if !found {
		return nil, nil
	}
	reader := bufio.NewReader(bytes.NewBuffer(raw))
	return http.ReadResponse(reader, req)
}
|
||||
|
||||
// Transport is an implementation of http.RoundTripper that will return values from a cache
|
||||
// where possible (avoiding a network request) and will additionally add validators (etag/if-modified-since)
|
||||
// to repeated requests allowing servers to return 304 / Not Modified
|
||||
type Transport struct {
|
||||
Policy Policy
|
||||
// The RoundTripper interface actually used to make requests
|
||||
// If nil, http.DefaultTransport is used
|
||||
Transport http.RoundTripper
|
||||
Cache Cache
|
||||
// If true, responses returned from the cache will be given an extra header, X-From-Cache
|
||||
MarkCachedResponses bool
|
||||
}
|
||||
|
||||
// NewTransport returns a new Transport with the
|
||||
// provided Cache implementation and MarkCachedResponses set to true
|
||||
func NewTransport(c Cache) *Transport {
|
||||
return &Transport{
|
||||
Policy: RFC2616,
|
||||
Cache: c,
|
||||
MarkCachedResponses: true,
|
||||
}
|
||||
}
|
||||
|
||||
// Client returns an *http.Client that caches responses.
|
||||
func (t *Transport) Client() *http.Client {
|
||||
return &http.Client{Transport: t}
|
||||
}
|
||||
|
||||
// varyMatches will return false unless all of the cached values for the headers listed in Vary
|
||||
// match the new request
|
||||
func varyMatches(cachedResp *http.Response, req *http.Request) bool {
|
||||
for _, header := range headerAllCommaSepValues(cachedResp.Header, "vary") {
|
||||
header = http.CanonicalHeaderKey(header)
|
||||
if header != "" && req.Header.Get(header) != cachedResp.Header.Get("X-Varied-"+header) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// RoundTrip is a wrapper for caching requests.
|
||||
// If there is a fresh Response already in cache, then it will be returned without connecting to
|
||||
// the server.
|
||||
//
|
||||
func (t *Transport) RoundTrip(req *http.Request) (resp *http.Response, err error) {
|
||||
if t.Policy == Dummy {
|
||||
return t.RoundTripDummy(req)
|
||||
}
|
||||
return t.RoundTripRFC2616(req)
|
||||
}
|
||||
|
||||
// RoundTripDummy has no awareness of any HTTP Cache-Control directives.
|
||||
// Every request and its corresponding response are cached.
|
||||
// When the same request is seen again, the response is returned without transferring anything from the Internet.
|
||||
func (t *Transport) RoundTripDummy(req *http.Request) (resp *http.Response, err error) {
|
||||
cacheKey := cacheKey(req)
|
||||
cacheable := (req.Method == "GET" || req.Method == "HEAD") && req.Header.Get("range") == ""
|
||||
var cachedResp *http.Response
|
||||
if cacheable {
|
||||
cachedResp, err = CachedResponse(t.Cache, req)
|
||||
} else {
|
||||
// Need to invalidate an existing value
|
||||
t.Cache.Delete(cacheKey)
|
||||
}
|
||||
|
||||
transport := t.Transport
|
||||
if transport == nil {
|
||||
transport = http.DefaultTransport
|
||||
}
|
||||
|
||||
if cacheable && cachedResp != nil && err == nil {
|
||||
if t.MarkCachedResponses {
|
||||
cachedResp.Header.Set(XFromCache, "1")
|
||||
}
|
||||
return cachedResp, nil
|
||||
} else {
|
||||
resp, err = transport.RoundTrip(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if cacheable {
|
||||
respBytes, err := httputil.DumpResponse(resp, true)
|
||||
if err == nil {
|
||||
t.Cache.Set(cacheKey, respBytes)
|
||||
}
|
||||
} else {
|
||||
t.Cache.Delete(cacheKey)
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
// RoundTripRFC2616 provides a RFC2616 compliant HTTP cache, i.e. with HTTP Cache-Control awareness,
|
||||
// aimed at production and used in continuous runs to avoid downloading unmodified data
|
||||
// (to save bandwidth and speed up crawls).
|
||||
//
|
||||
// If there is a stale Response, then any validators it contains will be set on the new request
|
||||
// to give the server a chance to respond with NotModified. If this happens, then the cached Response
|
||||
// will be returned.
|
||||
func (t *Transport) RoundTripRFC2616(req *http.Request) (resp *http.Response, err error) {
|
||||
cacheKey := cacheKey(req)
|
||||
cacheable := (req.Method == "GET" || req.Method == "HEAD") && req.Header.Get("range") == ""
|
||||
var cachedResp *http.Response
|
||||
if cacheable {
|
||||
cachedResp, err = CachedResponse(t.Cache, req)
|
||||
} else {
|
||||
// Need to invalidate an existing value
|
||||
t.Cache.Delete(cacheKey)
|
||||
}
|
||||
|
||||
transport := t.Transport
|
||||
if transport == nil {
|
||||
transport = http.DefaultTransport
|
||||
}
|
||||
|
||||
if cacheable && cachedResp != nil && err == nil {
|
||||
if t.MarkCachedResponses {
|
||||
cachedResp.Header.Set(XFromCache, "1")
|
||||
}
|
||||
|
||||
if varyMatches(cachedResp, req) {
|
||||
// Can only use cached value if the new request doesn't Vary significantly
|
||||
freshness := getFreshness(cachedResp.Header, req.Header)
|
||||
if freshness == fresh {
|
||||
return cachedResp, nil
|
||||
}
|
||||
|
||||
if freshness == stale {
|
||||
var req2 *http.Request
|
||||
// Add validators if caller hasn't already done so
|
||||
etag := cachedResp.Header.Get("etag")
|
||||
if etag != "" && req.Header.Get("etag") == "" {
|
||||
req2 = cloneRequest(req)
|
||||
req2.Header.Set("if-none-match", etag)
|
||||
}
|
||||
lastModified := cachedResp.Header.Get("last-modified")
|
||||
if lastModified != "" && req.Header.Get("last-modified") == "" {
|
||||
if req2 == nil {
|
||||
req2 = cloneRequest(req)
|
||||
}
|
||||
req2.Header.Set("if-modified-since", lastModified)
|
||||
}
|
||||
if req2 != nil {
|
||||
req = req2
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resp, err = transport.RoundTrip(req)
|
||||
if err == nil && req.Method == "GET" && resp.StatusCode == http.StatusNotModified {
|
||||
// Replace the 304 response with the one from cache, but update with some new headers
|
||||
endToEndHeaders := getEndToEndHeaders(resp.Header)
|
||||
for _, header := range endToEndHeaders {
|
||||
cachedResp.Header[header] = resp.Header[header]
|
||||
}
|
||||
resp.Body.Close()
|
||||
resp = cachedResp
|
||||
} else if (err != nil || resp.StatusCode >= 500) &&
|
||||
req.Method == "GET" && canStaleOnError(cachedResp.Header, req.Header) {
|
||||
// In case of transport failure and stale-if-error activated, returns cached content
|
||||
// when available
|
||||
if resp != nil && resp.Body != nil {
|
||||
resp.Body.Close()
|
||||
}
|
||||
return cachedResp, nil
|
||||
} else {
|
||||
if err != nil || resp.StatusCode != http.StatusOK {
|
||||
t.Cache.Delete(cacheKey)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
} else {
|
||||
reqCacheControl := parseCacheControl(req.Header)
|
||||
if _, ok := reqCacheControl["only-if-cached"]; ok {
|
||||
resp = newGatewayTimeoutResponse(req)
|
||||
} else {
|
||||
resp, err = transport.RoundTrip(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if cacheable && canStore(parseCacheControl(req.Header), parseCacheControl(resp.Header)) {
|
||||
for _, varyKey := range headerAllCommaSepValues(resp.Header, "vary") {
|
||||
varyKey = http.CanonicalHeaderKey(varyKey)
|
||||
fakeHeader := "X-Varied-" + varyKey
|
||||
reqValue := req.Header.Get(varyKey)
|
||||
if reqValue != "" {
|
||||
resp.Header.Set(fakeHeader, reqValue)
|
||||
}
|
||||
}
|
||||
switch req.Method {
|
||||
case "GET":
|
||||
// Delay caching until EOF is reached.
|
||||
resp.Body = &cachingReadCloser{
|
||||
R: resp.Body,
|
||||
OnEOF: func(r io.Reader) {
|
||||
resp := *resp
|
||||
resp.Body = ioutil.NopCloser(r)
|
||||
respBytes, err := httputil.DumpResponse(&resp, true)
|
||||
if err == nil {
|
||||
t.Cache.Set(cacheKey, respBytes)
|
||||
}
|
||||
},
|
||||
}
|
||||
default:
|
||||
respBytes, err := httputil.DumpResponse(resp, true)
|
||||
if err == nil {
|
||||
t.Cache.Set(cacheKey, respBytes)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
t.Cache.Delete(cacheKey)
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
// ErrNoDateHeader indicates that the HTTP headers contained no Date header.
var ErrNoDateHeader = errors.New("no Date header")

// Date parses and returns the value of the Date header.
//
// It returns ErrNoDateHeader when the header is absent or empty. The value is
// parsed as RFC 1123 first (any zone abbreviation is accepted, as before);
// if that fails, the other date formats HTTP allows (RFC 850 and ANSI C
// asctime) are tried via http.ParseTime. When no format matches, the zero
// time and the RFC 1123 parse error are returned.
func Date(respHeaders http.Header) (date time.Time, err error) {
	dateHeader := respHeaders.Get("date")
	if dateHeader == "" {
		return time.Time{}, ErrNoDateHeader
	}

	date, err = time.Parse(time.RFC1123, dateHeader)
	if err == nil {
		return date, nil
	}
	// Servers may still emit the obsolete formats; recipients must accept them.
	if alt, altErr := http.ParseTime(dateHeader); altErr == nil {
		return alt, nil
	}
	return time.Time{}, err
}
|
||||
|
||||
// timer abstracts "time elapsed since d" so the time source can be swapped.
type timer interface {
	since(d time.Time) time.Duration
}

// realClock is the timer backed by the system clock.
type realClock struct{}

// since returns the time elapsed between start and now.
func (*realClock) since(start time.Time) time.Duration {
	return time.Now().Sub(start)
}

// clock is the package-wide time source used for age calculations.
var clock timer = new(realClock)
|
||||
|
||||
// getFreshness will return one of fresh/stale/transparent based on the cache-control
|
||||
// values of the request and the response
|
||||
//
|
||||
// fresh indicates the response can be returned
|
||||
// stale indicates that the response needs validating before it is returned
|
||||
// transparent indicates the response should not be used to fulfil the request
|
||||
//
|
||||
// Because this is only a private cache, 'public' and 'private' in cache-control aren't
|
||||
// signficant. Similarly, smax-age isn't used.
|
||||
func getFreshness(respHeaders, reqHeaders http.Header) (freshness int) {
|
||||
respCacheControl := parseCacheControl(respHeaders)
|
||||
reqCacheControl := parseCacheControl(reqHeaders)
|
||||
if _, ok := reqCacheControl["no-cache"]; ok {
|
||||
return transparent
|
||||
}
|
||||
if _, ok := respCacheControl["no-cache"]; ok {
|
||||
return stale
|
||||
}
|
||||
if _, ok := reqCacheControl["only-if-cached"]; ok {
|
||||
return fresh
|
||||
}
|
||||
|
||||
date, err := Date(respHeaders)
|
||||
if err != nil {
|
||||
return stale
|
||||
}
|
||||
currentAge := clock.since(date)
|
||||
|
||||
var lifetime time.Duration
|
||||
var zeroDuration time.Duration
|
||||
|
||||
// If a response includes both an Expires header and a max-age directive,
|
||||
// the max-age directive overrides the Expires header, even if the Expires header is more restrictive.
|
||||
if maxAge, ok := respCacheControl["max-age"]; ok {
|
||||
lifetime, err = time.ParseDuration(maxAge + "s")
|
||||
if err != nil {
|
||||
lifetime = zeroDuration
|
||||
}
|
||||
} else {
|
||||
expiresHeader := respHeaders.Get("Expires")
|
||||
if expiresHeader != "" {
|
||||
expires, err := time.Parse(time.RFC1123, expiresHeader)
|
||||
if err != nil {
|
||||
lifetime = zeroDuration
|
||||
} else {
|
||||
lifetime = expires.Sub(date)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if maxAge, ok := reqCacheControl["max-age"]; ok {
|
||||
// the client is willing to accept a response whose age is no greater than the specified time in seconds
|
||||
lifetime, err = time.ParseDuration(maxAge + "s")
|
||||
if err != nil {
|
||||
lifetime = zeroDuration
|
||||
}
|
||||
}
|
||||
if minfresh, ok := reqCacheControl["min-fresh"]; ok {
|
||||
// the client wants a response that will still be fresh for at least the specified number of seconds.
|
||||
minfreshDuration, err := time.ParseDuration(minfresh + "s")
|
||||
if err == nil {
|
||||
currentAge = time.Duration(currentAge + minfreshDuration)
|
||||
}
|
||||
}
|
||||
|
||||
if maxstale, ok := reqCacheControl["max-stale"]; ok {
|
||||
// Indicates that the client is willing to accept a response that has exceeded its expiration time.
|
||||
// If max-stale is assigned a value, then the client is willing to accept a response that has exceeded
|
||||
// its expiration time by no more than the specified number of seconds.
|
||||
// If no value is assigned to max-stale, then the client is willing to accept a stale response of any age.
|
||||
//
|
||||
// Responses served only because of a max-stale value are supposed to have a Warning header added to them,
|
||||
// but that seems like a hassle, and is it actually useful? If so, then there needs to be a different
|
||||
// return-value available here.
|
||||
if maxstale == "" {
|
||||
return fresh
|
||||
}
|
||||
maxstaleDuration, err := time.ParseDuration(maxstale + "s")
|
||||
if err == nil {
|
||||
currentAge = time.Duration(currentAge - maxstaleDuration)
|
||||
}
|
||||
}
|
||||
|
||||
if lifetime > currentAge {
|
||||
return fresh
|
||||
}
|
||||
|
||||
return stale
|
||||
}
|
||||
|
||||
// Returns true if either the request or the response includes the stale-if-error
|
||||
// cache control extension: https://tools.ietf.org/html/rfc5861
|
||||
func canStaleOnError(respHeaders, reqHeaders http.Header) bool {
|
||||
respCacheControl := parseCacheControl(respHeaders)
|
||||
reqCacheControl := parseCacheControl(reqHeaders)
|
||||
|
||||
var err error
|
||||
lifetime := time.Duration(-1)
|
||||
|
||||
if staleMaxAge, ok := respCacheControl["stale-if-error"]; ok {
|
||||
if staleMaxAge != "" {
|
||||
lifetime, err = time.ParseDuration(staleMaxAge + "s")
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
} else {
|
||||
return true
|
||||
}
|
||||
}
|
||||
if staleMaxAge, ok := reqCacheControl["stale-if-error"]; ok {
|
||||
if staleMaxAge != "" {
|
||||
lifetime, err = time.ParseDuration(staleMaxAge + "s")
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
} else {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
if lifetime >= 0 {
|
||||
date, err := Date(respHeaders)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
currentAge := clock.since(date)
|
||||
if lifetime > currentAge {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// getEndToEndHeaders returns the names of the headers in respHeaders that are
// end-to-end, i.e. not hop-by-hop. The order of the result is unspecified.
//
// Hop-by-hop headers are the fixed set below plus every header named in any
// Connection header line of the response.
func getEndToEndHeaders(respHeaders http.Header) []string {
	// These headers are always hop-by-hop.
	hopByHopHeaders := map[string]struct{}{
		"Connection":          {},
		"Keep-Alive":          {},
		"Proxy-Authenticate":  {},
		"Proxy-Authorization": {},
		"Te":                  {},
		"Trailers":            {},
		"Transfer-Encoding":   {},
		"Upgrade":             {},
	}

	// Any header listed in Connection is also hop-by-hop. Tokens must be
	// trimmed before canonicalising: CanonicalHeaderKey leaves a name with
	// surrounding spaces untouched, so " X-Foo" would never match "X-Foo".
	for _, line := range respHeaders[http.CanonicalHeaderKey("connection")] {
		for _, token := range strings.Split(line, ",") {
			if token = strings.TrimSpace(token); token != "" {
				hopByHopHeaders[http.CanonicalHeaderKey(token)] = struct{}{}
			}
		}
	}

	var endToEndHeaders []string
	for respHeader := range respHeaders {
		if _, hop := hopByHopHeaders[respHeader]; !hop {
			endToEndHeaders = append(endToEndHeaders, respHeader)
		}
	}
	return endToEndHeaders
}
|
||||
|
||||
func canStore(reqCacheControl, respCacheControl cacheControl) (canStore bool) {
|
||||
if _, ok := respCacheControl["no-store"]; ok {
|
||||
return false
|
||||
}
|
||||
if _, ok := reqCacheControl["no-store"]; ok {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// newGatewayTimeoutResponse builds an empty "504 Gateway Timeout" response
// associated with req. It panics if the hard-coded response text cannot be
// parsed.
func newGatewayTimeoutResponse(req *http.Request) *http.Response {
	raw := bytes.NewBufferString("HTTP/1.1 504 Gateway Timeout\r\n\r\n")
	resp, err := http.ReadResponse(bufio.NewReader(raw), req)
	if err != nil {
		panic(err)
	}
	return resp
}
|
||||
|
||||
// cloneRequest returns a clone of the provided *http.Request.
// The clone is a shallow copy of the struct with a deep copy of its Header:
// the map and every value slice are duplicated, so changing the clone's
// headers can never affect r.
// (This function copyright goauth2 authors: https://code.google.com/p/goauth2)
func cloneRequest(r *http.Request) *http.Request {
	// shallow copy of the struct
	r2 := new(http.Request)
	*r2 = *r
	// deep copy of the Header, including the value slices
	r2.Header = make(http.Header, len(r.Header))
	for k, s := range r.Header {
		r2.Header[k] = append([]string(nil), s...)
	}
	return r2
}
|
||||
|
||||
// cacheControl maps a Cache-Control directive name (lower-cased) to its value;
// directives without a value map to the empty string.
type cacheControl map[string]string

// parseCacheControl parses the Cache-Control header of headers into a
// cacheControl map.
//
// Directive names are case-insensitive in HTTP, so they are lower-cased.
// Whitespace around names and values is removed, so "max-age = 60" yields
// "60" rather than " 60" (which would fail duration parsing). Only the first
// "=" separates name from value. Only the first Cache-Control header line is
// read, and quoted values containing commas are not supported.
func parseCacheControl(headers http.Header) cacheControl {
	cc := cacheControl{}
	for _, part := range strings.Split(headers.Get("Cache-Control"), ",") {
		part = strings.TrimSpace(part)
		if part == "" {
			continue
		}
		keyval := strings.SplitN(part, "=", 2)
		name := strings.ToLower(strings.TrimSpace(keyval[0]))
		if len(keyval) == 2 {
			cc[name] = strings.TrimSpace(keyval[1])
		} else {
			cc[name] = ""
		}
	}
	return cc
}
|
||||
|
||||
// headerAllCommaSepValues collects every comma-separated value, trimmed of
// surrounding whitespace, from all occurrences of header name in headers.
//
// Per Section 4.2 of the HTTP/1.1 spec
// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.2), multiple
// occurrences of a header whose value is a comma-separated list are
// equivalent to one concatenated list.
func headerAllCommaSepValues(headers http.Header, name string) []string {
	var vals []string
	for _, line := range headers[http.CanonicalHeaderKey(name)] {
		for _, field := range strings.Split(line, ",") {
			vals = append(vals, strings.TrimSpace(field))
		}
	}
	return vals
}
|
||||
|
||||
// cachingReadCloser is a wrapper around ReadCloser R that calls the OnEOF
// handler with a full copy of the content read from R when EOF is reached.
type cachingReadCloser struct {
	// Underlying ReadCloser.
	R io.ReadCloser
	// OnEOF is called, at most once, with a copy of the content of R when
	// EOF is reached. A nil OnEOF is ignored.
	OnEOF func(io.Reader)

	buf      bytes.Buffer // buf stores a copy of the content of R.
	notified bool         // notified is set once OnEOF has been invoked.
}

// Read reads up to len(p) bytes from R and keeps a copy of them. The return
// values are those of R.Read. The first time R reports io.EOF, OnEOF is
// called with a full copy of everything read so far; further reads after EOF
// do not call it again, so the handler's work is not repeated.
func (r *cachingReadCloser) Read(p []byte) (n int, err error) {
	n, err = r.R.Read(p)
	r.buf.Write(p[:n])
	if err == io.EOF && !r.notified {
		r.notified = true
		if r.OnEOF != nil {
			r.OnEOF(bytes.NewReader(r.buf.Bytes()))
		}
	}
	return n, err
}

// Close closes the underlying ReadCloser. OnEOF is not called if the content
// was not read up to EOF.
func (r *cachingReadCloser) Close() error {
	return r.R.Close()
}
|
||||
|
||||
// PleaseCache excercises a Cache implementation.
|
||||
func PleaseCache(t *testing.T, cache Cache) {
|
||||
key := "testKey"
|
||||
_, ok := cache.Get(key)
|
||||
if ok {
|
||||
t.Fatal("retrieved key before adding it")
|
||||
}
|
||||
|
||||
val := []byte("some bytes")
|
||||
cache.Set(key, val)
|
||||
|
||||
retVal, ok := cache.Get(key)
|
||||
if !ok {
|
||||
t.Fatal("could not retrieve an element we just added")
|
||||
}
|
||||
if !bytes.Equal(retVal, val) {
|
||||
t.Fatal("retrieved a different value than what we put in")
|
||||
}
|
||||
|
||||
cache.Delete(key)
|
||||
|
||||
_, ok = cache.Get(key)
|
||||
if ok {
|
||||
t.Fatal("deleted key still present")
|
||||
}
|
||||
}
|
||||
|
||||
// NewMemoryCacheTransport returns a new Transport using the in-memory cache implementation
|
||||
func NewMemoryCacheTransport() *Transport {
|
||||
c := memorycache.New()
|
||||
t := NewTransport(c)
|
||||
return t
|
||||
}
|
1476
cache/cache_test.go
vendored
Normal file
1476
cache/cache_test.go
vendored
Normal file
File diff suppressed because it is too large
Load Diff
61
cache/diskcache/diskcache.go
vendored
Normal file
61
cache/diskcache/diskcache.go
vendored
Normal file
@ -0,0 +1,61 @@
|
||||
// Package diskcache provides an implementation of cache.Cache that uses the diskv package
|
||||
// to supplement an in-memory map with persistent storage
|
||||
//
|
||||
package diskcache
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/md5"
|
||||
"encoding/hex"
|
||||
"github.com/peterbourgon/diskv"
|
||||
"io"
|
||||
)
|
||||
|
||||
// Cache is an implementation of cache.Cache that supplements the in-memory map with persistent storage
|
||||
type Cache struct {
|
||||
d *diskv.Diskv
|
||||
}
|
||||
|
||||
// Get returns the response corresponding to key if present
|
||||
func (c *Cache) Get(key string) (resp []byte, ok bool) {
|
||||
key = keyToFilename(key)
|
||||
resp, err := c.d.Read(key)
|
||||
if err != nil {
|
||||
return []byte{}, false
|
||||
}
|
||||
return resp, true
|
||||
}
|
||||
|
||||
// Set saves a response to the cache as key
|
||||
func (c *Cache) Set(key string, resp []byte) {
|
||||
key = keyToFilename(key)
|
||||
_ = c.d.WriteStream(key, bytes.NewReader(resp), true)
|
||||
}
|
||||
|
||||
// Delete removes the response with key from the cache
|
||||
func (c *Cache) Delete(key string) {
|
||||
key = keyToFilename(key)
|
||||
_ = c.d.Erase(key)
|
||||
}
|
||||
|
||||
// keyToFilename maps a cache key to the name it is stored under: the
// hex-encoded MD5 digest of the key.
func keyToFilename(key string) string {
	hasher := md5.New()
	// The write result is deliberately ignored.
	_, _ = io.WriteString(hasher, key)
	digest := hasher.Sum(nil)
	return hex.EncodeToString(digest)
}
|
||||
|
||||
// New returns a new Cache that will store files in basePath
|
||||
func New(basePath string) *Cache {
|
||||
return &Cache{
|
||||
d: diskv.New(diskv.Options{
|
||||
BasePath: basePath,
|
||||
CacheSizeMax: 100 * 1024 * 1024, // 100MB
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
// NewWithDiskv returns a new Cache using the provided Diskv as underlying
|
||||
// storage.
|
||||
func NewWithDiskv(d *diskv.Diskv) *Cache {
|
||||
return &Cache{d}
|
||||
}
|
18
cache/diskcache/diskcache_test.go
vendored
Normal file
18
cache/diskcache/diskcache_test.go
vendored
Normal file
@ -0,0 +1,18 @@
|
||||
package diskcache
|
||||
|
||||
import (
|
||||
"github.com/geziyor/geziyor/cache"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestDiskCache(t *testing.T) {
|
||||
tempDir, err := ioutil.TempDir("", "cache")
|
||||
if err != nil {
|
||||
t.Fatalf("TempDir: %v", err)
|
||||
}
|
||||
defer os.RemoveAll(tempDir)
|
||||
|
||||
cache.PleaseCache(t, New(tempDir))
|
||||
}
|
51
cache/leveldbcache/leveldbcache.go
vendored
Normal file
51
cache/leveldbcache/leveldbcache.go
vendored
Normal file
@ -0,0 +1,51 @@
|
||||
// Package leveldbcache provides an implementation of cache.Cache that
|
||||
// uses github.com/syndtr/goleveldb/leveldb
|
||||
package leveldbcache
|
||||
|
||||
import (
|
||||
"github.com/syndtr/goleveldb/leveldb"
|
||||
)
|
||||
|
||||
// Cache is an implementation of cache.Cache with leveldb storage
|
||||
type Cache struct {
|
||||
Db *leveldb.DB
|
||||
}
|
||||
|
||||
// Get returns the response corresponding to key if present
|
||||
func (c *Cache) Get(key string) (resp []byte, ok bool) {
|
||||
var err error
|
||||
resp, err = c.Db.Get([]byte(key), nil)
|
||||
if err != nil {
|
||||
return []byte{}, false
|
||||
}
|
||||
return resp, true
|
||||
}
|
||||
|
||||
// Set saves a response to the cache as key
|
||||
func (c *Cache) Set(key string, resp []byte) {
|
||||
_ = c.Db.Put([]byte(key), resp, nil)
|
||||
}
|
||||
|
||||
// Delete removes the response with key from the cache
|
||||
func (c *Cache) Delete(key string) {
|
||||
_ = c.Db.Delete([]byte(key), nil)
|
||||
}
|
||||
|
||||
// New returns a new Cache that will store leveldb in path
|
||||
func New(path string) (*Cache, error) {
|
||||
cache := &Cache{}
|
||||
|
||||
var err error
|
||||
cache.Db, err = leveldb.OpenFile(path, nil)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return cache, nil
|
||||
}
|
||||
|
||||
// NewWithDB returns a new Cache using the provided leveldb as underlying
|
||||
// storage.
|
||||
func NewWithDB(db *leveldb.DB) *Cache {
|
||||
return &Cache{db}
|
||||
}
|
24
cache/leveldbcache/leveldbcache_test.go
vendored
Normal file
24
cache/leveldbcache/leveldbcache_test.go
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
package leveldbcache
|
||||
|
||||
import (
|
||||
"github.com/geziyor/geziyor/cache"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestDiskCache(t *testing.T) {
|
||||
tempDir, err := ioutil.TempDir("", "cache")
|
||||
if err != nil {
|
||||
t.Fatalf("TempDir: %v", err)
|
||||
}
|
||||
defer os.RemoveAll(tempDir)
|
||||
|
||||
c, err := New(filepath.Join(tempDir, "Db"))
|
||||
if err != nil {
|
||||
t.Fatalf("New leveldb,: %v", err)
|
||||
}
|
||||
|
||||
cache.PleaseCache(t, c)
|
||||
}
|
39
cache/memorycache/memorycache.go
vendored
Normal file
39
cache/memorycache/memorycache.go
vendored
Normal file
@ -0,0 +1,39 @@
|
||||
package memorycache
|
||||
|
||||
import (
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Cache is an implementation of Cache that stores responses in an in-memory
// map guarded by a read/write mutex. The zero value is an empty cache ready
// to use. A Cache must not be copied after first use.
type Cache struct {
	mu    sync.RWMutex
	items map[string][]byte
}

// Get returns the []byte representation of the response and true if present,
// false if not. The returned slice is the stored one, not a copy.
func (c *Cache) Get(key string) (resp []byte, ok bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	resp, ok = c.items[key]
	return resp, ok
}

// Set saves response resp to the cache with key. The slice is stored as is,
// without copying.
func (c *Cache) Set(key string, resp []byte) {
	c.mu.Lock()
	defer c.mu.Unlock()
	// Allocate lazily so a zero-value Cache does not panic on its first write.
	if c.items == nil {
		c.items = make(map[string][]byte)
	}
	c.items[key] = resp
}

// Delete removes key from the cache. Deleting an absent key is a no-op.
func (c *Cache) Delete(key string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.items, key)
}

// New returns a new Cache that will store items in an in-memory map.
func New() *Cache {
	return &Cache{items: make(map[string][]byte)}
}
|
10
geziyor.go
10
geziyor.go
@ -1,7 +1,7 @@
|
||||
package geziyor
|
||||
|
||||
import (
|
||||
"github.com/fpfeng/httpcache"
|
||||
"github.com/geziyor/geziyor/cache"
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"github.com/geziyor/geziyor/metrics"
|
||||
"github.com/geziyor/geziyor/middleware"
|
||||
@ -69,8 +69,12 @@ func NewGeziyor(opt *Options) *Geziyor {
|
||||
// Client
|
||||
geziyor.Client = client.NewClient(opt.MaxBodySize, opt.CharsetDetectDisabled, opt.RetryTimes, opt.RetryHTTPCodes)
|
||||
if opt.Cache != nil {
|
||||
geziyor.Client.Transport = &httpcache.Transport{
|
||||
Transport: geziyor.Client.Transport, Cache: opt.Cache, MarkCachedResponses: true}
|
||||
geziyor.Client.Transport = &cache.Transport{
|
||||
Policy: opt.CachePolicy,
|
||||
Transport: geziyor.Client.Transport,
|
||||
Cache: opt.Cache,
|
||||
MarkCachedResponses: true,
|
||||
}
|
||||
}
|
||||
if opt.Timeout != 0 {
|
||||
geziyor.Client.Timeout = opt.Timeout
|
||||
|
@ -4,8 +4,9 @@ import (
|
||||
"fmt"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/fortytw2/leaktest"
|
||||
"github.com/fpfeng/httpcache"
|
||||
"github.com/geziyor/geziyor"
|
||||
"github.com/geziyor/geziyor/cache"
|
||||
"github.com/geziyor/geziyor/cache/diskcache"
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"github.com/geziyor/geziyor/export"
|
||||
"github.com/geziyor/geziyor/metrics"
|
||||
@ -28,12 +29,13 @@ func TestCache(t *testing.T) {
|
||||
defer leaktest.Check(t)()
|
||||
geziyor.NewGeziyor(&geziyor.Options{
|
||||
StartURLs: []string{"http://api.ipify.org"},
|
||||
Cache: httpcache.NewMemoryCache(),
|
||||
ParseFunc: func(g *geziyor.Geziyor, r *client.Response) {
|
||||
fmt.Println(string(r.Body))
|
||||
g.Exports <- string(r.Body)
|
||||
g.Get("http://api.ipify.org", nil)
|
||||
},
|
||||
Cache: diskcache.New(".cache"),
|
||||
CachePolicy: cache.RFC2616,
|
||||
}).Start()
|
||||
}
|
||||
|
||||
|
4
go.mod
4
go.mod
@ -8,11 +8,13 @@ require (
|
||||
github.com/chromedp/cdproto v0.0.0-20190609032908-dd39f0bf0a54
|
||||
github.com/chromedp/chromedp v0.3.1-0.20190617065505-d55cf9043e05
|
||||
github.com/fortytw2/leaktest v1.3.0
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3
|
||||
github.com/go-kit/kit v0.8.0
|
||||
github.com/google/btree v1.0.0 // indirect
|
||||
github.com/peterbourgon/diskv v2.0.1+incompatible
|
||||
github.com/pkg/errors v0.8.1
|
||||
github.com/prometheus/client_golang v1.0.0
|
||||
github.com/stretchr/testify v1.3.0
|
||||
github.com/syndtr/goleveldb v1.0.0
|
||||
github.com/temoto/robotstxt v1.1.1
|
||||
golang.org/x/net v0.0.0-20190522155817-f3200d17e092
|
||||
golang.org/x/text v0.3.2
|
||||
|
28
go.sum
28
go.sum
@ -18,8 +18,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
|
||||
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3 h1:yoJ3JExhshZwcfvvQLLRqKICf4/p4gZ/mDzdQV1hRWQ=
|
||||
github.com/fpfeng/httpcache v0.0.0-20181220155740-6b8f16a92be3/go.mod h1:QThlC5qj7EJa+O2HqbxzVGWLspJQHQLVsKmBtbg9ak8=
|
||||
github.com/fsnotify/fsnotify v1.4.7 h1:IXs+QLmnXW2CcXuY+8Mzv/fWEsPGWxqefPtCP5CnV9I=
|
||||
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
|
||||
github.com/go-kit/kit v0.8.0 h1:Wz+5lgoB0kkuqLEc6NVmwRknTKP6dTGbSqvhZtBI/j0=
|
||||
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
|
||||
github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
|
||||
@ -34,6 +34,12 @@ github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7a
|
||||
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
|
||||
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w=
|
||||
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||
github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo=
|
||||
github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
|
||||
github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
|
||||
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
|
||||
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
|
||||
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
|
||||
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls=
|
||||
@ -47,6 +53,13 @@ github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
|
||||
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
|
||||
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
|
||||
github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs=
|
||||
github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
|
||||
github.com/onsi/gomega v1.4.3 h1:RE1xgDvH7imwFD45h+u2SgIfERHlS2yNG4DObb5BSKU=
|
||||
github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
|
||||
github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI=
|
||||
github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU=
|
||||
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
|
||||
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
@ -69,18 +82,23 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
|
||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE=
|
||||
github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ=
|
||||
github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
|
||||
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
|
||||
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a h1:gOpx8G595UYyvj8UK4+OFyY4rx037g3fmfhe5SasG3U=
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20190522155817-f3200d17e092 h1:4QSRKanuywn15aTZvI/mIDEgPQpswuFndXpOj3rKEco=
|
||||
golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
||||
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190610081024-1e42afee0f76 h1:QSmW7Q3mFdAGjtAd0byXmFJ55inUydyZ4WQmiuItAIQ=
|
||||
@ -90,5 +108,11 @@ golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
|
||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=
|
||||
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
|
||||
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
|
||||
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
|
||||
gopkg.in/yaml.v2 v2.2.1 h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE=
|
||||
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
|
21
options.go
21
options.go
@ -1,7 +1,7 @@
|
||||
package geziyor
|
||||
|
||||
import (
|
||||
"github.com/fpfeng/httpcache"
|
||||
"github.com/geziyor/geziyor/cache"
|
||||
"github.com/geziyor/geziyor/client"
|
||||
"github.com/geziyor/geziyor/export"
|
||||
"github.com/geziyor/geziyor/metrics"
|
||||
@ -15,18 +15,25 @@ type Options struct {
|
||||
// If empty, any domain is allowed
|
||||
AllowedDomains []string
|
||||
|
||||
// Set this to enable caching responses.
|
||||
// Memory Cache: httpcache.NewMemoryCache()
|
||||
// Disk Cache: diskcache.New(".cache")
|
||||
Cache httpcache.Cache
|
||||
// Cache storage backends.
|
||||
// - Memory
|
||||
// - Disk
|
||||
// - LevelDB
|
||||
Cache cache.Cache
|
||||
|
||||
// Charset Detection disable
|
||||
// Policies for caching.
|
||||
// - Dummy policy (default)
|
||||
// - RFC2616 policy
|
||||
CachePolicy cache.Policy
|
||||
|
||||
// Response charset detection for decoding to UTF-8
|
||||
CharsetDetectDisabled bool
|
||||
|
||||
// Concurrent requests limit
|
||||
ConcurrentRequests int
|
||||
|
||||
// Concurrent requests per domain limit
|
||||
// Concurrent requests per domain limit. Uses request.URL.Host
|
||||
// Subdomains are different than top domain
|
||||
ConcurrentRequestsPerDomain int
|
||||
|
||||
// If set true, cookies won't send.
|
||||
|
Loading…
x
Reference in New Issue
Block a user