diff --git a/plugin/proxy/README.md b/plugin/proxy/README.md index 484a2e7ae..c23df8c42 100644 --- a/plugin/proxy/README.md +++ b/plugin/proxy/README.md @@ -41,10 +41,10 @@ proxy FROM TO... { communicate with it. The default value is 10 seconds ("10s"). * `max_fails` is the number of failures within fail_timeout that are needed before considering a backend to be down. If 0, the backend will never be marked as down. Default is 1. -* `health_check` will check path (on port) on each backend. If a backend returns a status code of +* `health_check` will check **PATH** (on **PORT**) on each backend. If a backend returns a status code of 200-399, then that backend is marked healthy for double the healthcheck duration. If it doesn't, it is marked as unhealthy and no requests are routed to it. If this option is not provided then - health checks are disabled. The default duration is 30 seconds ("30s"). + health checks are disabled. The default duration is 4 seconds ("4s"). * **IGNORED_NAMES** in `except` is a space-separated list of domains to exclude from proxying. Requests that match none of these names will be passed through. * `spray` when all backends are unhealthy, randomly pick one to send the traffic to. (This is diff --git a/plugin/proxy/google.go b/plugin/proxy/google.go index d7248e149..91d4cba26 100644 --- a/plugin/proxy/google.go +++ b/plugin/proxy/google.go @@ -192,9 +192,9 @@ func newUpstream(hosts []string, old *staticUpstream) Upstream { upstream := &staticUpstream{ from: old.from, HealthCheck: healthcheck.HealthCheck{ - FailTimeout: 10 * time.Second, + FailTimeout: 5 * time.Second, MaxFails: 3, - Future: 60 * time.Second, + Future: 12 * time.Second, }, ex: old.ex, IgnoredSubDomains: old.IgnoredSubDomains, diff --git a/plugin/proxy/lookup.go b/plugin/proxy/lookup.go index 238666608..fc0f3e01f 100644 --- a/plugin/proxy/lookup.go +++ b/plugin/proxy/lookup.go @@ -5,6 +5,7 @@ package proxy import ( "context" "fmt" + "net" "sync/atomic" "time" @@ -26,9 +27,9 @@ func NewLookupWithOption(hosts []string, opts Options) Proxy { upstream := &staticUpstream{ from: ".", HealthCheck: healthcheck.HealthCheck{ - FailTimeout: 10 * time.Second, - MaxFails: 3, // TODO(miek): disable error checking for simple lookups? - Future: 60 * time.Second, + FailTimeout: 5 * time.Second, + MaxFails: 3, + Future: 12 * time.Second, }, ex: newDNSExWithOption(opts), } @@ -85,7 +86,7 @@ func (p Proxy) lookup(state request.Request) (*dns.Msg, error) { } // duplicated from proxy.go, but with a twist, we don't write the - // reply back to the client, we return it and there is no monitoring. + // reply back to the client, we return it and there is no monitoring to update here. atomic.AddInt64(&host.Conns, 1) @@ -96,11 +97,20 @@ func (p Proxy) lookup(state request.Request) (*dns.Msg, error) { if backendErr == nil { return reply, nil } + + if oe, ok := backendErr.(*net.OpError); ok { + if oe.Timeout() { // see proxy.go for docs. + continue + } + } + timeout := host.FailTimeout if timeout == 0 { - timeout = 10 * time.Second + timeout = 2 * time.Second } + atomic.AddInt32(&host.Fails, 1) + go func(host *healthcheck.UpstreamHost, timeout time.Duration) { time.Sleep(timeout) atomic.AddInt32(&host.Fails, -1) diff --git a/plugin/proxy/proxy.go b/plugin/proxy/proxy.go index 9d1e1906b..b2c713a53 100644 --- a/plugin/proxy/proxy.go +++ b/plugin/proxy/proxy.go @@ -4,6 +4,7 @@ package proxy import ( "errors" "fmt" + "net" "sync/atomic" "time" @@ -56,7 +57,7 @@ type Upstream interface { // tryDuration is how long to try upstream hosts; failures result in // immediate retries until this duration ends or we get a nil host. -var tryDuration = 60 * time.Second +var tryDuration = 16 * time.Second // ServeDNS satisfies the plugin.Handler interface. func (p Proxy) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) { @@ -112,11 +113,26 @@ func (p Proxy) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) ( return 0, taperr } + // A "ANY isc.org" query is being dropped by ISC's nameserver, we see this as a i/o timeout, but + // would then mark our upstream is being broken. We should not do this if we consider the error temporary. + // Of course it could really be that our upstream is broken + if oe, ok := backendErr.(*net.OpError); ok { + // Note this keeps looping and trying until tryDuration is hit, at which point our client + // might be long gone... + if oe.Timeout() { + // Our upstream's upstream is problably messing up, continue with next selected + // host - which my be the *same* one as we don't set any uh.Fails. + continue + } + } + timeout := host.FailTimeout if timeout == 0 { - timeout = 10 * time.Second + timeout = 2 * time.Second } + atomic.AddInt32(&host.Fails, 1) + go func(host *healthcheck.UpstreamHost, timeout time.Duration) { time.Sleep(timeout) atomic.AddInt32(&host.Fails, -1) diff --git a/plugin/proxy/upstream.go b/plugin/proxy/upstream.go index f7ad58ea8..0ab29de51 100644 --- a/plugin/proxy/upstream.go +++ b/plugin/proxy/upstream.go @@ -31,9 +31,9 @@ func NewStaticUpstreams(c *caddyfile.Dispenser) ([]Upstream, error) { upstream := &staticUpstream{ from: ".", HealthCheck: healthcheck.HealthCheck{ - FailTimeout: 10 * time.Second, - MaxFails: 1, - Future: 60 * time.Second, + FailTimeout: 5 * time.Second, + MaxFails: 3, + Future: 12 * time.Second, }, ex: newDNSEx(), } @@ -121,7 +121,7 @@ func parseBlock(c *caddyfile.Dispenser, u *staticUpstream) error { if err != nil { return err } - u.HealthCheck.Interval = 30 * time.Second + u.HealthCheck.Interval = 4 * time.Second if c.NextArg() { dur, err := time.ParseDuration(c.Val()) if err != nil {