mirror of
https://github.com/coredns/coredns.git
synced 2025-10-28 08:44:17 -04:00
plugin/proxy: decrease health timeouts (#1107)
Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
This commit is contained in:
@@ -41,10 +41,10 @@ proxy FROM TO... {
|
|||||||
communicate with it. The default value is 10 seconds ("10s").
|
communicate with it. The default value is 10 seconds ("10s").
|
||||||
* `max_fails` is the number of failures within fail_timeout that are needed before considering
|
* `max_fails` is the number of failures within fail_timeout that are needed before considering
|
||||||
a backend to be down. If 0, the backend will never be marked as down. Default is 1.
|
a backend to be down. If 0, the backend will never be marked as down. Default is 1.
|
||||||
* `health_check` will check path (on port) on each backend. If a backend returns a status code of
|
* `health_check` will check **PATH** (on **PORT**) on each backend. If a backend returns a status code of
|
||||||
200-399, then that backend is marked healthy for double the healthcheck duration. If it doesn't,
|
200-399, then that backend is marked healthy for double the healthcheck duration. If it doesn't,
|
||||||
it is marked as unhealthy and no requests are routed to it. If this option is not provided then
|
it is marked as unhealthy and no requests are routed to it. If this option is not provided then
|
||||||
health checks are disabled. The default duration is 30 seconds ("30s").
|
health checks are disabled. The default duration is 4 seconds ("4s").
|
||||||
* **IGNORED_NAMES** in `except` is a space-separated list of domains to exclude from proxying.
|
* **IGNORED_NAMES** in `except` is a space-separated list of domains to exclude from proxying.
|
||||||
Requests that match none of these names will be passed through.
|
Requests that match none of these names will be passed through.
|
||||||
* `spray` when all backends are unhealthy, randomly pick one to send the traffic to. (This is
|
* `spray` when all backends are unhealthy, randomly pick one to send the traffic to. (This is
|
||||||
|
|||||||
@@ -192,9 +192,9 @@ func newUpstream(hosts []string, old *staticUpstream) Upstream {
|
|||||||
upstream := &staticUpstream{
|
upstream := &staticUpstream{
|
||||||
from: old.from,
|
from: old.from,
|
||||||
HealthCheck: healthcheck.HealthCheck{
|
HealthCheck: healthcheck.HealthCheck{
|
||||||
FailTimeout: 10 * time.Second,
|
FailTimeout: 5 * time.Second,
|
||||||
MaxFails: 3,
|
MaxFails: 3,
|
||||||
Future: 60 * time.Second,
|
Future: 12 * time.Second,
|
||||||
},
|
},
|
||||||
ex: old.ex,
|
ex: old.ex,
|
||||||
IgnoredSubDomains: old.IgnoredSubDomains,
|
IgnoredSubDomains: old.IgnoredSubDomains,
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ package proxy
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"net"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -26,9 +27,9 @@ func NewLookupWithOption(hosts []string, opts Options) Proxy {
|
|||||||
upstream := &staticUpstream{
|
upstream := &staticUpstream{
|
||||||
from: ".",
|
from: ".",
|
||||||
HealthCheck: healthcheck.HealthCheck{
|
HealthCheck: healthcheck.HealthCheck{
|
||||||
FailTimeout: 10 * time.Second,
|
FailTimeout: 5 * time.Second,
|
||||||
MaxFails: 3, // TODO(miek): disable error checking for simple lookups?
|
MaxFails: 3,
|
||||||
Future: 60 * time.Second,
|
Future: 12 * time.Second,
|
||||||
},
|
},
|
||||||
ex: newDNSExWithOption(opts),
|
ex: newDNSExWithOption(opts),
|
||||||
}
|
}
|
||||||
@@ -85,7 +86,7 @@ func (p Proxy) lookup(state request.Request) (*dns.Msg, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// duplicated from proxy.go, but with a twist, we don't write the
|
// duplicated from proxy.go, but with a twist, we don't write the
|
||||||
// reply back to the client, we return it and there is no monitoring.
|
// reply back to the client, we return it and there is no monitoring to update here.
|
||||||
|
|
||||||
atomic.AddInt64(&host.Conns, 1)
|
atomic.AddInt64(&host.Conns, 1)
|
||||||
|
|
||||||
@@ -96,11 +97,20 @@ func (p Proxy) lookup(state request.Request) (*dns.Msg, error) {
|
|||||||
if backendErr == nil {
|
if backendErr == nil {
|
||||||
return reply, nil
|
return reply, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if oe, ok := backendErr.(*net.OpError); ok {
|
||||||
|
if oe.Timeout() { // see proxy.go for docs.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
timeout := host.FailTimeout
|
timeout := host.FailTimeout
|
||||||
if timeout == 0 {
|
if timeout == 0 {
|
||||||
timeout = 10 * time.Second
|
timeout = 2 * time.Second
|
||||||
}
|
}
|
||||||
|
|
||||||
atomic.AddInt32(&host.Fails, 1)
|
atomic.AddInt32(&host.Fails, 1)
|
||||||
|
|
||||||
go func(host *healthcheck.UpstreamHost, timeout time.Duration) {
|
go func(host *healthcheck.UpstreamHost, timeout time.Duration) {
|
||||||
time.Sleep(timeout)
|
time.Sleep(timeout)
|
||||||
atomic.AddInt32(&host.Fails, -1)
|
atomic.AddInt32(&host.Fails, -1)
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ package proxy
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"net"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -56,7 +57,7 @@ type Upstream interface {
|
|||||||
|
|
||||||
// tryDuration is how long to try upstream hosts; failures result in
|
// tryDuration is how long to try upstream hosts; failures result in
|
||||||
// immediate retries until this duration ends or we get a nil host.
|
// immediate retries until this duration ends or we get a nil host.
|
||||||
var tryDuration = 60 * time.Second
|
var tryDuration = 16 * time.Second
|
||||||
|
|
||||||
// ServeDNS satisfies the plugin.Handler interface.
|
// ServeDNS satisfies the plugin.Handler interface.
|
||||||
func (p Proxy) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
|
func (p Proxy) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
|
||||||
@@ -112,11 +113,26 @@ func (p Proxy) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (
|
|||||||
return 0, taperr
|
return 0, taperr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A "ANY isc.org" query is being dropped by ISC's nameserver, we see this as a i/o timeout, but
|
||||||
|
// would then mark our upstream is being broken. We should not do this if we consider the error temporary.
|
||||||
|
// Of course it could really be that our upstream is broken
|
||||||
|
if oe, ok := backendErr.(*net.OpError); ok {
|
||||||
|
// Note this keeps looping and trying until tryDuration is hit, at which point our client
|
||||||
|
// might be long gone...
|
||||||
|
if oe.Timeout() {
|
||||||
|
// Our upstream's upstream is problably messing up, continue with next selected
|
||||||
|
// host - which my be the *same* one as we don't set any uh.Fails.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
timeout := host.FailTimeout
|
timeout := host.FailTimeout
|
||||||
if timeout == 0 {
|
if timeout == 0 {
|
||||||
timeout = 10 * time.Second
|
timeout = 2 * time.Second
|
||||||
}
|
}
|
||||||
|
|
||||||
atomic.AddInt32(&host.Fails, 1)
|
atomic.AddInt32(&host.Fails, 1)
|
||||||
|
|
||||||
go func(host *healthcheck.UpstreamHost, timeout time.Duration) {
|
go func(host *healthcheck.UpstreamHost, timeout time.Duration) {
|
||||||
time.Sleep(timeout)
|
time.Sleep(timeout)
|
||||||
atomic.AddInt32(&host.Fails, -1)
|
atomic.AddInt32(&host.Fails, -1)
|
||||||
|
|||||||
@@ -31,9 +31,9 @@ func NewStaticUpstreams(c *caddyfile.Dispenser) ([]Upstream, error) {
|
|||||||
upstream := &staticUpstream{
|
upstream := &staticUpstream{
|
||||||
from: ".",
|
from: ".",
|
||||||
HealthCheck: healthcheck.HealthCheck{
|
HealthCheck: healthcheck.HealthCheck{
|
||||||
FailTimeout: 10 * time.Second,
|
FailTimeout: 5 * time.Second,
|
||||||
MaxFails: 1,
|
MaxFails: 3,
|
||||||
Future: 60 * time.Second,
|
Future: 12 * time.Second,
|
||||||
},
|
},
|
||||||
ex: newDNSEx(),
|
ex: newDNSEx(),
|
||||||
}
|
}
|
||||||
@@ -121,7 +121,7 @@ func parseBlock(c *caddyfile.Dispenser, u *staticUpstream) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
u.HealthCheck.Interval = 30 * time.Second
|
u.HealthCheck.Interval = 4 * time.Second
|
||||||
if c.NextArg() {
|
if c.NextArg() {
|
||||||
dur, err := time.ParseDuration(c.Val())
|
dur, err := time.ParseDuration(c.Val())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
Reference in New Issue
Block a user