Files
coredns/plugin/proxy/proxy.go

184 lines
5.2 KiB
Go
Raw Normal View History

// Package proxy is plugin that proxies requests.
2016-03-18 20:57:35 +00:00
package proxy
import (
"context"
2016-03-18 20:57:35 +00:00
"errors"
"fmt"
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
"net"
2016-03-18 20:57:35 +00:00
"sync/atomic"
"time"
"github.com/coredns/coredns/plugin"
"github.com/coredns/coredns/plugin/metrics"
"github.com/coredns/coredns/plugin/pkg/healthcheck"
"github.com/coredns/coredns/request"
2016-03-18 20:57:35 +00:00
"github.com/miekg/dns"
ot "github.com/opentracing/opentracing-go"
2016-03-18 20:57:35 +00:00
)
// Sentinel errors returned by the proxy plugin.
var (
errUnreachable = errors.New("unreachable backend") // no upstream host answered (or none selectable)
errInvalidProtocol = errors.New("invalid protocol") // unsupported transport/protocol in config
errInvalidDomain = errors.New("invalid path for proxy") // bad "from" domain in config
)
2016-03-18 20:57:35 +00:00
// Proxy represents a plugin instance that can proxy requests to another (DNS) server.
2016-03-18 20:57:35 +00:00
type Proxy struct {
Next plugin.Handler
// Upstreams is a pointer to a slice, so we can update the upstream (used for Google)
// midway.
Upstreams *[]Upstream
// Trace is the Trace plugin, if it is installed
// This is used by the grpc exchanger to trace through the grpc calls
Trace plugin.Handler
2016-03-18 20:57:35 +00:00
}
// Upstream manages a pool of proxy upstream hosts. Select should return a
// suitable upstream host, or nil if no such hosts are available.
type Upstream interface {
// The domain name this upstream host should be routed on.
From() string
// Selects an upstream host to be routed to.
Select() *healthcheck.UpstreamHost
2016-03-18 20:57:35 +00:00
// Checks if subpdomain is not an ignored.
IsAllowedDomain(string) bool
// Exchanger returns the exchanger to be used for this upstream.
Exchanger() Exchanger
// Stops the upstream from proxying requests to shutdown goroutines cleanly.
Stop() error
2016-03-18 20:57:35 +00:00
}
// tryDuration is how long to try upstream hosts; failures result in
// immediate retries until this duration ends or we get a nil host.
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
var tryDuration = 16 * time.Second
2016-03-18 20:57:35 +00:00
// ServeDNS satisfies the plugin.Handler interface.
func (p Proxy) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
var span, child ot.Span
span = ot.SpanFromContext(ctx)
state := request.Request{W: w, Req: r}
upstream := p.match(state)
if upstream == nil {
return plugin.NextOrFailure(p.Name(), p.Next, ctx, w, r)
}
for {
2016-03-18 20:57:35 +00:00
start := time.Now()
var reply *dns.Msg
var backendErr error
2016-03-18 20:57:35 +00:00
// Since Select() should give us "up" hosts, keep retrying
// hosts until timeout (or until we get a nil host).
for time.Since(start) < tryDuration {
2016-03-18 20:57:35 +00:00
host := upstream.Select()
if host == nil {
return dns.RcodeServerFailure, fmt.Errorf("%s: %s", errUnreachable, "no upstream host")
2016-03-18 20:57:35 +00:00
}
if span != nil {
child = span.Tracer().StartSpan("exchange", ot.ChildOf(span.Context()))
ctx = ot.ContextWithSpan(ctx, child)
}
2016-03-18 20:57:35 +00:00
atomic.AddInt64(&host.Conns, 1)
RequestCount.WithLabelValues(metrics.WithServer(ctx), state.Proto(), upstream.Exchanger().Protocol(), familyToString(state.Family()), host.Name).Add(1)
reply, backendErr = upstream.Exchanger().Exchange(ctx, host.Name, state)
2016-03-18 20:57:35 +00:00
atomic.AddInt64(&host.Conns, -1)
if child != nil {
child.Finish()
}
taperr := toDnstap(ctx, host.Name, upstream.Exchanger(), state, reply, start)
2016-03-18 20:57:35 +00:00
if backendErr == nil {
// Check if the reply is correct; if not return FormErr.
if !state.Match(reply) {
formerr := state.ErrorMessage(dns.RcodeFormatError)
w.WriteMsg(formerr)
return 0, taperr
}
w.WriteMsg(reply)
RequestDuration.WithLabelValues(metrics.WithServer(ctx), state.Proto(), upstream.Exchanger().Protocol(), familyToString(state.Family()), host.Name).Observe(time.Since(start).Seconds())
return 0, taperr
2016-03-18 20:57:35 +00:00
}
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
// A "ANY isc.org" query is being dropped by ISC's nameserver, we see this as a i/o timeout, but
// would then mark our upstream is being broken. We should not do this if we consider the error temporary.
// Of course it could really be that our upstream is broken
if oe, ok := backendErr.(*net.OpError); ok {
// Note this keeps looping and trying until tryDuration is hit, at which point our client
// might be long gone...
if oe.Timeout() {
2018-08-14 17:55:55 +02:00
// Our upstream's upstream is probably messing up, continue with next selected
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
// host - which my be the *same* one as we don't set any uh.Fails.
continue
}
}
2016-03-18 20:57:35 +00:00
timeout := host.FailTimeout
if timeout == 0 {
timeout = defaultFailTimeout
2016-03-18 20:57:35 +00:00
}
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
2016-03-18 20:57:35 +00:00
atomic.AddInt32(&host.Fails, 1)
fails := atomic.LoadInt32(&host.Fails)
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
go func(host *healthcheck.UpstreamHost, timeout time.Duration) {
2016-03-18 20:57:35 +00:00
time.Sleep(timeout)
// we may go negative here, should be rectified by the HC.
2016-03-18 20:57:35 +00:00
atomic.AddInt32(&host.Fails, -1)
if fails%failureCheck == 0 { // Kick off healthcheck on eveyry third failure.
host.HealthCheckURL()
}
2016-03-18 20:57:35 +00:00
}(host, timeout)
}
return dns.RcodeServerFailure, fmt.Errorf("%s: %s", errUnreachable, backendErr)
2016-03-18 20:57:35 +00:00
}
}
func (p Proxy) match(state request.Request) (u Upstream) {
if p.Upstreams == nil {
return nil
}
longestMatch := 0
for _, upstream := range *p.Upstreams {
from := upstream.From()
if !plugin.Name(from).Matches(state.Name()) || !upstream.IsAllowedDomain(state.Name()) {
continue
}
if lf := len(from); lf > longestMatch {
longestMatch = lf
u = upstream
}
}
return u
2016-03-18 20:57:35 +00:00
}
2016-10-27 11:48:37 +00:00
// Name implements the Handler interface.
func (p Proxy) Name() string { return "proxy" }
// Tuning knobs for upstream failure tracking.
const (
defaultFailTimeout = 2 * time.Second // how long a recorded failure counts against a host
defaultTimeout = 5 * time.Second // presumably the default exchange timeout — defined elsewhere in use; confirm
failureCheck = 3 // every failureCheck-th failure kicks off a healthcheck
)