Files
coredns/plugin/proxy/lookup.go

123 lines
3.3 KiB
Go
Raw Normal View History

package proxy
// functions other plugin might want to use to do lookup in the same style as the proxy.
import (
"context"
"fmt"
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
"net"
"sync/atomic"
"time"
"github.com/coredns/coredns/plugin/pkg/healthcheck"
"github.com/coredns/coredns/request"
"github.com/miekg/dns"
)
// NewLookup create a new proxy with the hosts in host and a Random policy.
func NewLookup(hosts []string) Proxy { return NewLookupWithOption(hosts, Options{}) }
// NewLookupWithOption process creates a simple round robin forward with potentially forced proto for upstream.
func NewLookupWithOption(hosts []string, opts Options) Proxy {
p := Proxy{Next: nil}
// TODO(miek): this needs to be unified with upstream.go's NewStaticUpstreams, caddy uses NewHost
// we should copy/make something similar.
upstream := &staticUpstream{
from: ".",
HealthCheck: healthcheck.HealthCheck{
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
FailTimeout: 5 * time.Second,
MaxFails: 3,
},
ex: newDNSExWithOption(opts),
}
upstream.Hosts = make([]*healthcheck.UpstreamHost, len(hosts))
for i, host := range hosts {
uh := &healthcheck.UpstreamHost{
Name: host,
FailTimeout: upstream.FailTimeout,
CheckDown: checkDownFunc(upstream),
}
upstream.Hosts[i] = uh
}
p.Upstreams = &[]Upstream{upstream}
2016-03-21 21:22:23 +00:00
return p
}
// Lookup will use name and type to forge a new message and will send that upstream. It will
// set any EDNS0 options correctly so that downstream will be able to process the reply.
func (p Proxy) Lookup(state request.Request, name string, typ uint16) (*dns.Msg, error) {
req := new(dns.Msg)
req.SetQuestion(name, typ)
state.SizeAndDo(req)
state2 := request.Request{W: state.W, Req: req}
return p.lookup(state2)
}
// Forward forward the request in state as-is. Unlike Lookup that adds EDNS0 suffix to the message.
func (p Proxy) Forward(state request.Request) (*dns.Msg, error) {
return p.lookup(state)
}
func (p Proxy) lookup(state request.Request) (*dns.Msg, error) {
upstream := p.match(state)
if upstream == nil {
return nil, errInvalidDomain
}
for {
start := time.Now()
reply := new(dns.Msg)
var backendErr error
// Since Select() should give us "up" hosts, keep retrying
// hosts until timeout (or until we get a nil host).
for time.Since(start) < tryDuration {
host := upstream.Select()
if host == nil {
return nil, fmt.Errorf("%s: %s", errUnreachable, "no upstream host")
}
// duplicated from proxy.go, but with a twist, we don't write the
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
// reply back to the client, we return it and there is no monitoring to update here.
atomic.AddInt64(&host.Conns, 1)
reply, backendErr = upstream.Exchanger().Exchange(context.TODO(), host.Name, state)
atomic.AddInt64(&host.Conns, -1)
if backendErr == nil {
return reply, nil
}
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
if oe, ok := backendErr.(*net.OpError); ok {
if oe.Timeout() { // see proxy.go for docs.
continue
}
}
timeout := host.FailTimeout
if timeout == 0 {
timeout = defaultFailTimeout
}
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
atomic.AddInt32(&host.Fails, 1)
fails := atomic.LoadInt32(&host.Fails)
plugin/proxy: decrease health timeouts (#1107) Turn down the timeouts and numbers a bit: FailTimeout 10s -> 5s Future 60s -> 12s TryDuration 60s -> 16s The timeout for decrementing the fails in a host: 10s -> 2s And the biggest change: don't set fails when the error is Timeout(), meaning we loop for a bit and may try the same server again, but we don't mark our upstream as bad, see comments in proxy.go. Testing this with "ANY isc.org" and "MX miek.nl" we see: ~~~ ::1 - [24/Sep/2017:08:06:17 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.001621221s 24/Sep/2017:08:06:17 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:37420->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:17 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 35.957284ms 127.0.0.1 - [24/Sep/2017:08:06:18 +0100] "ANY IN isc.org. udp 37 false 4096" SERVFAIL qr,rd 37 10.002051726s 24/Sep/2017:08:06:18 +0100 [ERROR 0 isc.org. ANY] unreachable backend: read udp 192.168.1.148:54901->8.8.8.8:53: i/o timeout ::1 - [24/Sep/2017:08:06:19 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 56.848416ms 127.0.0.1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 48.118349ms ::1 - [24/Sep/2017:08:06:21 +0100] "MX IN miek.nl. udp 37 false 4096" NOERROR qr,rd,ra,ad 170 1.055172915s ~~~ So the ANY isc.org queries show up twice, because we retry internally - this is I think WAI. The `miek.nl MX` queries are just processed normally as no backend is marked as unreachable. May fix #1035 #486
2017-09-24 20:05:36 +01:00
go func(host *healthcheck.UpstreamHost, timeout time.Duration) {
time.Sleep(timeout)
atomic.AddInt32(&host.Fails, -1)
if fails%failureCheck == 0 { // Kick off healthcheck on eveyry third failure.
host.HealthCheckURL()
}
}(host, timeout)
}
return nil, fmt.Errorf("%s: %s", errUnreachable, backendErr)
}
}