fix: No failover to next upstream when receiving SERVFAIL or REFUSED response codes (#7457) (#7458)

Fitz_dev
2025-09-13 05:45:01 +08:00
committed by GitHub
parent 155f451957
commit 9683de0feb
5 changed files with 166 additions and 0 deletions

View File

@@ -52,6 +52,7 @@ forward FROM TO... {
max_concurrent MAX
next RCODE_1 [RCODE_2] [RCODE_3...]
failfast_all_unhealthy_upstreams
failover RCODE_1 [RCODE_2] [RCODE_3...]
}
~~~
@@ -99,6 +100,7 @@ forward FROM TO... {
As an upper bound for **MAX**, consider that each concurrent query will use about 2kb of memory.
* `next` If the `RCODE` (e.g. `NXDOMAIN`) is returned by the remote, then execute the next plugin. If no next plugin is defined, or the next plugin is not a `forward` plugin, this setting is ignored.
* `failfast_all_unhealthy_upstreams` - determines the handling of requests when all upstream servers are unhealthy and unresponsive to health checks. Enabling this option will immediately return SERVFAIL responses for all requests. By default, requests are sent to a random upstream.
* `failover` - By default, when a DNS lookup fails to return a DNS response (e.g. a timeout), _forward_ attempts the lookup on the next upstream server. The `failover` option makes _forward_ do the same for any response whose response code matches an `RCODE` (e.g. `SERVFAIL`, `REFUSED`). If all upstreams have been tried, the response from the last attempt is returned.
Also note the TLS config is "global" for the whole forwarding proxy; if you need a different
`tls_servername` for different upstreams, you're out of luck.
@@ -287,6 +289,17 @@ The following would try 1.2.3.4 first. If the response is `NXDOMAIN`, try 5.6.7.
}
~~~
In the following example, if the response from `1.2.3.4` is `SERVFAIL` or `REFUSED`, it will try `5.6.7.8`. If the response from `5.6.7.8` is `SERVFAIL` or `REFUSED`, it will try `9.0.1.2`.
~~~ corefile
. {
forward . 1.2.3.4 5.6.7.8 9.0.1.2 {
policy sequential
failover SERVFAIL REFUSED
}
}
~~~
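Because `failover` is evaluated before `next` (see the `ServeDNS` change below), the two options can be combined. A hypothetical Corefile reusing the addresses above: a `SERVFAIL` from `1.2.3.4` is retried on `5.6.7.8`, while an `NXDOMAIN` falls through to the second _forward_ block.
~~~ corefile
. {
    forward . 1.2.3.4 5.6.7.8 {
        policy sequential
        failover SERVFAIL
        next NXDOMAIN
    }
    forward . 9.0.1.2
}
~~~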
## See Also
[RFC 7858](https://tools.ietf.org/html/rfc7858) for DNS over TLS.

View File

@@ -51,6 +51,7 @@ type Forward struct {
expire time.Duration
maxConcurrent int64
failfastUnhealthyUpstreams bool
failoverRcodes []int
opts proxyPkg.Options // also here for testing
@@ -206,6 +207,21 @@ func (f *Forward) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
return 0, nil
}
// If the response code matches a configured failover Rcode and an untried
// upstream remains, retry the query on the next upstream in the list.
tryNext := false
for _, failoverRcode := range f.failoverRcodes {
if failoverRcode == ret.Rcode && fails < len(f.proxies) {
tryNext = true
break
}
}
if tryNext {
fails++
continue
}
// Check if we have an alternate Rcode defined, check if we match on the code
for _, alternateRcode := range f.nextAlternateRcodes {
if alternateRcode == ret.Rcode && f.Next != nil { // In case we do not have a Next handler, just continue normally
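The added block boils down to one question per response: does the Rcode match a configured failover Rcode while an untried upstream remains? A standalone sketch of that decision, using the illustrative helper name `shouldFailover` (not part of the commit):
~~~ go
// shouldFailover reports whether a response should be retried on the next
// upstream: its rcode must match one of the configured failover rcodes, and
// there must still be an upstream left to try (fails < numProxies).
func shouldFailover(rcode int, failoverRcodes []int, fails, numProxies int) bool {
	for _, rc := range failoverRcodes {
		if rc == rcode && fails < numProxies {
			return true
		}
	}
	return false
}
~~~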

View File

@@ -312,6 +312,27 @@ func parseBlock(c *caddy.Controller, f *Forward) error {
return c.ArgErr()
}
f.failfastUnhealthyUpstreams = true
case "failover":
args := c.RemainingArgs()
if len(args) == 0 {
return c.ArgErr()
}
toRcode := dns.StringToRcode
for _, rcode := range args {
var rc int
var ok bool
if rc, ok = toRcode[strings.ToUpper(rcode)]; !ok {
if rc == dns.RcodeSuccess {
return fmt.Errorf("NoError cannot be used in failover")
}
return fmt.Errorf("%s is not a valid rcode", rcode)
}
f.failoverRcodes = append(f.failoverRcodes, rc)
}
default:
return c.Errf("unknown property '%s'", c.Val())
}
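As a quick check of the parsing above, the directive can be driven through `caddy.NewTestController` and `parseForward`, the same helpers the tests below use. A minimal sketch, assuming it sits alongside the existing tests in the forward package (the test name is illustrative, not part of the commit):
~~~ go
func TestFailoverParsing(t *testing.T) {
	// Valid rcodes are accepted case-insensitively.
	c := caddy.NewTestController("dns", "forward . 127.0.0.1 {\nfailover ServFail REFUSED\n}")
	if _, err := parseForward(c); err != nil {
		t.Fatalf("expected 'failover ServFail REFUSED' to parse: %s", err)
	}
	// NOERROR is rejected explicitly; unknown names are rejected as invalid rcodes.
	for _, bad := range []string{"NOERROR", "BOGUS"} {
		c = caddy.NewTestController("dns", "forward . 127.0.0.1 {\nfailover "+bad+"\n}")
		if _, err := parseForward(c); err == nil {
			t.Fatalf("expected 'failover %s' to be rejected", bad)
		}
	}
}
~~~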

View File

@@ -1,6 +1,8 @@
package forward
import (
"context"
"fmt"
"os"
"reflect"
"strings"
@@ -8,7 +10,9 @@ import (
"github.com/coredns/caddy"
"github.com/coredns/coredns/core/dnsserver"
"github.com/coredns/coredns/plugin/pkg/dnstest"
"github.com/coredns/coredns/plugin/pkg/proxy"
"github.com/coredns/coredns/plugin/test"
"github.com/miekg/dns"
)
@@ -423,3 +427,72 @@ func TestFailfastAllUnhealthyUpstreams(t *testing.T) {
}
}
}
func TestFailover(t *testing.T) {
serverFail := dnstest.NewMultipleServer(func(w dns.ResponseWriter, r *dns.Msg) {
ret := new(dns.Msg)
ret.SetRcode(r, dns.RcodeServerFailure)
w.WriteMsg(ret)
})
defer serverFail.Close()
serverRefused := dnstest.NewMultipleServer(func(w dns.ResponseWriter, r *dns.Msg) {
ret := new(dns.Msg)
ret.SetRcode(r, dns.RcodeRefused)
w.WriteMsg(ret)
})
defer serverRefused.Close()
s := dnstest.NewMultipleServer(func(w dns.ResponseWriter, r *dns.Msg) {
ret := new(dns.Msg)
ret.SetReply(r)
ret.Answer = append(ret.Answer, test.A("example.org. IN A 127.0.0.1"))
w.WriteMsg(ret)
})
defer s.Close()
tests := []struct {
input string
hasRecord bool
failMsg string
}{
{fmt.Sprintf(
`forward . %s %s %s {
policy sequential
failover ServFail Refused
}`, serverFail.Addr, serverRefused.Addr, s.Addr), true, "If failover is set, records should be returned as long as one of the upstreams is working"},
{fmt.Sprintf(
`forward . %s %s %s {
policy sequential
}`, serverFail.Addr, serverRefused.Addr, s.Addr), false, "If failover is not set and the first upstream is not working, no records should be returned"},
{fmt.Sprintf(
`forward . %s %s %s {
policy sequential
}`, s.Addr, serverFail.Addr, serverRefused.Addr), true, "Even though failover is not set, a record should be returned as long as the first upstream is working"},
}
for _, testCase := range tests {
c := caddy.NewTestController("dns", testCase.input)
fs, err := parseForward(c)
if err != nil {
t.Fatalf("Failed to create forwarder: %s", err)
}
f := fs[0]
f.OnStartup()
defer f.OnShutdown()
m := new(dns.Msg)
m.SetQuestion("example.org.", dns.TypeA)
rec := dnstest.NewRecorder(&test.ResponseWriter{})
if _, err := f.ServeDNS(context.TODO(), rec, m); err != nil {
t.Fatalf("Expected to receive reply, but got: %s", err)
}
if (len(rec.Msg.Answer) > 0) != testCase.hasRecord {
t.Errorf(" %s: \n %s", testCase.failMsg, testCase.input)
}
}
}

View File

@@ -58,6 +58,49 @@ func NewServer(f dns.HandlerFunc) *Server {
return &Server{s1: s1, s2: s2, Addr: s2.Listener.Addr().String()}
}
// NewMultipleServer starts and returns a new Server. It is a variant of NewServer for
// tests that need to run several servers at once. The caller should call Close when
// finished, to shut it down.
func NewMultipleServer(f dns.HandlerFunc) *Server {
ch1 := make(chan bool)
ch2 := make(chan bool)
s1 := &dns.Server{
Handler: dns.HandlerFunc(f),
} // udp
s2 := &dns.Server{
Handler: dns.HandlerFunc(f),
} // tcp
for range 5 { // 5 attempts
s2.Listener, _ = reuseport.Listen("tcp", ":0")
if s2.Listener == nil {
continue
}
s1.PacketConn, _ = net.ListenPacket("udp", s2.Listener.Addr().String())
if s1.PacketConn != nil {
break
}
// perhaps the UDP port is in use, try again
s2.Listener.Close()
s2.Listener = nil
}
if s2.Listener == nil {
panic("dnstest.NewServer(): failed to create new server")
}
s1.NotifyStartedFunc = func() { close(ch1) }
s2.NotifyStartedFunc = func() { close(ch2) }
go s1.ActivateAndServe()
go s2.ActivateAndServe()
<-ch1
<-ch2
return &Server{s1: s1, s2: s2, Addr: s2.Listener.Addr().String()}
}
// Close shuts down the server.
func (s *Server) Close() {
s.s1.Shutdown()