plugin/forward: add max_age option to enforce an absolute connection lifetime (#7903)

* plugin/pkg/proxy: add max_age for per-connection lifetime cap

Introduce a max_age setting on Transport that closes connections based
on creation time, independent of idle-timeout (expire).

Background: PR #7790 changed the connection pool from LIFO to FIFO for
source-port diversity. Under FIFO, every connection is cycled through
the pool and its used timestamp is refreshed continuously. When the request
rate is high enough that pool_size / request_rate < expire, no
connection ever becomes idle and expire never fires. This prevents
CoreDNS from opening new connections to upstreams that scale out (e.g.
new Kubernetes pods behind a ClusterIP service with conntrack pinning).

max_age addresses this by enforcing an absolute upper bound on
connection lifetime regardless of activity:
- persistConn gains a created field set at dial time.
- Transport gains maxAge (default 0 = unlimited, preserving existing
  behaviour).
- Dial(): rejects cached connections whose creation age exceeds max_age.
- cleanup(): when maxAge > 0, uses a linear scan over both idle-timeout
  and max-age predicates; when maxAge == 0, preserves the original
  binary-search path on used time (sorted by FIFO insertion order).
- Both hot paths pre-compute the deadline outside any inner loop to
  avoid repeated time.Now() calls.

Tests added:
- TestMaxAgeExpireByCreation: connection with old created but fresh used
  must be rejected even though idle-timeout would pass.
- TestMaxAgeFIFORotation: three FIFO-rotated connections (old created,
  fresh used) must all be rejected, confirming that continuous rotation
  cannot prevent max-age expiry.

Signed-off-by: cangming <cangming@cangming.app>

* plugin/forward: add max_age option

Expose Transport.SetMaxAge through the forward plugin so operators can
set an absolute upper bound on connection lifetime via the Corefile.

Usage:
  forward . 1.2.3.4 {
      max_age 30s
  }

Default is 0 (unlimited), which preserves existing behaviour.
A positive value causes connections older than max_age to be closed and
re-dialled on the next request, ensuring CoreDNS reconnects to newly
scaled-out upstream pods even under sustained load where the idle
timeout (expire) would never fire.

If max_age is set, it must not be less than expire; the parser rejects a
max_age smaller than expire at startup with a clear error message.

Signed-off-by: cangming <cangming@cangming.app>

---------

Signed-off-by: cangming <cangming@cangming.app>
This commit is contained in:
cangming
2026-03-10 02:50:03 +08:00
committed by GitHub
parent bbd54ad288
commit 500707c43a
7 changed files with 231 additions and 9 deletions

View File

@@ -98,7 +98,12 @@ func TestCleanupAll(t *testing.T) {
c2, _ := dns.DialTimeout("udp", tr.addr, maxDialTimeout)
c3, _ := dns.DialTimeout("udp", tr.addr, maxDialTimeout)
tr.conns[typeUDP] = []*persistConn{{c1, time.Now()}, {c2, time.Now()}, {c3, time.Now()}}
now := time.Now()
tr.conns[typeUDP] = []*persistConn{
{c: c1, created: now, used: now},
{c: c2, created: now, used: now},
{c: c3, created: now, used: now},
}
if len(tr.conns[typeUDP]) != 3 {
t.Error("Expected 3 connections")
@@ -226,6 +231,89 @@ func TestYieldAfterStop(t *testing.T) {
}
}
// TestMaxAgeExpireByCreation verifies that Dial does not hand back a pooled
// connection whose creation time is older than max_age, even if it was
// recently yielded (fresh used time). This guards against the FIFO rotation
// bug where the used time is continually refreshed, preventing connections
// from expiring by idle-timeout alone.
func TestMaxAgeExpireByCreation(t *testing.T) {
	s := dnstest.NewServer(func(w dns.ResponseWriter, r *dns.Msg) {
		ret := new(dns.Msg)
		ret.SetReply(r)
		w.WriteMsg(ret)
	})
	defer s.Close()

	tr := newTransport("TestMaxAgeExpireByCreation", s.Addr)
	tr.SetExpire(10 * time.Second)       // long idle-timeout: would not expire the connection
	tr.SetMaxAge(100 * time.Millisecond) // short max-age: should close old connection
	tr.Start()
	defer tr.Stop()

	// Inject a connection whose creation time is past max_age but whose used
	// time is fresh, simulating a FIFO-rotated connection that is never idle.
	oldConn, err := dns.DialTimeout("udp", tr.addr, maxDialTimeout)
	if err != nil {
		t.Fatalf("Failed to dial: %v", err)
	}
	// Take a single clock reading so created and used are derived from the
	// same instant.
	now := time.Now()
	pc := &persistConn{
		c:       oldConn,
		created: now.Add(-200 * time.Millisecond), // 2x max-age: should be closed
		used:    now,                              // freshly used: idle-timeout would pass
	}

	tr.mu.Lock()
	tr.conns[typeUDP] = []*persistConn{pc}
	tr.mu.Unlock()

	// A failed Dial also reports cached == false, so the error must be checked
	// or the assertion below could pass without exercising max_age at all.
	got, cached, err := tr.Dial("udp")
	if err != nil {
		t.Fatalf("Dial failed: %v", err)
	}
	// The returned connection is never yielded back to the transport in this
	// test, so close it here rather than leaving the socket open.
	defer got.c.Close()

	if cached {
		t.Error("connection should be closed by max_age, not reused despite fresh used time")
	}
}
// TestMaxAgeFIFORotation verifies that connections in a FIFO pool are rejected
// by max_age even when continuously rotated (which refreshes their used
// timestamps). Regression test for upstream scale-up: new upstream pods should
// receive traffic after existing connections exceed max_age, regardless of
// request rate.
func TestMaxAgeFIFORotation(t *testing.T) {
	s := dnstest.NewServer(func(w dns.ResponseWriter, r *dns.Msg) {
		ret := new(dns.Msg)
		ret.SetReply(r)
		w.WriteMsg(ret)
	})
	defer s.Close()

	tr := newTransport("TestMaxAgeFIFORotation", s.Addr)
	tr.SetExpire(10 * time.Second)       // long idle-timeout: FIFO rotation keeps connections alive
	tr.SetMaxAge(100 * time.Millisecond) // max-age: connections must be closed by creation age
	tr.Start()
	defer tr.Stop()

	// Inject 3 connections old by creation time but with fresh used timestamps,
	// simulating active FIFO rotation where idle-timeout never triggers.
	for range 3 {
		// Dial before taking the lock so the mutex is not held across network
		// I/O and a dial failure needs no manual unlock.
		c, err := dns.DialTimeout("udp", tr.addr, maxDialTimeout)
		if err != nil {
			t.Fatalf("Failed to dial: %v", err)
		}
		now := time.Now()
		stale := &persistConn{
			c:       c,
			created: now.Add(-200 * time.Millisecond), // exceeds max-age
			used:    now,                              // fresh: idle-timeout would pass
		}

		tr.mu.Lock()
		tr.conns[typeUDP] = append(tr.conns[typeUDP], stale)
		tr.mu.Unlock()
	}

	// All 3 connections must be rejected by max_age despite fresh used timestamps.
	for i := range 3 {
		// A failed Dial also reports cached == false; check the error so the
		// assertion cannot pass vacuously.
		got, cached, err := tr.Dial("udp")
		if err != nil {
			t.Fatalf("Dial %d: unexpected error: %v", i+1, err)
		}
		// The connection is deliberately not yielded back (that would make the
		// next Dial see a legitimately fresh pooled connection), so close it
		// right away instead of leaking the socket.
		got.c.Close()

		if cached {
			t.Errorf("Dial %d: connection should be closed by max_age (FIFO rotation must not prevent max-age expiry)", i+1)
		}
	}
}
func BenchmarkYield(b *testing.B) {
s := dnstest.NewServer(func(w dns.ResponseWriter, r *dns.Msg) {
ret := new(dns.Msg)