plugin/forward: add max_age option to enforce an absolute connection lifetime (#7903)

* plugin/pkg/proxy: add max_age for per-connection lifetime cap

Introduce a max_age setting on Transport that closes connections based
on creation time, independent of idle-timeout (expire).

Background: PR #7790 changed the connection pool from LIFO to FIFO for
source-port diversity. Under FIFO, every connection is cycled through
the pool and its used timestamp is refreshed continuously. When request
rate is high enough that pool_size / request_rate < expire, no
connection ever becomes idle and expire never fires. This prevents
CoreDNS from opening new connections to upstreams that scale out (e.g.
new Kubernetes pods behind a ClusterIP service with conntrack pinning).

max_age addresses this by enforcing an absolute upper bound on
connection lifetime regardless of activity:
- persistConn gains a created field set at dial time.
- Transport gains maxAge (default 0 = unlimited, preserving existing
  behaviour).
- Dial(): rejects cached connections whose creation age exceeds max_age.
- cleanup(): when maxAge > 0, uses a linear scan over both idle-timeout
  and max-age predicates; when maxAge == 0, preserves the original
  binary-search path on used time (sorted by FIFO insertion order).
- Both hot paths pre-compute the deadline outside any inner loop to
  avoid repeated time.Now() calls.

Tests added:
- TestMaxAgeExpireByCreation: a connection with an old created timestamp
  but a fresh used timestamp must be rejected, even though the
  idle-timeout check alone would accept it.
- TestMaxAgeFIFORotation: three FIFO-rotated connections (each with an old
  created timestamp and a fresh used timestamp) must all be rejected,
  confirming that continuous rotation cannot prevent max-age expiry.

Signed-off-by: cangming <cangming@cangming.app>

* plugin/forward: add max_age option

Expose Transport.SetMaxAge through the forward plugin so operators can
set an absolute upper bound on connection lifetime via the Corefile.

Usage:
  forward . 1.2.3.4 {
      max_age 30s
  }

Default is 0 (unlimited), which preserves existing behaviour.
A positive value causes connections older than max_age to be closed and
re-dialled on the next request, ensuring CoreDNS reconnects to newly
scaled-out upstream pods even under sustained load where the idle
timeout (expire) would never fire.

If max_age is set, it must not be less than expire; the parser rejects
this combination at startup with a clear error message.

Signed-off-by: cangming <cangming@cangming.app>

---------

Signed-off-by: cangming <cangming@cangming.app>
This commit is contained in:
cangming
2026-03-10 02:50:03 +08:00
committed by GitHub
parent bbd54ad288
commit 500707c43a
7 changed files with 231 additions and 9 deletions

View File

@@ -49,6 +49,7 @@ type Forward struct {
tlsServerName string
maxfails uint32
expire time.Duration
maxAge time.Duration
maxIdleConns int
maxConcurrent int64
failfastUnhealthyUpstreams bool

View File

@@ -157,6 +157,10 @@ func parseStanza(c *caddy.Controller) (*Forward, error) {
}
}
if f.maxAge > 0 && f.maxAge < f.expire {
return f, fmt.Errorf("max_age (%s) must not be less than expire (%s)", f.maxAge, f.expire)
}
tlsServerNames := make([]string, len(toHosts))
perServerNameProxyCount := make(map[string]int)
transports := make([]string, len(toHosts))
@@ -207,6 +211,7 @@ func parseStanza(c *caddy.Controller) (*Forward, error) {
}
}
f.proxies[i].SetExpire(f.expire)
f.proxies[i].SetMaxAge(f.maxAge)
f.proxies[i].SetMaxIdleConns(f.maxIdleConns)
f.proxies[i].GetHealthchecker().SetRecursionDesired(f.opts.HCRecursionDesired)
// when TLS is used, checks are set to tcp-tls
@@ -323,6 +328,18 @@ func parseBlock(c *caddy.Controller, f *Forward) error {
return fmt.Errorf("expire can't be negative: %s", dur)
}
f.expire = dur
case "max_age":
if !c.NextArg() {
return c.ArgErr()
}
dur, err := time.ParseDuration(c.Val())
if err != nil {
return err
}
if dur < 0 {
return fmt.Errorf("max_age can't be negative: %s", dur)
}
f.maxAge = dur
case "max_idle_conns":
if !c.NextArg() {
return c.ArgErr()

View File

@@ -707,3 +707,78 @@ func TestFailoverValidation(t *testing.T) {
})
}
}
func TestSetupMaxAge(t *testing.T) {
tests := []struct {
name string
input string
shouldErr bool
expectedVal time.Duration
expectedErr string
}{
{
name: "default (no max_age)",
input: "forward . 127.0.0.1\n",
expectedVal: 0,
},
{
name: "valid max_age",
input: "forward . 127.0.0.1 {\nmax_age 30s\n}\n",
expectedVal: 30 * time.Second,
},
{
name: "max_age equal to expire",
input: "forward . 127.0.0.1 {\nexpire 10s\nmax_age 10s\n}\n",
expectedVal: 10 * time.Second,
},
{
name: "max_age zero (unlimited)",
input: "forward . 127.0.0.1 {\nmax_age 0s\n}\n",
expectedVal: 0,
},
{
name: "negative max_age",
input: "forward . 127.0.0.1 {\nmax_age -1s\n}\n",
shouldErr: true,
expectedErr: "negative",
},
{
name: "invalid max_age value",
input: "forward . 127.0.0.1 {\nmax_age invalid\n}\n",
shouldErr: true,
expectedErr: "invalid",
},
{
name: "max_age less than expire",
input: "forward . 127.0.0.1 {\nexpire 30s\nmax_age 10s\n}\n",
shouldErr: true,
expectedErr: "max_age",
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
c := caddy.NewTestController("dns", test.input)
fs, err := parseForward(c)
if test.shouldErr {
if err == nil {
t.Errorf("expected error but found none for input %s", test.input)
return
}
if !strings.Contains(err.Error(), test.expectedErr) {
t.Errorf("expected error to contain %q, got: %v", test.expectedErr, err)
}
return
}
if err != nil {
t.Errorf("expected no error but found: %v", err)
return
}
if fs[0].maxAge != test.expectedVal {
t.Errorf("expected maxAge %v, got %v", test.expectedVal, fs[0].maxAge)
}
})
}
}