fix(proxy): avoid Dial hang after Transport stopped (#7321)

Ensure Dial returns early with an error when the Transport has been
stopped, instead of blocking on the dial or ret channels. This removes
a potential goroutine leak where callers could pile up waiting
forever under heavy load.

Add select guards before send and receive, and propagate clear error
values so callers can handle shutdown gracefully.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>
This commit is contained in:
Ville Vesilehto
2025-05-28 16:58:48 +03:00
committed by GitHub
parent 5857ad173d
commit 0a48523083
2 changed files with 299 additions and 14 deletions

View File

@@ -18,6 +18,13 @@ import (
"github.com/miekg/dns"
)
// Error message strings returned by Transport.Dial when the transport has
// been stopped. These are plain strings, not error values: Dial passes each
// one to errors.New at the return site, so every returned error is a distinct
// value and callers must compare on err.Error() rather than with errors.Is.
const (
// ErrTransportStopped is used when the stop channel is already readable
// before Dial attempts to send its dial request.
ErrTransportStopped = "proxy: transport stopped"
// ErrTransportStoppedDuringDial is used when stop fires while Dial is
// waiting to send the protocol on the dial channel.
ErrTransportStoppedDuringDial = "proxy: transport stopped during dial"
// ErrTransportStoppedRetClosed is used when the receive on the ret channel
// reports that the channel has been closed.
ErrTransportStoppedRetClosed = "proxy: transport stopped, ret channel closed"
// ErrTransportStoppedDuringRetWait is used when stop fires while Dial is
// waiting for a response on the ret channel.
ErrTransportStoppedDuringRetWait = "proxy: transport stopped during ret wait"
)
// limitTimeout is a utility function to auto-tune timeout values
// average observed time is moved towards the last observed delay moderated by a weight
// next timeout to use will be the double of the computed average, limited by min and max frame.
@@ -52,25 +59,48 @@ func (t *Transport) Dial(proto string) (*persistConn, bool, error) {
proto = "tcp-tls"
}
t.dial <- proto
pc := <-t.ret
if pc != nil {
connCacheHitsCount.WithLabelValues(t.proxyName, t.addr, proto).Add(1)
return pc, true, nil
// Check if transport is stopped before attempting to dial
select {
case <-t.stop:
return nil, false, errors.New(ErrTransportStopped)
default:
}
connCacheMissesCount.WithLabelValues(t.proxyName, t.addr, proto).Add(1)
reqTime := time.Now()
timeout := t.dialTimeout()
if proto == "tcp-tls" {
conn, err := dns.DialTimeoutWithTLS("tcp", t.addr, t.tlsConfig, timeout)
// Use select to avoid blocking if connManager has stopped
select {
case t.dial <- proto:
// Successfully sent dial request
case <-t.stop:
return nil, false, errors.New(ErrTransportStoppedDuringDial)
}
// Receive response with stop awareness
select {
case pc, ok := <-t.ret:
if !ok {
// ret channel was closed by connManager during stop
return nil, false, errors.New(ErrTransportStoppedRetClosed)
}
if pc != nil {
connCacheHitsCount.WithLabelValues(t.proxyName, t.addr, proto).Add(1)
return pc, true, nil
}
connCacheMissesCount.WithLabelValues(t.proxyName, t.addr, proto).Add(1)
reqTime := time.Now()
timeout := t.dialTimeout()
if proto == "tcp-tls" {
conn, err := dns.DialTimeoutWithTLS("tcp", t.addr, t.tlsConfig, timeout)
t.updateDialTimeout(time.Since(reqTime))
return &persistConn{c: conn}, false, err
}
conn, err := dns.DialTimeout(proto, t.addr, timeout)
t.updateDialTimeout(time.Since(reqTime))
return &persistConn{c: conn}, false, err
case <-t.stop:
return nil, false, errors.New(ErrTransportStoppedDuringRetWait)
}
conn, err := dns.DialTimeout(proto, t.addr, timeout)
t.updateDialTimeout(time.Since(reqTime))
return &persistConn{c: conn}, false, err
}
// Connect selects an upstream, sends the request and waits for a response.