mirror of
https://github.com/coredns/coredns.git
perf(proxy): use mutex-based connection pool (#7790)
* perf(proxy): use mutex-based connection pool

  The proxy package (used, for example, by the forward plugin) used an actor model in which a single connManager goroutine managed connection pooling via unbuffered channels (dial, yield, ret). This design serialized all connection acquisition and release operations through one goroutine, creating a bottleneck under high concurrency. It was observable as a performance degradation when using a single upstream backend compared to multiple backends (which sharded the bottleneck).

  Changes:
  - Removed the dial, yield, and ret channels from the Transport struct.
  - Removed the connManager goroutine's request processing loop.
  - Implemented Dial() and Yield() using a sync.Mutex to protect the connection slice, allowing fast concurrent access without context switching.
  - Downgraded connManager to a simple background cleanup loop that only handles connection expiration on a ticker.
  - Updated plugin/pkg/proxy/connect.go to use direct method calls instead of channel sends.
  - Updated tests to reflect the removal of the internal channels.

  Benchmarks show that this change eliminates the single-backend bottleneck: a single upstream backend now performs on par with multiple backends, and overall throughput is improved. The implementation aligns with standard Go patterns for connection pooling (e.g., net/http.Transport).

  Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* fix: address PR review for persistent.go

  - Name the mutex field instead of embedding it, so Lock() and Unlock() are not exposed.
  - Move the stop check outside of the lock in Yield().
  - Close() without a separate goroutine.
  - Change the stop channel to chan struct{}.

  Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* fix: address code review feedback for conn pool

  - Switch from LIFO to FIFO connection selection for source port diversity, reducing DNS cache poisoning risk (RFC 5452).
  - Remove the "clear entire cache" optimization, as it was LIFO-specific; FIFO naturally iterates and skips expired connections.
  - Remove all goroutines for closing connections; collect connections while holding the lock, then close them synchronously after releasing it.

  Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* fix: remove unused error consts

  No longer used after refactoring away the channel-based approach.

  Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* feat(forward): add max_idle_conns option

  Add a configurable connection pool limit for the forward plugin via the max_idle_conns Corefile option.

  Changes:
  - Add SetMaxIdleConns to proxy.
  - Add a maxIdleConns field to the Forward struct.
  - Add max_idle_conns parsing to the forward plugin setup.
  - Apply the setting to each proxy during configuration.
  - Update the forward plugin README with the new option.

  By default the value is 0 (unbounded). When set, excess connections returned to the pool are closed immediately rather than cached. Also add a Yield-related test.

  Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* chore(proxy): simplify Dial by closing conns inline

  Remove the toClose slice collection; instead, close expired connections directly while iterating. Reduces complexity with negligible lock-time impact.

  Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* chore: fewer explicit Unlock calls

  Cleaner, and less chance of forgetting to unlock on new code paths.

  Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

---------

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>
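The max_idle_conns option added above is configured per forward block in the Corefile. As a hedged illustration only (the option name and the 0-means-unbounded default come from the commit message; the upstream address and the value 10 are made up, and the forward plugin README remains the authoritative reference), a configuration capping the idle pool might look like:

    forward . 8.8.8.8 {
        max_idle_conns 10
    }

With such a limit in place, up to that many idle connections are kept per configured proxy and any excess connection returned to the pool is closed immediately rather than cached; omitting the option keeps the unbounded default.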
diff --git a/plugin/pkg/proxy/persistent.go b/plugin/pkg/proxy/persistent.go
@@ -1,7 +1,6 @@
-// Package proxy implements a forwarding proxy. It caches an upstream net.Conn for some time, so if the same
-// client returns the upstream's Conn will be precached. Depending on how you benchmark this looks to be
-// 50% faster than just opening a new connection for every client. It works with UDP and TCP and uses
-// inband healthchecking.
+// Package proxy implements a forwarding proxy with connection caching.
+// It manages a pool of upstream connections (UDP and TCP) to reuse them for subsequent requests,
+// reducing latency and handshake overhead. It supports in-band health checking.
 package proxy
 
 import (
@@ -19,10 +18,7 @@ import (
 )
 
 const (
-    ErrTransportStopped              = "proxy: transport stopped"
-    ErrTransportStoppedDuringDial    = "proxy: transport stopped during dial"
-    ErrTransportStoppedRetClosed     = "proxy: transport stopped, ret channel closed"
-    ErrTransportStoppedDuringRetWait = "proxy: transport stopped during ret wait"
+    ErrTransportStopped = "proxy: transport stopped"
 )
 
 // limitTimeout is a utility function to auto-tune timeout values
@@ -66,41 +62,35 @@ func (t *Transport) Dial(proto string) (*persistConn, bool, error) {
     default:
     }
 
-    // Use select to avoid blocking if connManager has stopped
-    select {
-    case t.dial <- proto:
-        // Successfully sent dial request
-    case <-t.stop:
-        return nil, false, errors.New(ErrTransportStoppedDuringDial)
-    }
-
-    // Receive response with stop awareness
-    select {
-    case pc, ok := <-t.ret:
-        if !ok {
-            // ret channel was closed by connManager during stop
-            return nil, false, errors.New(ErrTransportStoppedRetClosed)
-        }
-
-        if pc != nil {
-            connCacheHitsCount.WithLabelValues(t.proxyName, t.addr, proto).Add(1)
-            return pc, true, nil
-        }
-        connCacheMissesCount.WithLabelValues(t.proxyName, t.addr, proto).Add(1)
-
-        reqTime := time.Now()
-        timeout := t.dialTimeout()
-        if proto == "tcp-tls" {
-            conn, err := dns.DialTimeoutWithTLS("tcp", t.addr, t.tlsConfig, timeout)
-            t.updateDialTimeout(time.Since(reqTime))
-            return &persistConn{c: conn}, false, err
-        }
-        conn, err := dns.DialTimeout(proto, t.addr, timeout)
-        t.updateDialTimeout(time.Since(reqTime))
-        return &persistConn{c: conn}, false, err
-    case <-t.stop:
-        return nil, false, errors.New(ErrTransportStoppedDuringRetWait)
-    }
+    transtype := stringToTransportType(proto)
+
+    t.mu.Lock()
+    // FIFO: take the oldest conn (front of slice) for source port diversity
+    for len(t.conns[transtype]) > 0 {
+        pc := t.conns[transtype][0]
+        t.conns[transtype] = t.conns[transtype][1:]
+        if time.Since(pc.used) > t.expire {
+            pc.c.Close()
+            continue
+        }
+        t.mu.Unlock()
+        connCacheHitsCount.WithLabelValues(t.proxyName, t.addr, proto).Add(1)
+        return pc, true, nil
+    }
+    t.mu.Unlock()
+
+    connCacheMissesCount.WithLabelValues(t.proxyName, t.addr, proto).Add(1)
+
+    reqTime := time.Now()
+    timeout := t.dialTimeout()
+    if proto == "tcp-tls" {
+        conn, err := dns.DialTimeoutWithTLS("tcp", t.addr, t.tlsConfig, timeout)
+        t.updateDialTimeout(time.Since(reqTime))
+        return &persistConn{c: conn}, false, err
+    }
+    conn, err := dns.DialTimeout(proto, t.addr, timeout)
+    t.updateDialTimeout(time.Since(reqTime))
+    return &persistConn{c: conn}, false, err
 }
 
diff --git a/plugin/pkg/proxy/connect.go b/plugin/pkg/proxy/connect.go
 // Connect selects an upstream, sends the request and waits for a response.
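The diff above shows the Dial side of the new pool; the Yield side and the max_idle_conns handling are only described in the commit message. The following self-contained Go sketch puts the whole pattern in one place: a mutex-guarded FIFO slice, expired connections closed inline during the scan, the stop check kept outside the lock, and a full pool closing excess connections instead of caching them. The type and field names (pool, pooledConn, maxIdle, and so on) are invented for illustration and are not CoreDNS's actual identifiers.

    package pool

    import (
        "net"
        "sync"
        "time"
    )

    // pooledConn pairs a connection with the time it was returned to the pool.
    type pooledConn struct {
        c    net.Conn
        used time.Time
    }

    // pool is a minimal, mutex-guarded FIFO connection pool in the spirit of
    // the change above. Names and fields are illustrative, not CoreDNS's.
    type pool struct {
        mu      sync.Mutex
        conns   []*pooledConn
        expire  time.Duration
        maxIdle int           // 0 means unbounded, matching the described default
        stop    chan struct{} // closed when the pool shuts down
    }

    // get returns a cached connection, oldest first (FIFO), or nil on a miss.
    // Expired connections are closed inline while scanning.
    func (p *pool) get() *pooledConn {
        p.mu.Lock()
        defer p.mu.Unlock()
        for len(p.conns) > 0 {
            pc := p.conns[0]
            p.conns = p.conns[1:]
            if time.Since(pc.used) > p.expire {
                pc.c.Close() // too old: drop it and keep scanning
                continue
            }
            return pc
        }
        return nil
    }

    // put hands a connection back for reuse. The stop check happens outside
    // the lock; a stopped or full pool closes the connection instead.
    func (p *pool) put(pc *pooledConn) {
        select {
        case <-p.stop:
            pc.c.Close()
            return
        default:
        }
        p.mu.Lock()
        defer p.mu.Unlock()
        if p.maxIdle > 0 && len(p.conns) >= p.maxIdle {
            pc.c.Close() // pool is full: close the excess connection immediately
            return
        }
        pc.used = time.Now()
        p.conns = append(p.conns, pc)
    }

Compared with the old actor model, callers contend only for the brief slice operations under the mutex rather than queuing behind a single goroutine, the same general trade-off net/http.Transport makes for its idle connection pool.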
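Continuing the sketch above with the same hypothetical types, the caller-side flow that the commit message describes for connect.go (direct method calls instead of channel sends) reduces to get, use, put. This is an illustration of the pattern, not the actual connect.go code.

    // exchange shows the direct get/use/put flow a caller follows once the
    // channel indirection is gone. All names come from the sketch above.
    func exchange(p *pool, addr string) error {
        pc := p.get()
        if pc == nil { // cache miss: dial a fresh connection
            c, err := net.DialTimeout("udp", addr, 2*time.Second)
            if err != nil {
                return err
            }
            pc = &pooledConn{c: c}
        }
        // ... write the DNS query and read the response on pc.c ...
        p.put(pc) // return the connection so the next request can reuse it
        return nil
    }

On a cache hit the request reuses an existing upstream socket; on a miss it dials a new one, and either way the connection goes back into the pool unless the pool is stopped or already full.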