fix: prevent SIGTERM/reload deadlock (#7562)

This commit is contained in:
Ville Vesilehto
2025-09-19 14:01:53 +03:00
committed by GitHub
parent 5532ba8484
commit 6ec327836b
6 changed files with 171 additions and 24 deletions

View File

@@ -50,6 +50,10 @@ type Server struct {
classChaos bool // allow non-INET class queries
tsigSecret map[string]string
// Ensure Stop is idempotent when invoked concurrently (e.g., during reload and SIGTERM).
stopOnce sync.Once
wgDoneOnce sync.Once
}
// MetadataCollector is a plugin that can retrieve metadata functions from all metadata providing plugins
@@ -212,33 +216,37 @@ func (s *Server) ListenPacket() (net.PacketConn, error) {
// immediately.
// This implements Caddy.Stopper interface.
func (s *Server) Stop() (err error) {
if runtime.GOOS != "windows" {
// force connections to close after timeout
done := make(chan struct{})
go func() {
s.dnsWg.Done() // decrement our initial increment used as a barrier
s.dnsWg.Wait()
close(done)
}()
var onceErr error
s.stopOnce.Do(func() {
if runtime.GOOS != "windows" {
// force connections to close after timeout
done := make(chan struct{})
go func() {
// decrement our initial increment used as a barrier, but only once
s.wgDoneOnce.Do(func() { s.dnsWg.Done() })
s.dnsWg.Wait()
close(done)
}()
// Wait for remaining connections to finish or
// force them all to close after timeout
select {
case <-time.After(s.graceTimeout):
case <-done:
// Wait for remaining connections to finish or
// force them all to close after timeout
select {
case <-time.After(s.graceTimeout):
case <-done:
}
}
}
// Close the listener now; this stops the server without delay
s.m.Lock()
for _, s1 := range s.server {
// We might not have started and initialized the full set of servers
if s1 != nil {
err = s1.Shutdown()
// Close the listener now; this stops the server without delay
s.m.Lock()
for _, s1 := range s.server {
// We might not have started and initialized the full set of servers
if s1 != nil {
onceErr = s1.Shutdown()
}
}
}
s.m.Unlock()
return
s.m.Unlock()
})
return onceErr
}
// Address together with Stop() implement caddy.GracefulServer.

View File

@@ -2,6 +2,7 @@ package dnsserver
import (
"context"
"sync"
"testing"
"github.com/coredns/coredns/plugin"
@@ -120,3 +121,22 @@ func BenchmarkCoreServeDNS(b *testing.B) {
s.ServeDNS(ctx, w, m)
}
}
// Validates Stop is idempotent and safe under concurrent calls.
func TestStopIsIdempotent(t *testing.T) {
t.Parallel()
s := &Server{}
s.dnsWg.Add(1)
const n = 10
var wg sync.WaitGroup
wg.Add(n)
for range n {
go func() {
defer wg.Done()
_ = s.Stop()
}()
}
wg.Wait()
}

View File

@@ -105,6 +105,10 @@ func hook(event caddy.EventName, info any) error {
// now lets consider that plugin will not be reload, unless appear in next config file
// change status of usage will be reset in setup if the plugin appears in config file
r.setUsage(maybeUsed)
// If shutdown is in progress, avoid attempting a restart.
if shutdownRequested(r.quit) {
return
}
_, err := instance.Restart(corefile)
reloadInfo.WithLabelValues("sha512", hex.EncodeToString(sha512sum[:])).Set(1)
if err != nil {
@@ -126,3 +130,14 @@ func hook(event caddy.EventName, info any) error {
return nil
}
// shutdownRequested reports whether a shutdown has been requested via quit channel.
// helps with unit testing of the shutdown gate logic.
func shutdownRequested(quit <-chan bool) bool {
select {
case <-quit:
return true
default:
return false
}
}

View File

@@ -0,0 +1,50 @@
package reload
import (
"testing"
"github.com/coredns/caddy"
)
// fakeInput implements caddy.Input for testing parse().
type fakeInput struct {
p string
b []byte
}
func (f fakeInput) ServerType() string { return "dns" }
func (f fakeInput) Body() []byte { return f.b }
func (f fakeInput) Path() string { return f.p }
// TestParseInvalidCorefile ensures parse returns an error for invalid Corefile syntax.
func TestParseInvalidCorefile(t *testing.T) {
t.Parallel()
broken := fakeInput{p: "Corefile", b: []byte(". { errors\n")}
if _, err := parse(broken); err == nil {
t.Fatalf("expected parse error for invalid Corefile, got nil")
}
}
// TestShutdownGate ensures the shutdown gate helper recognizes when shutdown is requested.
func TestShutdownGate(t *testing.T) {
t.Parallel()
q := make(chan bool, 1)
if shutdownRequested(q) {
t.Fatalf("expected no shutdown before signal")
}
q <- true
if !shutdownRequested(q) {
t.Fatalf("expected shutdown after signal")
}
}
// TestHookIgnoresNonStartupEvent ensures hook is a no-op for non-startup events.
func TestHookIgnoresNonStartupEvent(t *testing.T) {
t.Parallel()
if err := hook(caddy.EventName("not-startup"), nil); err != nil {
t.Fatalf("expected no error for non-startup event, got %v", err)
}
}

View File

@@ -20,7 +20,7 @@ func init() { plugin.Register("reload", setup) }
// channel for QUIT is never changed in purpose.
// WARNING: this data may be unsync after an invalid attempt of reload Corefile.
var (
r = reload{dur: defaultInterval, u: unused, quit: make(chan bool)}
r = reload{dur: defaultInterval, u: unused, quit: make(chan bool, 1)}
once, shutOnce sync.Once
)

View File

@@ -8,6 +8,7 @@ import (
"net/http"
"strings"
"testing"
"time"
"github.com/coredns/caddy"
"github.com/coredns/coredns/core/dnsserver"
@@ -380,6 +381,59 @@ func TestReloadUnreadyPlugin(t *testing.T) {
c1.Stop()
}
// TestReloadConcurrentRestartAndStop ensures there is no deadlock when a restart
// races with a shutdown (issue #7314).
func TestReloadConcurrentRestartAndStop(t *testing.T) {
corefileA := `.:0 {
reload 2s 1s
whoami
}`
corefileB := `.:0 {
reload 2s 1s
whoami
# change to trigger different config
}`
c, err := CoreDNSServer(corefileA)
if err != nil {
if strings.Contains(err.Error(), inUse) {
return
}
t.Fatalf("Could not start CoreDNS instance: %v", err)
}
restartErr := make(chan error, 1)
stopDone := make(chan struct{})
go func() {
_, err := c.Restart(NewInput(corefileB))
restartErr <- err
}()
// Small delay to increase overlap window
time.Sleep(50 * time.Millisecond)
go func() {
c.Stop()
close(stopDone)
}()
// Both operations should complete promptly; if not, we may be deadlocked.
select {
case <-stopDone:
// ok
case <-time.After(5 * time.Second):
t.Fatalf("Stop did not complete in time (possible deadlock)")
}
select {
case <-restartErr:
// ok: restart either succeeded or returned an error
// we only care about not hanging
case <-time.After(5 * time.Second):
t.Fatalf("Restart did not complete in time (possible deadlock)")
}
}
type unready struct {
next plugin.Handler
}