diff --git a/core/dnsserver/server.go b/core/dnsserver/server.go index da52ff721..89ee5f31f 100644 --- a/core/dnsserver/server.go +++ b/core/dnsserver/server.go @@ -50,6 +50,10 @@ type Server struct { classChaos bool // allow non-INET class queries tsigSecret map[string]string + + // Ensure Stop is idempotent when invoked concurrently (e.g., during reload and SIGTERM). + stopOnce sync.Once + wgDoneOnce sync.Once } // MetadataCollector is a plugin that can retrieve metadata functions from all metadata providing plugins @@ -212,33 +216,37 @@ func (s *Server) ListenPacket() (net.PacketConn, error) { // immediately. // This implements Caddy.Stopper interface. func (s *Server) Stop() (err error) { - if runtime.GOOS != "windows" { - // force connections to close after timeout - done := make(chan struct{}) - go func() { - s.dnsWg.Done() // decrement our initial increment used as a barrier - s.dnsWg.Wait() - close(done) - }() + var onceErr error + s.stopOnce.Do(func() { + if runtime.GOOS != "windows" { + // force connections to close after timeout + done := make(chan struct{}) + go func() { + // decrement our initial increment used as a barrier, but only once + s.wgDoneOnce.Do(func() { s.dnsWg.Done() }) + s.dnsWg.Wait() + close(done) + }() - // Wait for remaining connections to finish or - // force them all to close after timeout - select { - case <-time.After(s.graceTimeout): - case <-done: + // Wait for remaining connections to finish or + // force them all to close after timeout + select { + case <-time.After(s.graceTimeout): + case <-done: + } } - } - // Close the listener now; this stops the server without delay - s.m.Lock() - for _, s1 := range s.server { - // We might not have started and initialized the full set of servers - if s1 != nil { - err = s1.Shutdown() + // Close the listener now; this stops the server without delay + s.m.Lock() + for _, s1 := range s.server { + // We might not have started and initialized the full set of servers + if s1 != nil { + onceErr = s1.Shutdown() + } } - } - s.m.Unlock() - return + s.m.Unlock() + }) + return onceErr } // Address together with Stop() implement caddy.GracefulServer. diff --git a/core/dnsserver/server_test.go b/core/dnsserver/server_test.go index 1e7e1ea14..8eca39b78 100644 --- a/core/dnsserver/server_test.go +++ b/core/dnsserver/server_test.go @@ -2,6 +2,7 @@ package dnsserver import ( "context" + "sync" "testing" "github.com/coredns/coredns/plugin" @@ -120,3 +121,22 @@ func BenchmarkCoreServeDNS(b *testing.B) { s.ServeDNS(ctx, w, m) } } + +// Validates Stop is idempotent and safe under concurrent calls. +func TestStopIsIdempotent(t *testing.T) { + t.Parallel() + + s := &Server{} + s.dnsWg.Add(1) + + const n = 10 + var wg sync.WaitGroup + wg.Add(n) + for range n { + go func() { + defer wg.Done() + _ = s.Stop() + }() + } + wg.Wait() +} diff --git a/plugin/reload/reload.go b/plugin/reload/reload.go index 1793abd91..af8289d30 100644 --- a/plugin/reload/reload.go +++ b/plugin/reload/reload.go @@ -105,6 +105,10 @@ func hook(event caddy.EventName, info any) error { // now lets consider that plugin will not be reload, unless appear in next config file // change status of usage will be reset in setup if the plugin appears in config file r.setUsage(maybeUsed) + // If shutdown is in progress, avoid attempting a restart. + if shutdownRequested(r.quit) { + return + } _, err := instance.Restart(corefile) reloadInfo.WithLabelValues("sha512", hex.EncodeToString(sha512sum[:])).Set(1) if err != nil { @@ -126,3 +130,14 @@ func hook(event caddy.EventName, info any) error { return nil } + +// shutdownRequested reports whether a shutdown has been requested via quit channel. +// helps with unit testing of the shutdown gate logic. +func shutdownRequested(quit <-chan bool) bool { + select { + case <-quit: + return true + default: + return false + } +} diff --git a/plugin/reload/reload_test.go b/plugin/reload/reload_test.go new file mode 100644 index 000000000..48cb42137 --- /dev/null +++ b/plugin/reload/reload_test.go @@ -0,0 +1,50 @@ +package reload + +import ( + "testing" + + "github.com/coredns/caddy" +) + +// fakeInput implements caddy.Input for testing parse(). +type fakeInput struct { + p string + b []byte +} + +func (f fakeInput) ServerType() string { return "dns" } +func (f fakeInput) Body() []byte { return f.b } +func (f fakeInput) Path() string { return f.p } + +// TestParseInvalidCorefile ensures parse returns an error for invalid Corefile syntax. +func TestParseInvalidCorefile(t *testing.T) { + t.Parallel() + + broken := fakeInput{p: "Corefile", b: []byte(". { errors\n")} + if _, err := parse(broken); err == nil { + t.Fatalf("expected parse error for invalid Corefile, got nil") + } +} + +// TestShutdownGate ensures the shutdown gate helper recognizes when shutdown is requested. +func TestShutdownGate(t *testing.T) { + t.Parallel() + + q := make(chan bool, 1) + if shutdownRequested(q) { + t.Fatalf("expected no shutdown before signal") + } + q <- true + if !shutdownRequested(q) { + t.Fatalf("expected shutdown after signal") + } +} + +// TestHookIgnoresNonStartupEvent ensures hook is a no-op for non-startup events. +func TestHookIgnoresNonStartupEvent(t *testing.T) { + t.Parallel() + + if err := hook(caddy.EventName("not-startup"), nil); err != nil { + t.Fatalf("expected no error for non-startup event, got %v", err) + } +} diff --git a/plugin/reload/setup.go b/plugin/reload/setup.go index 6df323431..0cbecc6a4 100644 --- a/plugin/reload/setup.go +++ b/plugin/reload/setup.go @@ -20,7 +20,7 @@ func init() { plugin.Register("reload", setup) } // channel for QUIT is never changed in purpose. // WARNING: this data may be unsync after an invalid attempt of reload Corefile. var ( - r = reload{dur: defaultInterval, u: unused, quit: make(chan bool)} + r = reload{dur: defaultInterval, u: unused, quit: make(chan bool, 1)} once, shutOnce sync.Once ) diff --git a/test/reload_test.go b/test/reload_test.go index b93209483..35edaec0f 100644 --- a/test/reload_test.go +++ b/test/reload_test.go @@ -8,6 +8,7 @@ import ( "net/http" "strings" "testing" + "time" "github.com/coredns/caddy" "github.com/coredns/coredns/core/dnsserver" @@ -380,6 +381,59 @@ func TestReloadUnreadyPlugin(t *testing.T) { c1.Stop() } +// TestReloadConcurrentRestartAndStop ensures there is no deadlock when a restart +// races with a shutdown (issue #7314). +func TestReloadConcurrentRestartAndStop(t *testing.T) { + corefileA := `.:0 { + reload 2s 1s + whoami + }` + corefileB := `.:0 { + reload 2s 1s + whoami + # change to trigger different config + }` + + c, err := CoreDNSServer(corefileA) + if err != nil { + if strings.Contains(err.Error(), inUse) { + return + } + t.Fatalf("Could not start CoreDNS instance: %v", err) + } + + restartErr := make(chan error, 1) + stopDone := make(chan struct{}) + + go func() { + _, err := c.Restart(NewInput(corefileB)) + restartErr <- err + }() + + // Small delay to increase overlap window + time.Sleep(50 * time.Millisecond) + + go func() { + c.Stop() + close(stopDone) + }() + + // Both operations should complete promptly; if not, we may be deadlocked. + select { + case <-stopDone: + // ok + case <-time.After(5 * time.Second): + t.Fatalf("Stop did not complete in time (possible deadlock)") + } + select { + case <-restartErr: + // ok: restart either succeeded or returned an error + // we only care about not hanging + case <-time.After(5 * time.Second): + t.Fatalf("Restart did not complete in time (possible deadlock)") + } +} + type unready struct { next plugin.Handler }