fix(cache): isolate metadata in prefetch goroutine (#7631)

Wrap doPrefetch with a fresh metadata context to prevent concurrent
writes to the request-scoped metadata map during background prefetch.

Add a new integration test configuring a plugin chain, triggering
the issue seen here. Hammers concurrent queries while log reads
metadata fields repeatedly.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>
This commit is contained in:
Ville Vesilehto
2025-10-23 01:47:41 +03:00
committed by GitHub
parent bbe40d8629
commit 4ca99cd7b6
2 changed files with 119 additions and 0 deletions

View File

@@ -92,6 +92,8 @@ func wildcardFunc(ctx context.Context) func() string {
}
func (c *Cache) doPrefetch(ctx context.Context, state request.Request, cw *ResponseWriter, i *item, now time.Time) {
// Use a fresh metadata map to avoid concurrent writes to the original request's metadata.
ctx = metadata.ContextWithMetadata(ctx)
cachePrefetches.WithLabelValues(cw.server, c.zonesMetricLabel, c.viewMetricLabel).Inc()
c.doRefresh(ctx, state, cw)

View File

@@ -1,7 +1,12 @@
package test
import (
"fmt"
"math/rand"
"strings"
"sync"
"testing"
"time"
"github.com/coredns/coredns/plugin/test"
@@ -157,3 +162,115 @@ func TestLookupCacheWithoutEdns(t *testing.T) {
}
t.Fatalf("Expected empty additional section, got %v", resp.Extra)
}
// TestIssue7630 exercises the metadata map race reported in GitHub issue #7630.
// It configures a server chain: metadata -> log -> forward -> cache with serve_stale
// and very short TTLs to force prefetch. The test primes the cache, lets entries expire,
// then sends concurrent queries mixing positive and NXDOMAIN lookups while the log plugin
// formats lines that repeatedly read the {/forward/upstream} metadata key.
//
// Without the fix in cache.doPrefetch that creates a fresh metadata context for the
// background prefetch goroutine, this scenario can crash with:
//
// "fatal error: concurrent map read and map write"
//
// or trigger the race detector. With the fix, the test should be stable.
// See: https://github.com/coredns/coredns/issues/7630
func TestIssue7630(t *testing.T) {
name, rm, err := test.TempFile(".", exampleOrg)
if err != nil {
t.Fatalf("Failed to create zone: %s", err)
}
defer rm()
upstreamCorefile := `example.org:0 {
file ` + name + `
}`
up, udpUp, _, err := CoreDNSServerAndPorts(upstreamCorefile)
if err != nil {
t.Fatalf("Could not start upstream CoreDNS: %s", err)
}
defer up.Stop()
// Build an intentionally heavy log format that repeatedly reads the same metadata
// to widen the read window.
const repeat = 100
logMetaReads := strings.Repeat("{/forward/upstream}", repeat)
// Caching/forwarding server under test. Use 1s TTL and min TTL 0 to expire quickly
// for both positive and negative responses.
// Enable serve_stale so that every post-expiration query spawns a prefetch goroutine.
// Include metadata and log that reads {/forward/upstream} set by forward.
underTestCorefile := `example.org:0 {
metadata
log . "` + logMetaReads + `"
forward . ` + udpUp + ` {
health_check 5s domain example.org
}
cache {
success 1000 1 0
denial 1000 1 0
serve_stale
}
}`
inst, udp, _, err := CoreDNSServerAndPorts(underTestCorefile)
if err != nil {
t.Fatalf("Could not start test CoreDNS: %s", err)
}
defer inst.Stop()
// Prime cache with some initial lookups to populate both positive and negative caches.
m := new(dns.Msg)
m.SetQuestion("short.example.org.", dns.TypeA)
if _, err := dns.Exchange(m, udp); err != nil {
t.Fatalf("priming positive query failed: %s", err)
}
// A few NXDOMAINs
for i := range 50 {
m := new(dns.Msg)
m.SetQuestion(fmt.Sprintf("nx%d.example.org.", i), dns.TypeA)
if _, err := dns.Exchange(m, udp); err != nil {
t.Fatalf("priming negative query failed: %s", err)
}
}
// Wait for TTL (1s) to expire so subsequent queries serve stale and trigger prefetch.
time.Sleep(1300 * time.Millisecond)
const (
concurrency = 10
iterations = 100
names = 256
)
var wg sync.WaitGroup
wg.Add(concurrency)
for range concurrency {
go func() {
defer wg.Done()
r := rand.New(rand.NewSource(time.Now().UnixNano()))
for i := range iterations {
// Mix of positive and negative queries to trigger many independent prefetches.
var qname string
if i%4 == 0 {
qname = "short.example.org."
} else {
qname = fmt.Sprintf("nx%d.example.org.", r.Intn(names))
}
m := new(dns.Msg)
m.SetQuestion(qname, dns.TypeA)
if _, err := dns.Exchange(m, udp); err != nil {
// Any error here is unexpected; if the historical race regresses,
// the process may crash with concurrent map read/write before we see this.
t.Errorf("query failed: %v", err)
return
}
// Small delay to allow background prefetch to overlap with logging of other requests.
time.Sleep(10 * time.Millisecond)
}
}()
}
wg.Wait()
}