From 4ca99cd7b663e152e037de95db7f973f61ea66ac Mon Sep 17 00:00:00 2001 From: Ville Vesilehto Date: Thu, 23 Oct 2025 01:47:41 +0300 Subject: [PATCH] fix(cache): isolate metadata in prefetch goroutine (#7631) Wrap doPrefetch with a fresh metadata context to prevent concurrent writes to the request-scoped metadata map during background prefetch. Add a new integration test configuring a plugin chain, triggering the issue seen here. Hammers concurrent queries while log reads metadata fields repeatedly. Signed-off-by: Ville Vesilehto --- plugin/cache/handler.go | 2 + test/cache_test.go | 117 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/plugin/cache/handler.go b/plugin/cache/handler.go index 4de37ea8b..c45801ad6 100644 --- a/plugin/cache/handler.go +++ b/plugin/cache/handler.go @@ -92,6 +92,8 @@ func wildcardFunc(ctx context.Context) func() string { } func (c *Cache) doPrefetch(ctx context.Context, state request.Request, cw *ResponseWriter, i *item, now time.Time) { + // Use a fresh metadata map to avoid concurrent writes to the original request's metadata. + ctx = metadata.ContextWithMetadata(ctx) cachePrefetches.WithLabelValues(cw.server, c.zonesMetricLabel, c.viewMetricLabel).Inc() c.doRefresh(ctx, state, cw) diff --git a/test/cache_test.go b/test/cache_test.go index c470aa439..958e8270d 100644 --- a/test/cache_test.go +++ b/test/cache_test.go @@ -1,7 +1,12 @@ package test import ( + "fmt" + "math/rand" + "strings" + "sync" "testing" + "time" "github.com/coredns/coredns/plugin/test" @@ -157,3 +162,115 @@ func TestLookupCacheWithoutEdns(t *testing.T) { } t.Fatalf("Expected empty additional section, got %v", resp.Extra) } + +// TestIssue7630 exercises the metadata map race reported in GitHub issue #7630. +// It configures a server chain: metadata -> log -> forward -> cache with serve_stale +// and very short TTLs to force prefetch. The test primes the cache, lets entries expire, +// then sends concurrent queries mixing positive and NXDOMAIN lookups while the log plugin +// formats lines that repeatedly read the {/forward/upstream} metadata key. +// +// Without the fix in cache.doPrefetch that creates a fresh metadata context for the +// background prefetch goroutine, this scenario can crash with: +// +// "fatal error: concurrent map read and map write" +// +// or trigger the race detector. With the fix, the test should be stable. +// See: https://github.com/coredns/coredns/issues/7630 +func TestIssue7630(t *testing.T) { + name, rm, err := test.TempFile(".", exampleOrg) + if err != nil { + t.Fatalf("Failed to create zone: %s", err) + } + defer rm() + + upstreamCorefile := `example.org:0 { + file ` + name + ` + }` + + up, udpUp, _, err := CoreDNSServerAndPorts(upstreamCorefile) + if err != nil { + t.Fatalf("Could not start upstream CoreDNS: %s", err) + } + defer up.Stop() + + // Build an intentionally heavy log format that repeatedly reads the same metadata + // to widen the read window. + const repeat = 100 + logMetaReads := strings.Repeat("{/forward/upstream}", repeat) + + // Caching/forwarding server under test. Use 1s TTL and min TTL 0 to expire quickly + // for both positive and negative responses. + // Enable serve_stale so that every post-expiration query spawns a prefetch goroutine. + // Include metadata and log that reads {/forward/upstream} set by forward. + underTestCorefile := `example.org:0 { + metadata + log . "` + logMetaReads + `" + forward . ` + udpUp + ` { + health_check 5s domain example.org + } + cache { + success 1000 1 0 + denial 1000 1 0 + serve_stale + } + }` + + inst, udp, _, err := CoreDNSServerAndPorts(underTestCorefile) + if err != nil { + t.Fatalf("Could not start test CoreDNS: %s", err) + } + defer inst.Stop() + + // Prime cache with some initial lookups to populate both positive and negative caches. + m := new(dns.Msg) + m.SetQuestion("short.example.org.", dns.TypeA) + if _, err := dns.Exchange(m, udp); err != nil { + t.Fatalf("priming positive query failed: %s", err) + } + // A few NXDOMAINs + for i := range 50 { + m := new(dns.Msg) + m.SetQuestion(fmt.Sprintf("nx%d.example.org.", i), dns.TypeA) + if _, err := dns.Exchange(m, udp); err != nil { + t.Fatalf("priming negative query failed: %s", err) + } + } + + // Wait for TTL (1s) to expire so subsequent queries serve stale and trigger prefetch. + time.Sleep(1300 * time.Millisecond) + + const ( + concurrency = 10 + iterations = 100 + names = 256 + ) + + var wg sync.WaitGroup + wg.Add(concurrency) + for range concurrency { + go func() { + defer wg.Done() + r := rand.New(rand.NewSource(time.Now().UnixNano())) + for i := range iterations { + // Mix of positive and negative queries to trigger many independent prefetches. + var qname string + if i%4 == 0 { + qname = "short.example.org." + } else { + qname = fmt.Sprintf("nx%d.example.org.", r.Intn(names)) + } + m := new(dns.Msg) + m.SetQuestion(qname, dns.TypeA) + if _, err := dns.Exchange(m, udp); err != nil { + // Any error here is unexpected; if the historical race regresses, + // the process may crash with concurrent map read/write before we see this. + t.Errorf("query failed: %v", err) + return + } + // Small delay to allow background prefetch to overlap with logging of other requests. + time.Sleep(10 * time.Millisecond) + } + }() + } + wg.Wait() +}