mirror of
https://github.com/coredns/coredns.git
synced 2025-10-26 15:54:16 -04:00
fix(cache): isolate metadata in prefetch goroutine (#7631)
Wrap doPrefetch with a fresh metadata context to prevent concurrent writes to the request-scoped metadata map during background prefetch. Add a new integration test configuring a plugin chain, triggering the issue seen here. Hammers concurrent queries while log reads metadata fields repeatedly. Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>
This commit is contained in:
2
plugin/cache/handler.go
vendored
2
plugin/cache/handler.go
vendored
@@ -92,6 +92,8 @@ func wildcardFunc(ctx context.Context) func() string {
|
||||
}
|
||||
|
||||
func (c *Cache) doPrefetch(ctx context.Context, state request.Request, cw *ResponseWriter, i *item, now time.Time) {
|
||||
// Use a fresh metadata map to avoid concurrent writes to the original request's metadata.
|
||||
ctx = metadata.ContextWithMetadata(ctx)
|
||||
cachePrefetches.WithLabelValues(cw.server, c.zonesMetricLabel, c.viewMetricLabel).Inc()
|
||||
c.doRefresh(ctx, state, cw)
|
||||
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
package test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/coredns/coredns/plugin/test"
|
||||
|
||||
@@ -157,3 +162,115 @@ func TestLookupCacheWithoutEdns(t *testing.T) {
|
||||
}
|
||||
t.Fatalf("Expected empty additional section, got %v", resp.Extra)
|
||||
}
|
||||
|
||||
// TestIssue7630 exercises the metadata map race reported in GitHub issue #7630.
|
||||
// It configures a server chain: metadata -> log -> forward -> cache with serve_stale
|
||||
// and very short TTLs to force prefetch. The test primes the cache, lets entries expire,
|
||||
// then sends concurrent queries mixing positive and NXDOMAIN lookups while the log plugin
|
||||
// formats lines that repeatedly read the {/forward/upstream} metadata key.
|
||||
//
|
||||
// Without the fix in cache.doPrefetch that creates a fresh metadata context for the
|
||||
// background prefetch goroutine, this scenario can crash with:
|
||||
//
|
||||
// "fatal error: concurrent map read and map write"
|
||||
//
|
||||
// or trigger the race detector. With the fix, the test should be stable.
|
||||
// See: https://github.com/coredns/coredns/issues/7630
|
||||
func TestIssue7630(t *testing.T) {
|
||||
name, rm, err := test.TempFile(".", exampleOrg)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create zone: %s", err)
|
||||
}
|
||||
defer rm()
|
||||
|
||||
upstreamCorefile := `example.org:0 {
|
||||
file ` + name + `
|
||||
}`
|
||||
|
||||
up, udpUp, _, err := CoreDNSServerAndPorts(upstreamCorefile)
|
||||
if err != nil {
|
||||
t.Fatalf("Could not start upstream CoreDNS: %s", err)
|
||||
}
|
||||
defer up.Stop()
|
||||
|
||||
// Build an intentionally heavy log format that repeatedly reads the same metadata
|
||||
// to widen the read window.
|
||||
const repeat = 100
|
||||
logMetaReads := strings.Repeat("{/forward/upstream}", repeat)
|
||||
|
||||
// Caching/forwarding server under test. Use 1s TTL and min TTL 0 to expire quickly
|
||||
// for both positive and negative responses.
|
||||
// Enable serve_stale so that every post-expiration query spawns a prefetch goroutine.
|
||||
// Include metadata and log that reads {/forward/upstream} set by forward.
|
||||
underTestCorefile := `example.org:0 {
|
||||
metadata
|
||||
log . "` + logMetaReads + `"
|
||||
forward . ` + udpUp + ` {
|
||||
health_check 5s domain example.org
|
||||
}
|
||||
cache {
|
||||
success 1000 1 0
|
||||
denial 1000 1 0
|
||||
serve_stale
|
||||
}
|
||||
}`
|
||||
|
||||
inst, udp, _, err := CoreDNSServerAndPorts(underTestCorefile)
|
||||
if err != nil {
|
||||
t.Fatalf("Could not start test CoreDNS: %s", err)
|
||||
}
|
||||
defer inst.Stop()
|
||||
|
||||
// Prime cache with some initial lookups to populate both positive and negative caches.
|
||||
m := new(dns.Msg)
|
||||
m.SetQuestion("short.example.org.", dns.TypeA)
|
||||
if _, err := dns.Exchange(m, udp); err != nil {
|
||||
t.Fatalf("priming positive query failed: %s", err)
|
||||
}
|
||||
// A few NXDOMAINs
|
||||
for i := range 50 {
|
||||
m := new(dns.Msg)
|
||||
m.SetQuestion(fmt.Sprintf("nx%d.example.org.", i), dns.TypeA)
|
||||
if _, err := dns.Exchange(m, udp); err != nil {
|
||||
t.Fatalf("priming negative query failed: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for TTL (1s) to expire so subsequent queries serve stale and trigger prefetch.
|
||||
time.Sleep(1300 * time.Millisecond)
|
||||
|
||||
const (
|
||||
concurrency = 10
|
||||
iterations = 100
|
||||
names = 256
|
||||
)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(concurrency)
|
||||
for range concurrency {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
||||
for i := range iterations {
|
||||
// Mix of positive and negative queries to trigger many independent prefetches.
|
||||
var qname string
|
||||
if i%4 == 0 {
|
||||
qname = "short.example.org."
|
||||
} else {
|
||||
qname = fmt.Sprintf("nx%d.example.org.", r.Intn(names))
|
||||
}
|
||||
m := new(dns.Msg)
|
||||
m.SetQuestion(qname, dns.TypeA)
|
||||
if _, err := dns.Exchange(m, udp); err != nil {
|
||||
// Any error here is unexpected; if the historical race regresses,
|
||||
// the process may crash with concurrent map read/write before we see this.
|
||||
t.Errorf("query failed: %v", err)
|
||||
return
|
||||
}
|
||||
// Small delay to allow background prefetch to overlap with logging of other requests.
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user