mirror of
https://github.com/coredns/coredns.git
synced 2025-10-26 15:54:16 -04:00
fix(cache): isolate metadata in prefetch goroutine (#7631)
Wrap doPrefetch with a fresh metadata context to prevent concurrent writes to the request-scoped metadata map during background prefetch. Add a new integration test configuring a plugin chain, triggering the issue seen here. Hammers concurrent queries while log reads metadata fields repeatedly. Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>
This commit is contained in:
2
plugin/cache/handler.go
vendored
2
plugin/cache/handler.go
vendored
@@ -92,6 +92,8 @@ func wildcardFunc(ctx context.Context) func() string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cache) doPrefetch(ctx context.Context, state request.Request, cw *ResponseWriter, i *item, now time.Time) {
|
func (c *Cache) doPrefetch(ctx context.Context, state request.Request, cw *ResponseWriter, i *item, now time.Time) {
|
||||||
|
// Use a fresh metadata map to avoid concurrent writes to the original request's metadata.
|
||||||
|
ctx = metadata.ContextWithMetadata(ctx)
|
||||||
cachePrefetches.WithLabelValues(cw.server, c.zonesMetricLabel, c.viewMetricLabel).Inc()
|
cachePrefetches.WithLabelValues(cw.server, c.zonesMetricLabel, c.viewMetricLabel).Inc()
|
||||||
c.doRefresh(ctx, state, cw)
|
c.doRefresh(ctx, state, cw)
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,12 @@
|
|||||||
package test
|
package test
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math/rand"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/coredns/coredns/plugin/test"
|
"github.com/coredns/coredns/plugin/test"
|
||||||
|
|
||||||
@@ -157,3 +162,115 @@ func TestLookupCacheWithoutEdns(t *testing.T) {
|
|||||||
}
|
}
|
||||||
t.Fatalf("Expected empty additional section, got %v", resp.Extra)
|
t.Fatalf("Expected empty additional section, got %v", resp.Extra)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestIssue7630 exercises the metadata map race reported in GitHub issue #7630.
|
||||||
|
// It configures a server chain: metadata -> log -> forward -> cache with serve_stale
|
||||||
|
// and very short TTLs to force prefetch. The test primes the cache, lets entries expire,
|
||||||
|
// then sends concurrent queries mixing positive and NXDOMAIN lookups while the log plugin
|
||||||
|
// formats lines that repeatedly read the {/forward/upstream} metadata key.
|
||||||
|
//
|
||||||
|
// Without the fix in cache.doPrefetch that creates a fresh metadata context for the
|
||||||
|
// background prefetch goroutine, this scenario can crash with:
|
||||||
|
//
|
||||||
|
// "fatal error: concurrent map read and map write"
|
||||||
|
//
|
||||||
|
// or trigger the race detector. With the fix, the test should be stable.
|
||||||
|
// See: https://github.com/coredns/coredns/issues/7630
|
||||||
|
func TestIssue7630(t *testing.T) {
|
||||||
|
name, rm, err := test.TempFile(".", exampleOrg)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to create zone: %s", err)
|
||||||
|
}
|
||||||
|
defer rm()
|
||||||
|
|
||||||
|
upstreamCorefile := `example.org:0 {
|
||||||
|
file ` + name + `
|
||||||
|
}`
|
||||||
|
|
||||||
|
up, udpUp, _, err := CoreDNSServerAndPorts(upstreamCorefile)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Could not start upstream CoreDNS: %s", err)
|
||||||
|
}
|
||||||
|
defer up.Stop()
|
||||||
|
|
||||||
|
// Build an intentionally heavy log format that repeatedly reads the same metadata
|
||||||
|
// to widen the read window.
|
||||||
|
const repeat = 100
|
||||||
|
logMetaReads := strings.Repeat("{/forward/upstream}", repeat)
|
||||||
|
|
||||||
|
// Caching/forwarding server under test. Use 1s TTL and min TTL 0 to expire quickly
|
||||||
|
// for both positive and negative responses.
|
||||||
|
// Enable serve_stale so that every post-expiration query spawns a prefetch goroutine.
|
||||||
|
// Include metadata and log that reads {/forward/upstream} set by forward.
|
||||||
|
underTestCorefile := `example.org:0 {
|
||||||
|
metadata
|
||||||
|
log . "` + logMetaReads + `"
|
||||||
|
forward . ` + udpUp + ` {
|
||||||
|
health_check 5s domain example.org
|
||||||
|
}
|
||||||
|
cache {
|
||||||
|
success 1000 1 0
|
||||||
|
denial 1000 1 0
|
||||||
|
serve_stale
|
||||||
|
}
|
||||||
|
}`
|
||||||
|
|
||||||
|
inst, udp, _, err := CoreDNSServerAndPorts(underTestCorefile)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Could not start test CoreDNS: %s", err)
|
||||||
|
}
|
||||||
|
defer inst.Stop()
|
||||||
|
|
||||||
|
// Prime cache with some initial lookups to populate both positive and negative caches.
|
||||||
|
m := new(dns.Msg)
|
||||||
|
m.SetQuestion("short.example.org.", dns.TypeA)
|
||||||
|
if _, err := dns.Exchange(m, udp); err != nil {
|
||||||
|
t.Fatalf("priming positive query failed: %s", err)
|
||||||
|
}
|
||||||
|
// A few NXDOMAINs
|
||||||
|
for i := range 50 {
|
||||||
|
m := new(dns.Msg)
|
||||||
|
m.SetQuestion(fmt.Sprintf("nx%d.example.org.", i), dns.TypeA)
|
||||||
|
if _, err := dns.Exchange(m, udp); err != nil {
|
||||||
|
t.Fatalf("priming negative query failed: %s", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for TTL (1s) to expire so subsequent queries serve stale and trigger prefetch.
|
||||||
|
time.Sleep(1300 * time.Millisecond)
|
||||||
|
|
||||||
|
const (
|
||||||
|
concurrency = 10
|
||||||
|
iterations = 100
|
||||||
|
names = 256
|
||||||
|
)
|
||||||
|
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
wg.Add(concurrency)
|
||||||
|
for range concurrency {
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
||||||
|
for i := range iterations {
|
||||||
|
// Mix of positive and negative queries to trigger many independent prefetches.
|
||||||
|
var qname string
|
||||||
|
if i%4 == 0 {
|
||||||
|
qname = "short.example.org."
|
||||||
|
} else {
|
||||||
|
qname = fmt.Sprintf("nx%d.example.org.", r.Intn(names))
|
||||||
|
}
|
||||||
|
m := new(dns.Msg)
|
||||||
|
m.SetQuestion(qname, dns.TypeA)
|
||||||
|
if _, err := dns.Exchange(m, udp); err != nil {
|
||||||
|
// Any error here is unexpected; if the historical race regresses,
|
||||||
|
// the process may crash with concurrent map read/write before we see this.
|
||||||
|
t.Errorf("query failed: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Small delay to allow background prefetch to overlap with logging of other requests.
|
||||||
|
time.Sleep(10 * time.Millisecond)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user