Files
coredns/test/metrics_test.go
Ville Vesilehto 32ea433a29 fix(metrics): preserve request size from plugins (#7313)
The rewrite plugin modifies DNS messages, affecting the request
size observed in the coredns_dns_request_size_bytes metric.

This change captures the original request size before any plugins
can modify it. It adds a functional options pattern to Report() to
pass this information while maintaining API compatibility.

Tests have been added to verify the fix prevents rewrite from
affecting the request size metrics.

Docs included.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>
2025-05-22 06:38:22 -07:00

410 lines
12 KiB
Go

package test
import (
"fmt"
"os"
"path/filepath"
"reflect"
"strconv"
"strings"
"testing"
"time"
"github.com/coredns/coredns/plugin/metrics"
"github.com/coredns/coredns/plugin/metrics/vars"
"github.com/coredns/coredns/plugin/test"
"github.com/miekg/dns"
)
// Because we don't properly shutdown the metrics servers we are re-using the metrics between tests, not a superbad issue
// but depending on the ordering of the tests this trips up stuff.
// Start test server that has metrics enabled. Then tear it down again.
func TestMetricsServer(t *testing.T) {
corefile := `
example.org:0 {
chaos CoreDNS-001 miek@miek.nl
prometheus localhost:0
}
example.com:0 {
log
prometheus localhost:0
}`
srv, err := CoreDNSServer(corefile)
if err != nil {
t.Fatalf("Could not get CoreDNS serving instance: %s", err)
}
defer srv.Stop()
}
func TestMetricsRefused(t *testing.T) {
metricName := "coredns_dns_responses_total"
corefile := `example.org:0 {
whoami
prometheus localhost:0
}`
srv, udp, _, err := CoreDNSServerAndPorts(corefile)
if err != nil {
t.Fatalf("Could not get CoreDNS serving instance: %s", err)
}
defer srv.Stop()
m := new(dns.Msg)
m.SetQuestion("google.com.", dns.TypeA)
if _, err = dns.Exchange(m, udp); err != nil {
t.Fatalf("Could not send message: %s", err)
}
data := test.Scrape("http://" + metrics.ListenAddr + "/metrics")
got, labels := test.MetricValue(metricName, data)
if got != "1" {
t.Errorf("Expected value %s for refused, but got %s", "1", got)
}
if labels["zone"] != vars.Dropped {
t.Errorf("Expected zone value %s for refused, but got %s", vars.Dropped, labels["zone"])
}
if labels["rcode"] != "REFUSED" {
t.Errorf("Expected zone value %s for refused, but got %s", "REFUSED", labels["rcode"])
}
}
// getBucketCount extracts the count for a specific bucket from a metric family
func getBucketCount(mf *test.MetricFamily, bucketLabel string) (int, error) {
if mf == nil {
return 0, fmt.Errorf("metric family is nil")
}
if len(mf.Metrics) == 0 {
return 0, fmt.Errorf("metric family %s has no metrics", mf.Name)
}
// mf.Metrics[0] is an interface{} containing an unexported 'histogram' struct from plugin/test.
metricPoint := mf.Metrics[0]
val := reflect.ValueOf(metricPoint)
// Check if the underlying type is a struct (as histogram is)
if val.Kind() != reflect.Struct {
return 0, fmt.Errorf("metric point for %s is not a struct, but %s", mf.Name, val.Kind())
}
// Access the 'Buckets' field, which should be map[string]string
bucketsField := val.FieldByName("Buckets")
if !bucketsField.IsValid() {
return 0, fmt.Errorf("metric point for %s has no 'Buckets' field", mf.Name)
}
bucketsMap, ok := bucketsField.Interface().(map[string]string)
if !ok {
return 0, fmt.Errorf("'Buckets' field for %s is not a map[string]string", mf.Name)
}
countStr, ok := bucketsMap[bucketLabel]
if !ok {
// For these tests, we'll treat a missing bucket as 0.
return 0, nil
}
count, err := strconv.Atoi(countStr)
if err != nil {
return 0, fmt.Errorf("could not parse bucket count '%s' for %s: %v", countStr, mf.Name, err)
}
return count, nil
}
// extractRequestSizeBucketCounts extracts bucket counts from DNS request size metrics
func extractRequestSizeBucketCounts(t *testing.T, metrics []*test.MetricFamily, label string) (int, int, error) {
var countBelow100, countAbove100 int
var err error
for _, mf := range metrics {
if strings.Contains(mf.Name, "coredns_dns_request_size_bytes") {
t.Logf(" %s: %v", mf.Name, mf.Metrics)
countBelow100, err = getBucketCount(mf, "100")
if err != nil {
return 0, 0, fmt.Errorf("%s: error getting bucket count for 100: %v", label, err)
}
countAbove100, err = getBucketCount(mf, "1023")
if err != nil {
return 0, 0, fmt.Errorf("%s: error getting bucket count for 1023: %v", label, err)
}
return countBelow100, countAbove100, nil
}
}
return 0, 0, fmt.Errorf("%s: could not find coredns_dns_request_size_bytes metric", label)
}
func TestMetricsRewriteRequestSize(t *testing.T) {
// number of requests to send
numRequests := 5
// First test without rewrite
corefileWithoutRewrite := `.:0 {
prometheus localhost:0
forward . 8.8.8.8
}`
srv, udp, _, err := CoreDNSServerAndPorts(corefileWithoutRewrite)
if err != nil {
t.Fatalf("Could not get CoreDNS serving instance: %s", err)
}
// Create a DNS request with a long name to have a size close to 100 bytes
m := new(dns.Msg)
m.SetQuestion("somerequestthathaveasize90.123456789.123456789.123456789.example.com.", dns.TypeA)
expectedSize := 86
actualSize := m.Len()
if actualSize != expectedSize {
t.Fatalf("Expected request size %d, but got %d", expectedSize, actualSize)
}
// Send multiple requests
for i := 0; i < numRequests; i++ {
if _, err = dns.Exchange(m, udp); err != nil {
t.Fatalf("Could not send message: %s", err)
}
}
metricsWithoutRewrite := test.Scrape("http://" + metrics.ListenAddr + "/metrics")
t.Log("Available metrics without rewrite:")
countBelow100withoutRewrite, countAbove100withoutRewrite, err := extractRequestSizeBucketCounts(t, metricsWithoutRewrite, "without rewrite")
if err != nil {
t.Error(err)
}
// Stop the first server
srv.Stop()
time.Sleep(100 * time.Millisecond) // Give server time to clean up
// Now test with rewrite plugin
corefileWithRewrite := `.:0 {
prometheus localhost:0
rewrite edns0 local set 0x13 test123456 revert
forward . 8.8.8.8
}`
srv2, udp2, _, err := CoreDNSServerAndPorts(corefileWithRewrite)
if err != nil {
t.Fatalf("Could not get CoreDNS serving instance: %s", err)
}
defer srv2.Stop()
// Send the same requests with rewrite
for i := 0; i < numRequests; i++ {
if _, err = dns.Exchange(m, udp2); err != nil {
t.Fatalf("Could not send message: %s", err)
}
}
// Scrape metrics again
metricsWithRewrite := test.Scrape("http://" + metrics.ListenAddr + "/metrics")
t.Log("Available metrics with rewrite:")
countBelow100withRewrite, countAbove100withRewrite, err := extractRequestSizeBucketCounts(t, metricsWithRewrite, "with rewrite")
if err != nil {
t.Error(err)
}
// Both servers should record metrics in the same buckets regardless of the
// rewrite plugin's modifications. The original request size is 86 bytes,
// which falls into the le=100 bucket, before and after the rewrite.
if countBelow100withoutRewrite != countAbove100withoutRewrite &&
countBelow100withRewrite != countAbove100withRewrite {
t.Errorf("Expected all requests to go to le=100 bucket")
}
// The count in the le=100 bucket should be the same with or without rewrite.
// Second round of requests should go to le=100 bucket.
if countBelow100withRewrite != countBelow100withoutRewrite+numRequests {
t.Errorf("Expected all requests to go to le=100 bucket")
}
}
func TestMetricsAuto(t *testing.T) {
tmpdir := t.TempDir()
corefile := `org:0 {
auto {
directory ` + tmpdir + ` db\.(.*) {1}
reload 0.1s
}
prometheus localhost:0
}`
i, err := CoreDNSServer(corefile)
if err != nil {
t.Fatalf("Could not get CoreDNS serving instance: %s", err)
}
udp, _ := CoreDNSServerPorts(i, 0)
if udp == "" {
t.Fatalf("Could not get UDP listening port")
}
defer i.Stop()
// Write db.example.org to get example.org.
if err = os.WriteFile(filepath.Join(tmpdir, "db.example.org"), []byte(zoneContent), 0644); err != nil {
t.Fatal(err)
}
time.Sleep(110 * time.Millisecond) // wait for it to be picked up
m := new(dns.Msg)
m.SetQuestion("www.example.org.", dns.TypeA)
if _, err := dns.Exchange(m, udp); err != nil {
t.Fatalf("Could not send message: %s", err)
}
metricName := "coredns_dns_requests_total" // {zone, proto, family, type}
data := test.Scrape("http://" + metrics.ListenAddr + "/metrics")
// Get the value for the metrics where the one of the labels values matches "example.org."
got, _ := test.MetricValueLabel(metricName, "example.org.", data)
if got == "0" {
t.Errorf("Expected value %s for %s, but got %s", "> 1", metricName, got)
}
// Remove db.example.org again. And see if the metric stops increasing.
os.Remove(filepath.Join(tmpdir, "db.example.org"))
time.Sleep(110 * time.Millisecond) // wait for it to be picked up
if _, err := dns.Exchange(m, udp); err != nil {
t.Fatalf("Could not send message: %s", err)
}
data = test.Scrape("http://" + metrics.ListenAddr + "/metrics")
got, _ = test.MetricValueLabel(metricName, "example.org.", data)
if got == "0" {
t.Errorf("Expected value %s for %s, but got %s", "> 1", metricName, got)
}
}
// Show that when 2 blocs share the same metric listener (they have a prometheus plugin on the same listening address),
// ALL the metrics of the second bloc in order are declared in prometheus, especially the plugins that are used ONLY in the second bloc
func TestMetricsSeveralBlocs(t *testing.T) {
cacheSizeMetricName := "coredns_cache_entries"
addrMetrics := "localhost:9155"
corefile := `
example.org:0 {
prometheus ` + addrMetrics + `
forward . 8.8.8.8:53 {
force_tcp
}
}
google.com:0 {
prometheus ` + addrMetrics + `
whoami
cache
}`
i, udp, _, err := CoreDNSServerAndPorts(corefile)
if err != nil {
t.Fatalf("Could not get CoreDNS serving instance: %s", err)
}
defer i.Stop()
// send an initial query to setup properly the cache size
m := new(dns.Msg)
m.SetQuestion("google.com.", dns.TypeA)
if _, err = dns.Exchange(m, udp); err != nil {
t.Fatalf("Could not send message: %s", err)
}
beginCacheSize := test.ScrapeMetricAsInt(addrMetrics, cacheSizeMetricName, "", 0)
// send an query, different from initial to ensure we have another add to the cache
m = new(dns.Msg)
m.SetQuestion("www.google.com.", dns.TypeA)
if _, err = dns.Exchange(m, udp); err != nil {
t.Fatalf("Could not send message: %s", err)
}
endCacheSize := test.ScrapeMetricAsInt(addrMetrics, cacheSizeMetricName, "", 0)
if err != nil {
t.Errorf("Unexpected metric data retrieved for %s : %s", cacheSizeMetricName, err)
}
if endCacheSize-beginCacheSize != 1 {
t.Errorf("Expected metric data retrieved for %s, expected %d, got %d", cacheSizeMetricName, 1, endCacheSize-beginCacheSize)
}
}
func TestMetricsPluginEnabled(t *testing.T) {
corefile := `
example.org:0 {
chaos CoreDNS-001 miek@miek.nl
prometheus localhost:0
}
example.com:0 {
whoami
prometheus localhost:0
}`
srv, err := CoreDNSServer(corefile)
if err != nil {
t.Fatalf("Could not get CoreDNS serving instance: %s", err)
}
defer srv.Stop()
metricName := "coredns_plugin_enabled" //{server, zone, name}
data := test.Scrape("http://" + metrics.ListenAddr + "/metrics")
// Get the value for the metrics where the one of the labels values matches "chaos".
got, _ := test.MetricValueLabel(metricName, "chaos", data)
if got != "1" {
t.Errorf("Expected value %s for %s, but got %s", "1", metricName, got)
}
// Get the value for the metrics where the one of the labels values matches "erratic".
got, _ = test.MetricValueLabel(metricName, "erratic", data) // none of these tests use 'erratic'
if got != "" {
t.Errorf("Expected value %s for %s, but got %s", "", metricName, got)
}
}
func TestMetricsAvailable(t *testing.T) {
procMetric := "coredns_build_info"
procCache := "coredns_cache_entries"
procCacheMiss := "coredns_cache_misses_total"
procForward := "coredns_dns_request_duration_seconds"
corefileWithMetrics := `.:0 {
prometheus localhost:0
cache
forward . 8.8.8.8 {
force_tcp
}
}`
inst, _, tcp, err := CoreDNSServerAndPorts(corefileWithMetrics)
defer inst.Stop()
if err != nil {
if strings.Contains(err.Error(), inUse) {
return
}
t.Errorf("Could not get service instance: %s", err)
}
// send a query and check we can scrap corresponding metrics
cl := dns.Client{Net: "tcp"}
m := new(dns.Msg)
m.SetQuestion("www.example.org.", dns.TypeA)
if _, _, err := cl.Exchange(m, tcp); err != nil {
t.Fatalf("Could not send message: %s", err)
}
// we should have metrics from forward, cache, and metrics itself
if err := collectMetricsInfo(metrics.ListenAddr, procMetric, procCache, procCacheMiss, procForward); err != nil {
t.Errorf("Could not scrap one of expected stats : %s", err)
}
}