diff --git a/go.mod b/go.mod index d795f655d..be45d87a6 100644 --- a/go.mod +++ b/go.mod @@ -123,6 +123,7 @@ require ( github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/lufia/plan9stats v0.0.0-20240226150601-1dcf7310316a // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect diff --git a/man/coredns-metrics.7 b/man/coredns-metrics.7 index 565af3445..9c9410bc5 100644 --- a/man/coredns-metrics.7 +++ b/man/coredns-metrics.7 @@ -1,5 +1,5 @@ .\" Generated by Mmark Markdown Processer - mmark.miek.nl -.TH "COREDNS-METRICS" 7 "March 2021" "CoreDNS" "CoreDNS Plugins" +.TH "COREDNS-METRICS" 7 "May 2025" "CoreDNS" "CoreDNS Plugins" .SH "NAME" .PP @@ -10,30 +10,38 @@ .PP With \fIprometheus\fP you export metrics from CoreDNS and any plugin that has them. The default location for the metrics is \fB\fClocalhost:9153\fR. The metrics path is fixed to \fB\fC/metrics\fR. -The following metrics are exported: + +.PP +In addition to the default Go metrics exported by the Prometheus Go client +\[la]https://prometheus.io/docs/guides/go-application/\[ra], +the following metrics are exported: .IP \(bu 4 \fB\fCcoredns_build_info{version, revision, goversion}\fR - info about CoreDNS itself. .IP \(bu 4 \fB\fCcoredns_panics_total{}\fR - total number of panics. .IP \(bu 4 -\fB\fCcoredns_dns_requests_total{server, zone, proto, family, type}\fR - total query count. +\fB\fCcoredns_dns_requests_total{server, zone, view, proto, family, type}\fR - total query count. .IP \(bu 4 -\fB\fCcoredns_dns_request_duration_seconds{server, zone, type}\fR - duration to process each query. +\fB\fCcoredns_dns_request_duration_seconds{server, zone, view, type}\fR - duration to process each query. .IP \(bu 4 -\fB\fCcoredns_dns_request_size_bytes{server, zone, proto}\fR - size of the request in bytes. 
+\fB\fCcoredns_dns_request_size_bytes{server, zone, view, proto}\fR - size of the request in bytes. Uses the original size before any plugin rewrites. .IP \(bu 4 -\fB\fCcoredns_dns_do_requests_total{server, zone}\fR - queries that have the DO bit set +\fB\fCcoredns_dns_do_requests_total{server, view, zone}\fR - queries that have the DO bit set. .IP \(bu 4 -\fB\fCcoredns_dns_response_size_bytes{server, zone, proto}\fR - response size in bytes. +\fB\fCcoredns_dns_response_size_bytes{server, zone, view, proto}\fR - response size in bytes. .IP \(bu 4 -\fB\fCcoredns_dns_responses_total{server, zone, rcode}\fR - response per zone and rcode. +\fB\fCcoredns_dns_responses_total{server, zone, view, rcode, plugin}\fR - response per zone, rcode and plugin. .IP \(bu 4 -\fB\fCcoredns_plugin_enabled{server, zone, name}\fR - indicates whether a plugin is enabled on per server and zone basis. +\fB\fCcoredns_dns_https_responses_total{server, status}\fR - responses per server and HTTP status code. +.IP \(bu 4 +\fB\fCcoredns_dns_quic_responses_total{server, status}\fR - responses per server and QUIC application code. +.IP \(bu 4 +\fB\fCcoredns_plugin_enabled{server, zone, view, name}\fR - indicates whether a plugin is enabled on a per server, zone and view basis. .PP -Each counter has a label \fB\fCzone\fR which is the zonename used for the request/response. +Almost every counter has a label \fB\fCzone\fR which is the zone name used for the request/response. .PP Extra labels used are: @@ -48,14 +56,35 @@ this is \fB\fCdns://:53\fR. If you are using the \fIbind\fP plugin an IP address The address family (\fB\fCfamily\fR) of the transport (1 = IP (IP version 4), 2 = IP6 (IP version 6)). .IP \(bu 4 \fB\fCtype\fR which holds the query type. 
It holds most common types (A, AAAA, MX, SOA, CNAME, PTR, TXT, -NS, SRV, DS, DNSKEY, RRSIG, NSEC, NSEC3, IXFR, AXFR and ANY) and "other" which lumps together all +NS, SRV, DS, DNSKEY, RRSIG, NSEC, NSEC3, HTTPS, IXFR, AXFR and ANY) and "other" which lumps together all other types. +.IP \(bu 4 +\fB\fCstatus\fR which holds the https status code. Possible values are: + +.RS +.IP \(en 4 +200 - request is processed, +.IP \(en 4 +404 - request has been rejected on validation, +.IP \(en 4 +400 - request to dns message conversion failed, +.IP \(en 4 +500 - processing ended up with no response. + +.RE +.IP \(bu 4 +the \fB\fCplugin\fR label holds the name of the plugin that made the write to the client. If the server +did the write (on error for instance), the value is empty. .PP If monitoring is enabled, queries that do not enter the plugin chain are exported under the fake name "dropped" (without a closing dot - this is never a valid domain name). +.PP +Other plugins may export additional stats when the \fIprometheus\fP plugin is enabled. Those stats are documented in each +plugin's README. + .PP This plugin can only be used once per Server Block. diff --git a/plugin/metrics/README.md b/plugin/metrics/README.md index 144a5d1c6..f15cef58a 100644 --- a/plugin/metrics/README.md +++ b/plugin/metrics/README.md @@ -16,7 +16,7 @@ the following metrics are exported: * `coredns_panics_total{}` - total number of panics. * `coredns_dns_requests_total{server, zone, view, proto, family, type}` - total query count. * `coredns_dns_request_duration_seconds{server, zone, view, type}` - duration to process each query. -* `coredns_dns_request_size_bytes{server, zone, view, proto}` - size of the request in bytes. +* `coredns_dns_request_size_bytes{server, zone, view, proto}` - size of the request in bytes. Uses the original size before any plugin rewrites. 
* `coredns_dns_do_requests_total{server, view, zone}` - queries that have the DO bit set * `coredns_dns_response_size_bytes{server, zone, view, proto}` - response size in bytes. * `coredns_dns_responses_total{server, zone, view, rcode, plugin}` - response per zone, rcode and plugin. diff --git a/plugin/metrics/handler.go b/plugin/metrics/handler.go index 41da69011..fb350a2f5 100644 --- a/plugin/metrics/handler.go +++ b/plugin/metrics/handler.go @@ -16,6 +16,9 @@ import ( func (m *Metrics) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) { state := request.Request{W: w, Req: r} + // Capture the original request size before any plugins modify it + originalSize := r.Len() + qname := state.QName() zone := plugin.Zones(m.ZoneNames()).Matches(qname) if zone == "" { @@ -34,7 +37,9 @@ func (m *Metrics) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg rc = status } plugin := m.authoritativePlugin(rw.Caller) - vars.Report(WithServer(ctx), state, zone, WithView(ctx), rcode.ToString(rc), plugin, rw.Len, rw.Start) + // Pass the original request size to vars.Report + vars.Report(WithServer(ctx), state, zone, WithView(ctx), rcode.ToString(rc), plugin, + rw.Len, rw.Start, vars.WithOriginalReqSize(originalSize)) return status, err } diff --git a/plugin/metrics/vars/report.go b/plugin/metrics/vars/report.go index 92f6bc163..f3b280b19 100644 --- a/plugin/metrics/vars/report.go +++ b/plugin/metrics/vars/report.go @@ -6,10 +6,34 @@ import ( "github.com/coredns/coredns/request" ) +// ReportOptions is a struct that contains available options for the Report function. 
+type ReportOptions struct { + OriginalReqSize int +} + +// ReportOption defines a function that modifies ReportOptions +type ReportOption func(*ReportOptions) + +// WithOriginalReqSize returns an option to set the original request size +func WithOriginalReqSize(size int) ReportOption { + return func(opts *ReportOptions) { + opts.OriginalReqSize = size + } +} + // Report reports the metrics data associated with request. This function is exported because it is also // called from core/dnsserver to report requests hitting the server that should not be handled and are thus // not sent down the plugin chain. -func Report(server string, req request.Request, zone, view, rcode, plugin string, size int, start time.Time) { +func Report(server string, req request.Request, zone, view, rcode, plugin string, + size int, start time.Time, opts ...ReportOption) { + options := ReportOptions{ + OriginalReqSize: 0, + } + + for _, opt := range opts { + opt(&options) + } + // Proto and Family. net := req.Proto() fam := "1" @@ -27,7 +51,13 @@ func Report(server string, req request.Request, zone, view, rcode, plugin string RequestDuration.WithLabelValues(server, zone, view).Observe(time.Since(start).Seconds()) ResponseSize.WithLabelValues(server, zone, view, net).Observe(float64(size)) - RequestSize.WithLabelValues(server, zone, view, net).Observe(float64(req.Len())) + + reqSize := req.Len() + if options.OriginalReqSize > 0 { + reqSize = options.OriginalReqSize + } + + RequestSize.WithLabelValues(server, zone, view, net).Observe(float64(reqSize)) ResponseRcode.WithLabelValues(server, zone, view, rcode, plugin).Inc() } diff --git a/plugin/metrics/vars/report_test.go b/plugin/metrics/vars/report_test.go new file mode 100644 index 000000000..abc28476a --- /dev/null +++ b/plugin/metrics/vars/report_test.go @@ -0,0 +1,101 @@ +package vars + +import ( + "testing" + "time" + + "github.com/coredns/coredns/plugin/test" + "github.com/coredns/coredns/request" + + "github.com/miekg/dns" + 
"github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestReportWithOptions(t *testing.T) { + tests := []struct { + name string + question string + qtype uint16 + edns0 bool + do bool + originalSize int + useOriginal bool + }{ + { + name: "A record without DO bit", + question: "example.org.", + qtype: dns.TypeA, + edns0: true, + do: false, + originalSize: 0, + useOriginal: false, + }, + { + name: "A record with DO bit", + question: "example.org.", + qtype: dns.TypeA, + edns0: true, + do: true, + originalSize: 0, + useOriginal: false, + }, + { + name: "A record with original size", + question: "example.org.", + qtype: dns.TypeA, + edns0: false, + do: false, + originalSize: 42, + useOriginal: true, + }, + { + name: "A record bogus qtype", + question: "example.org.", + qtype: 0, // does not exist + edns0: false, + do: false, + originalSize: 42, + useOriginal: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + m := new(dns.Msg) + m.SetQuestion(tc.question, tc.qtype) + if tc.edns0 { + m.SetEdns0(4096, tc.do) + } + + w := &test.ResponseWriter{} + state := request.Request{W: w, Req: m} + + if state.Do() != tc.do { + t.Errorf("DO bit detection failed, got %v, want %v", state.Do(), tc.do) + } + + qType := qTypeString(tc.qtype) + expectedType := dns.Type(tc.qtype).String() + if qType != expectedType && qType != "other" { + t.Errorf("qTypeString(%d) = %s, want %s or 'other'", tc.qtype, qType, expectedType) + } + + var opts []ReportOption + if tc.useOriginal { + opts = append(opts, WithOriginalReqSize(tc.originalSize)) + } + + net := state.Proto() + fam := "1" + + countBefore := testutil.ToFloat64(RequestCount.WithLabelValues("dns://:53", "example.org.", "", net, fam, qType)) + + Report("dns://:53", state, "example.org.", "", "NOERROR", "test", 100, time.Now(), opts...) 
+ + countAfter := testutil.ToFloat64(RequestCount.WithLabelValues("dns://:53", "example.org.", "", net, fam, qType)) + if countAfter <= countBefore { + t.Errorf("RequestCount was not incremented. Before: %f, After: %f", countBefore, countAfter) + } + }) + } +} diff --git a/test/metrics_test.go b/test/metrics_test.go index 28a0d6fba..e17af4ff3 100644 --- a/test/metrics_test.go +++ b/test/metrics_test.go @@ -1,8 +1,11 @@ package test import ( + "fmt" "os" "path/filepath" + "reflect" + "strconv" "strings" "testing" "time" @@ -70,6 +73,159 @@ func TestMetricsRefused(t *testing.T) { } } +// getBucketCount extracts the count for a specific bucket from a metric family +func getBucketCount(mf *test.MetricFamily, bucketLabel string) (int, error) { + if mf == nil { + return 0, fmt.Errorf("metric family is nil") + } + if len(mf.Metrics) == 0 { + return 0, fmt.Errorf("metric family %s has no metrics", mf.Name) + } + + // mf.Metrics[0] is an interface{} containing an unexported 'histogram' struct from plugin/test. + metricPoint := mf.Metrics[0] + val := reflect.ValueOf(metricPoint) + + // Check if the underlying type is a struct (as histogram is) + if val.Kind() != reflect.Struct { + return 0, fmt.Errorf("metric point for %s is not a struct, but %s", mf.Name, val.Kind()) + } + + // Access the 'Buckets' field, which should be map[string]string + bucketsField := val.FieldByName("Buckets") + if !bucketsField.IsValid() { + return 0, fmt.Errorf("metric point for %s has no 'Buckets' field", mf.Name) + } + + bucketsMap, ok := bucketsField.Interface().(map[string]string) + if !ok { + return 0, fmt.Errorf("'Buckets' field for %s is not a map[string]string", mf.Name) + } + + countStr, ok := bucketsMap[bucketLabel] + if !ok { + // For these tests, we'll treat a missing bucket as 0. 
+ return 0, nil + } + + count, err := strconv.Atoi(countStr) + if err != nil { + return 0, fmt.Errorf("could not parse bucket count '%s' for %s: %v", countStr, mf.Name, err) + } + return count, nil +} + +// extractRequestSizeBucketCounts extracts bucket counts from DNS request size metrics +func extractRequestSizeBucketCounts(t *testing.T, metrics []*test.MetricFamily, label string) (int, int, error) { + var countBelow100, countAbove100 int + var err error + + for _, mf := range metrics { + if strings.Contains(mf.Name, "coredns_dns_request_size_bytes") { + t.Logf(" %s: %v", mf.Name, mf.Metrics) + countBelow100, err = getBucketCount(mf, "100") + if err != nil { + return 0, 0, fmt.Errorf("%s: error getting bucket count for 100: %v", label, err) + } + countAbove100, err = getBucketCount(mf, "1023") + if err != nil { + return 0, 0, fmt.Errorf("%s: error getting bucket count for 1023: %v", label, err) + } + return countBelow100, countAbove100, nil + } + } + + return 0, 0, fmt.Errorf("%s: could not find coredns_dns_request_size_bytes metric", label) +} + +func TestMetricsRewriteRequestSize(t *testing.T) { + // number of requests to send + numRequests := 5 + + // First test without rewrite + corefileWithoutRewrite := `.:0 { + prometheus localhost:0 + forward . 
8.8.8.8 + }` + + srv, udp, _, err := CoreDNSServerAndPorts(corefileWithoutRewrite) + if err != nil { + t.Fatalf("Could not get CoreDNS serving instance: %s", err) + } + + // Create a DNS request with a long name to have a size close to 100 bytes + m := new(dns.Msg) + m.SetQuestion("somerequestthathaveasize90.123456789.123456789.123456789.example.com.", dns.TypeA) + expectedSize := 86 + actualSize := m.Len() + if actualSize != expectedSize { + t.Fatalf("Expected request size %d, but got %d", expectedSize, actualSize) + } + + // Send multiple requests + for i := 0; i < numRequests; i++ { + if _, err = dns.Exchange(m, udp); err != nil { + t.Fatalf("Could not send message: %s", err) + } + } + + metricsWithoutRewrite := test.Scrape("http://" + metrics.ListenAddr + "/metrics") + + t.Log("Available metrics without rewrite:") + countBelow100withoutRewrite, countAbove100withoutRewrite, err := extractRequestSizeBucketCounts(t, metricsWithoutRewrite, "without rewrite") + if err != nil { + t.Error(err) + } + + // Stop the first server + srv.Stop() + time.Sleep(100 * time.Millisecond) // Give server time to clean up + + // Now test with rewrite plugin + corefileWithRewrite := `.:0 { + prometheus localhost:0 + rewrite edns0 local set 0x13 test123456 revert + forward . 
8.8.8.8 + }` + + srv2, udp2, _, err := CoreDNSServerAndPorts(corefileWithRewrite) + if err != nil { + t.Fatalf("Could not get CoreDNS serving instance: %s", err) + } + defer srv2.Stop() + + // Send the same requests with rewrite + for i := 0; i < numRequests; i++ { + if _, err = dns.Exchange(m, udp2); err != nil { + t.Fatalf("Could not send message: %s", err) + } + } + + // Scrape metrics again + metricsWithRewrite := test.Scrape("http://" + metrics.ListenAddr + "/metrics") + + t.Log("Available metrics with rewrite:") + countBelow100withRewrite, countAbove100withRewrite, err := extractRequestSizeBucketCounts(t, metricsWithRewrite, "with rewrite") + if err != nil { + t.Error(err) + } + + // Both servers should record metrics in the same buckets regardless of the + // rewrite plugin's modifications. The original request size is 86 bytes, + // which falls into the le=100 bucket, before and after the rewrite. + // Equal le=100 and le=1023 counts mean no request landed above 100 bytes; fail if EITHER run differs. + if countBelow100withoutRewrite != countAbove100withoutRewrite || + countBelow100withRewrite != countAbove100withRewrite { + t.Errorf("Expected all requests to go to le=100 bucket") + } + + // The le=100 count after the second run must equal the first run's count plus + // the second round of requests, i.e. every rewritten request still lands in le=100. + if countBelow100withRewrite != countBelow100withoutRewrite+numRequests { + t.Errorf("Expected all requests to go to le=100 bucket") + } +} + func TestMetricsAuto(t *testing.T) { tmpdir := t.TempDir()