fix(kubernetes): record cluster_ip services in dns_programming_duration metric (#7951)

Signed-off-by: Azeez Syed <syedazeez337@gmail.com>
2026-06-01 23:00:23 -04:00 · 2026-03-24 17:59:28 +05:30
parent 734426798f
commit f582a01dc9
3 changed files with 125 additions and 15 deletions
--- a/plugin/kubernetes/README.md
+++ b/plugin/kubernetes/README.md
@@ -267,7 +267,7 @@ The following are client level metrics to monitor apiserver request latency & st

 ## Bugs

-The duration metric only supports the "headless\_with\_selector" service currently.
+The duration metric does not yet support the `headless_without_selector` service kind.

 ## See Also

--- a/plugin/kubernetes/object/metrics.go
+++ b/plugin/kubernetes/object/metrics.go
@@ -62,22 +62,23 @@ func (l *EndpointLatencyRecorder) init(o meta.Object) {
 }

 func (l *EndpointLatencyRecorder) record() {
-	// isHeadless indicates whether the endpoints object belongs to a headless
-	// service (i.e. clusterIp = None). Note that this can be a  false negatives if the service
-	// informer is lagging, i.e. we may not see a recently created service. Given that the services
-	// don't change very often (comparing to much more frequent endpoints changes), cases when this method
-	// will return wrong answer should be relatively rare. Because of that we intentionally accept this
-	// flaw to keep the solution simple.
-	isHeadless := len(l.Services) == 1 && l.Services[0].Headless()
-
-	if !isHeadless || l.TT.IsZero() {
+	// Note: len(l.Services) != 1 can be a false negative if the service informer is lagging,
+	// i.e. we may not see a recently created service. Given that services don't change very
+	// often (compared to much more frequent endpoint changes), cases when this method skips
+	// recording should be relatively rare. We intentionally accept this flaw to keep the
+	// solution simple.
+	if l.TT.IsZero() || len(l.Services) != 1 {
 		return
 	}

-	// If we're here it means that the Endpoints object is for a headless service and that
-	// the Endpoints object was created by the endpoints-controller (because the
-	// LastChangeTriggerTime annotation is set). It means that the corresponding service is a
-	// "headless service with selector".
-	DNSProgrammingLatency.WithLabelValues("headless_with_selector").
+	// If we're here it means that the Endpoints object was created by the endpoints-controller
+	// (because the LastChangeTriggerTime annotation is set) and the backing Service is known.
+	// For headless services this means the service has a selector ("headless_with_selector").
+	// For non-headless services the service kind is "cluster_ip".
+	serviceKind := "cluster_ip"
+	if l.Services[0].Headless() {
+		serviceKind = "headless_with_selector"
+	}
+	DNSProgrammingLatency.WithLabelValues(serviceKind).
 		Observe(DurationSinceFunc(l.TT).Seconds())
 }
--- a/plugin/kubernetes/object/metrics_test.go
+++ b/plugin/kubernetes/object/metrics_test.go
@@ -0,0 +1,109 @@
+package object
+
+import (
+	"testing"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+	dto "github.com/prometheus/client_model/go"
+	api "k8s.io/api/core/v1"
+)
+
+// histSampleCount reads the sample count for a specific service_kind label
+// from a HistogramVec.
+func histSampleCount(t *testing.T, vec *prometheus.HistogramVec, label string) uint64 {
+	t.Helper()
+	obs, err := vec.GetMetricWithLabelValues(label)
+	if err != nil {
+		t.Fatalf("GetMetricWithLabelValues(%q): %v", label, err)
+	}
+	m, ok := obs.(prometheus.Metric)
+	if !ok {
+		t.Fatalf("observer for label %q does not implement prometheus.Metric", label)
+	}
+	pb := &dto.Metric{}
+	if err := m.Write(pb); err != nil {
+		t.Fatalf("Write metric for label %q: %v", label, err)
+	}
+	return pb.GetHistogram().GetSampleCount()
+}
+
+// NOTE: subtests in this function must NOT call t.Parallel() — they swap
+// global package-level vars (DNSProgrammingLatency, DurationSinceFunc).
+func TestEndpointLatencyRecorder_record(t *testing.T) {
+	tests := []struct {
+		name            string
+		services        []*Service
+		ttSet           bool
+		wantLabel       string
+		wantSampleCount uint64
+	}{
+		{
+			name:            "headless_with_selector: headless service with trigger annotation",
+			services:        []*Service{{ClusterIPs: []string{api.ClusterIPNone}}},
+			ttSet:           true,
+			wantLabel:       "headless_with_selector",
+			wantSampleCount: 1,
+		},
+		{
+			name:            "cluster_ip: ClusterIP service with trigger annotation",
+			services:        []*Service{{ClusterIPs: []string{"10.0.0.1"}}},
+			ttSet:           true,
+			wantLabel:       "cluster_ip",
+			wantSampleCount: 1,
+		},
+		{
+			name:            "no annotation on headless: TT zero means no observation",
+			services:        []*Service{{ClusterIPs: []string{api.ClusterIPNone}}},
+			ttSet:           false,
+			wantLabel:       "headless_with_selector",
+			wantSampleCount: 0,
+		},
+		{
+			name:            "no annotation on ClusterIP: TT zero means no observation",
+			services:        []*Service{{ClusterIPs: []string{"10.0.0.1"}}},
+			ttSet:           false,
+			wantLabel:       "cluster_ip",
+			wantSampleCount: 0,
+		},
+		{
+			name:            "informer lag: no backing service found, TT set, no observation",
+			services:        nil,
+			ttSet:           true,
+			wantLabel:       "cluster_ip",
+			wantSampleCount: 0,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			// Replace global metric with a fresh unregistered histogram for isolation.
+			// Do NOT add t.Parallel() here — these subtests swap global package state.
+			origMetric := DNSProgrammingLatency
+			reg := prometheus.NewRegistry()
+			DNSProgrammingLatency = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
+				Name:    "test_dns_programming_duration_seconds",
+				Help:    "test histogram",
+				Buckets: prometheus.ExponentialBuckets(0.001, 2, 20),
+			}, []string{"service_kind"})
+			t.Cleanup(func() { DNSProgrammingLatency = origMetric })
+
+			origDurationSince := DurationSinceFunc
+			DurationSinceFunc = func(time.Time) time.Duration { return time.Second }
+			t.Cleanup(func() { DurationSinceFunc = origDurationSince })
+
+			rec := &EndpointLatencyRecorder{Services: tc.services}
+			if tc.ttSet {
+				rec.TT = time.Now().Add(-time.Second)
+			}
+
+			rec.record()
+
+			got := histSampleCount(t, DNSProgrammingLatency, tc.wantLabel)
+			if got != tc.wantSampleCount {
+				t.Errorf("sample count for label %q = %d, want %d", tc.wantLabel, got, tc.wantSampleCount)
+			}
+		})
+	}
+}