diff --git a/plugin/kubernetes/README.md b/plugin/kubernetes/README.md index a396cf785..fc2326af8 100644 --- a/plugin/kubernetes/README.md +++ b/plugin/kubernetes/README.md @@ -267,7 +267,7 @@ The following are client level metrics to monitor apiserver request latency & st ## Bugs -The duration metric only supports the "headless\_with\_selector" service currently. +The duration metric does not yet support the `headless_without_selector` service kind. ## See Also diff --git a/plugin/kubernetes/object/metrics.go b/plugin/kubernetes/object/metrics.go index 149648411..313483445 100644 --- a/plugin/kubernetes/object/metrics.go +++ b/plugin/kubernetes/object/metrics.go @@ -62,22 +62,23 @@ func (l *EndpointLatencyRecorder) init(o meta.Object) { } func (l *EndpointLatencyRecorder) record() { - // isHeadless indicates whether the endpoints object belongs to a headless - // service (i.e. clusterIp = None). Note that this can be a false negatives if the service - // informer is lagging, i.e. we may not see a recently created service. Given that the services - // don't change very often (comparing to much more frequent endpoints changes), cases when this method - // will return wrong answer should be relatively rare. Because of that we intentionally accept this - // flaw to keep the solution simple. - isHeadless := len(l.Services) == 1 && l.Services[0].Headless() - - if !isHeadless || l.TT.IsZero() { + // Note: len(l.Services) != 1 can be a false negative if the service informer is lagging, + // i.e. we may not see a recently created service. Given that services don't change very + // often (compared to much more frequent endpoint changes), cases when this method skips + // recording should be relatively rare. We intentionally accept this flaw to keep the + // solution simple. + if l.TT.IsZero() || len(l.Services) != 1 { return } - // If we're here it means that the Endpoints object is for a headless service and that - // the Endpoints object was created by the endpoints-controller (because the - // LastChangeTriggerTime annotation is set). It means that the corresponding service is a - // "headless service with selector". - DNSProgrammingLatency.WithLabelValues("headless_with_selector"). + // If we're here it means that the Endpoints object was created by the endpoints-controller + // (because the LastChangeTriggerTime annotation is set) and the backing Service is known. + // For headless services this means the service has a selector ("headless_with_selector"). + // For non-headless services the service kind is "cluster_ip". + serviceKind := "cluster_ip" + if l.Services[0].Headless() { + serviceKind = "headless_with_selector" + } + DNSProgrammingLatency.WithLabelValues(serviceKind). Observe(DurationSinceFunc(l.TT).Seconds()) } diff --git a/plugin/kubernetes/object/metrics_test.go b/plugin/kubernetes/object/metrics_test.go new file mode 100644 index 000000000..d0b9481ae --- /dev/null +++ b/plugin/kubernetes/object/metrics_test.go @@ -0,0 +1,109 @@ +package object + +import ( + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + dto "github.com/prometheus/client_model/go" + api "k8s.io/api/core/v1" +) + +// histSampleCount reads the sample count for a specific service_kind label +// from a HistogramVec. +func histSampleCount(t *testing.T, vec *prometheus.HistogramVec, label string) uint64 { + t.Helper() + obs, err := vec.GetMetricWithLabelValues(label) + if err != nil { + t.Fatalf("GetMetricWithLabelValues(%q): %v", label, err) + } + m, ok := obs.(prometheus.Metric) + if !ok { + t.Fatalf("observer for label %q does not implement prometheus.Metric", label) + } + pb := &dto.Metric{} + if err := m.Write(pb); err != nil { + t.Fatalf("Write metric for label %q: %v", label, err) + } + return pb.GetHistogram().GetSampleCount() +} + +// NOTE: subtests in this function must NOT call t.Parallel() — they swap +// global package-level vars (DNSProgrammingLatency, DurationSinceFunc). +func TestEndpointLatencyRecorder_record(t *testing.T) { + tests := []struct { + name string + services []*Service + ttSet bool + wantLabel string + wantSampleCount uint64 + }{ + { + name: "headless_with_selector: headless service with trigger annotation", + services: []*Service{{ClusterIPs: []string{api.ClusterIPNone}}}, + ttSet: true, + wantLabel: "headless_with_selector", + wantSampleCount: 1, + }, + { + name: "cluster_ip: ClusterIP service with trigger annotation", + services: []*Service{{ClusterIPs: []string{"10.0.0.1"}}}, + ttSet: true, + wantLabel: "cluster_ip", + wantSampleCount: 1, + }, + { + name: "no annotation on headless: TT zero means no observation", + services: []*Service{{ClusterIPs: []string{api.ClusterIPNone}}}, + ttSet: false, + wantLabel: "headless_with_selector", + wantSampleCount: 0, + }, + { + name: "no annotation on ClusterIP: TT zero means no observation", + services: []*Service{{ClusterIPs: []string{"10.0.0.1"}}}, + ttSet: false, + wantLabel: "cluster_ip", + wantSampleCount: 0, + }, + { + name: "informer lag: no backing service found, TT set, no observation", + services: nil, + ttSet: true, + wantLabel: "cluster_ip", + wantSampleCount: 0, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // Replace global metric with a fresh unregistered histogram for isolation. + // Do NOT add t.Parallel() here — these subtests swap global package state. + origMetric := DNSProgrammingLatency + reg := prometheus.NewRegistry() + DNSProgrammingLatency = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ + Name: "test_dns_programming_duration_seconds", + Help: "test histogram", + Buckets: prometheus.ExponentialBuckets(0.001, 2, 20), + }, []string{"service_kind"}) + t.Cleanup(func() { DNSProgrammingLatency = origMetric }) + + origDurationSince := DurationSinceFunc + DurationSinceFunc = func(time.Time) time.Duration { return time.Second } + t.Cleanup(func() { DurationSinceFunc = origDurationSince }) + + rec := &EndpointLatencyRecorder{Services: tc.services} + if tc.ttSet { + rec.TT = time.Now().Add(-time.Second) + } + + rec.record() + + got := histSampleCount(t, DNSProgrammingLatency, tc.wantLabel) + if got != tc.wantSampleCount { + t.Errorf("sample count for label %q = %d, want %d", tc.wantLabel, got, tc.wantSampleCount) + } + }) + } +}