fix(kubernetes): record cluster_ip services in dns_programming_duration metric (#7951)

Signed-off-by: Azeez Syed <syedazeez337@gmail.com>
This commit is contained in:
Syed Azeez
2026-03-24 17:59:28 +05:30
committed by GitHub
parent 734426798f
commit f582a01dc9
3 changed files with 125 additions and 15 deletions

View File

@@ -267,7 +267,7 @@ The following are client level metrics to monitor apiserver request latency & st
## Bugs
The duration metric only supports the "headless\_with\_selector" service currently.
The duration metric does not yet support the `headless_without_selector` service kind.
## See Also

View File

@@ -62,22 +62,23 @@ func (l *EndpointLatencyRecorder) init(o meta.Object) {
}
func (l *EndpointLatencyRecorder) record() {
// isHeadless indicates whether the endpoints object belongs to a headless
// service (i.e. clusterIp = None). Note that this can be a false negatives if the service
// informer is lagging, i.e. we may not see a recently created service. Given that the services
// don't change very often (comparing to much more frequent endpoints changes), cases when this method
// will return wrong answer should be relatively rare. Because of that we intentionally accept this
// flaw to keep the solution simple.
isHeadless := len(l.Services) == 1 && l.Services[0].Headless()
if !isHeadless || l.TT.IsZero() {
// Note: len(l.Services) != 1 can be a false negative if the service informer is lagging,
// i.e. we may not see a recently created service. Given that services don't change very
// often (compared to much more frequent endpoint changes), cases when this method skips
// recording should be relatively rare. We intentionally accept this flaw to keep the
// solution simple.
if l.TT.IsZero() || len(l.Services) != 1 {
return
}
// If we're here it means that the Endpoints object is for a headless service and that
// the Endpoints object was created by the endpoints-controller (because the
// LastChangeTriggerTime annotation is set). It means that the corresponding service is a
// "headless service with selector".
DNSProgrammingLatency.WithLabelValues("headless_with_selector").
// If we're here it means that the Endpoints object was created by the endpoints-controller
// (because the LastChangeTriggerTime annotation is set) and the backing Service is known.
// For headless services this means the service has a selector ("headless_with_selector").
// For non-headless services the service kind is "cluster_ip".
serviceKind := "cluster_ip"
if l.Services[0].Headless() {
serviceKind = "headless_with_selector"
}
DNSProgrammingLatency.WithLabelValues(serviceKind).
Observe(DurationSinceFunc(l.TT).Seconds())
}

View File

@@ -0,0 +1,109 @@
package object
import (
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
dto "github.com/prometheus/client_model/go"
api "k8s.io/api/core/v1"
)
// histSampleCount reads the sample count for a specific service_kind label
// from a HistogramVec.
func histSampleCount(t *testing.T, vec *prometheus.HistogramVec, label string) uint64 {
t.Helper()
obs, err := vec.GetMetricWithLabelValues(label)
if err != nil {
t.Fatalf("GetMetricWithLabelValues(%q): %v", label, err)
}
m, ok := obs.(prometheus.Metric)
if !ok {
t.Fatalf("observer for label %q does not implement prometheus.Metric", label)
}
pb := &dto.Metric{}
if err := m.Write(pb); err != nil {
t.Fatalf("Write metric for label %q: %v", label, err)
}
return pb.GetHistogram().GetSampleCount()
}
// NOTE: subtests in this function must NOT call t.Parallel() — they swap
// global package-level vars (DNSProgrammingLatency, DurationSinceFunc).
func TestEndpointLatencyRecorder_record(t *testing.T) {
tests := []struct {
name string
services []*Service
ttSet bool
wantLabel string
wantSampleCount uint64
}{
{
name: "headless_with_selector: headless service with trigger annotation",
services: []*Service{{ClusterIPs: []string{api.ClusterIPNone}}},
ttSet: true,
wantLabel: "headless_with_selector",
wantSampleCount: 1,
},
{
name: "cluster_ip: ClusterIP service with trigger annotation",
services: []*Service{{ClusterIPs: []string{"10.0.0.1"}}},
ttSet: true,
wantLabel: "cluster_ip",
wantSampleCount: 1,
},
{
name: "no annotation on headless: TT zero means no observation",
services: []*Service{{ClusterIPs: []string{api.ClusterIPNone}}},
ttSet: false,
wantLabel: "headless_with_selector",
wantSampleCount: 0,
},
{
name: "no annotation on ClusterIP: TT zero means no observation",
services: []*Service{{ClusterIPs: []string{"10.0.0.1"}}},
ttSet: false,
wantLabel: "cluster_ip",
wantSampleCount: 0,
},
{
name: "informer lag: no backing service found, TT set, no observation",
services: nil,
ttSet: true,
wantLabel: "cluster_ip",
wantSampleCount: 0,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
// Replace global metric with a fresh unregistered histogram for isolation.
// Do NOT add t.Parallel() here — these subtests swap global package state.
origMetric := DNSProgrammingLatency
reg := prometheus.NewRegistry()
DNSProgrammingLatency = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Name: "test_dns_programming_duration_seconds",
Help: "test histogram",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 20),
}, []string{"service_kind"})
t.Cleanup(func() { DNSProgrammingLatency = origMetric })
origDurationSince := DurationSinceFunc
DurationSinceFunc = func(time.Time) time.Duration { return time.Second }
t.Cleanup(func() { DurationSinceFunc = origDurationSince })
rec := &EndpointLatencyRecorder{Services: tc.services}
if tc.ttSet {
rec.TT = time.Now().Add(-time.Second)
}
rec.record()
got := histSampleCount(t, DNSProgrammingLatency, tc.wantLabel)
if got != tc.wantSampleCount {
t.Errorf("sample count for label %q = %d, want %d", tc.wantLabel, got, tc.wantSampleCount)
}
})
}
}