mirror of
https://github.com/coredns/coredns.git
synced 2026-04-04 19:25:40 -04:00
fix(kubernetes): record cluster_ip services in dns_programming_duration metric (#7951)
Signed-off-by: Azeez Syed <syedazeez337@gmail.com>
This commit is contained in:
@@ -267,7 +267,7 @@ The following are client level metrics to monitor apiserver request latency & st
|
||||
|
||||
## Bugs
|
||||
|
||||
The duration metric only supports the "headless\_with\_selector" service currently.
|
||||
The duration metric does not yet support the `headless_without_selector` service kind.
|
||||
|
||||
## See Also
|
||||
|
||||
|
||||
@@ -62,22 +62,23 @@ func (l *EndpointLatencyRecorder) init(o meta.Object) {
|
||||
}
|
||||
|
||||
func (l *EndpointLatencyRecorder) record() {
|
||||
// isHeadless indicates whether the endpoints object belongs to a headless
|
||||
// service (i.e. clusterIp = None). Note that this can be a false negatives if the service
|
||||
// informer is lagging, i.e. we may not see a recently created service. Given that the services
|
||||
// don't change very often (comparing to much more frequent endpoints changes), cases when this method
|
||||
// will return wrong answer should be relatively rare. Because of that we intentionally accept this
|
||||
// flaw to keep the solution simple.
|
||||
isHeadless := len(l.Services) == 1 && l.Services[0].Headless()
|
||||
|
||||
if !isHeadless || l.TT.IsZero() {
|
||||
// Note: len(l.Services) != 1 can be a false negative if the service informer is lagging,
|
||||
// i.e. we may not see a recently created service. Given that services don't change very
|
||||
// often (compared to much more frequent endpoint changes), cases when this method skips
|
||||
// recording should be relatively rare. We intentionally accept this flaw to keep the
|
||||
// solution simple.
|
||||
if l.TT.IsZero() || len(l.Services) != 1 {
|
||||
return
|
||||
}
|
||||
|
||||
// If we're here it means that the Endpoints object is for a headless service and that
|
||||
// the Endpoints object was created by the endpoints-controller (because the
|
||||
// LastChangeTriggerTime annotation is set). It means that the corresponding service is a
|
||||
// "headless service with selector".
|
||||
DNSProgrammingLatency.WithLabelValues("headless_with_selector").
|
||||
// If we're here it means that the Endpoints object was created by the endpoints-controller
|
||||
// (because the LastChangeTriggerTime annotation is set) and the backing Service is known.
|
||||
// For headless services this means the service has a selector ("headless_with_selector").
|
||||
// For non-headless services the service kind is "cluster_ip".
|
||||
serviceKind := "cluster_ip"
|
||||
if l.Services[0].Headless() {
|
||||
serviceKind = "headless_with_selector"
|
||||
}
|
||||
DNSProgrammingLatency.WithLabelValues(serviceKind).
|
||||
Observe(DurationSinceFunc(l.TT).Seconds())
|
||||
}
|
||||
|
||||
109
plugin/kubernetes/object/metrics_test.go
Normal file
109
plugin/kubernetes/object/metrics_test.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package object
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
api "k8s.io/api/core/v1"
|
||||
)
|
||||
|
||||
// histSampleCount reads the sample count for a specific service_kind label
|
||||
// from a HistogramVec.
|
||||
func histSampleCount(t *testing.T, vec *prometheus.HistogramVec, label string) uint64 {
|
||||
t.Helper()
|
||||
obs, err := vec.GetMetricWithLabelValues(label)
|
||||
if err != nil {
|
||||
t.Fatalf("GetMetricWithLabelValues(%q): %v", label, err)
|
||||
}
|
||||
m, ok := obs.(prometheus.Metric)
|
||||
if !ok {
|
||||
t.Fatalf("observer for label %q does not implement prometheus.Metric", label)
|
||||
}
|
||||
pb := &dto.Metric{}
|
||||
if err := m.Write(pb); err != nil {
|
||||
t.Fatalf("Write metric for label %q: %v", label, err)
|
||||
}
|
||||
return pb.GetHistogram().GetSampleCount()
|
||||
}
|
||||
|
||||
// NOTE: subtests in this function must NOT call t.Parallel() — they swap
|
||||
// global package-level vars (DNSProgrammingLatency, DurationSinceFunc).
|
||||
func TestEndpointLatencyRecorder_record(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
services []*Service
|
||||
ttSet bool
|
||||
wantLabel string
|
||||
wantSampleCount uint64
|
||||
}{
|
||||
{
|
||||
name: "headless_with_selector: headless service with trigger annotation",
|
||||
services: []*Service{{ClusterIPs: []string{api.ClusterIPNone}}},
|
||||
ttSet: true,
|
||||
wantLabel: "headless_with_selector",
|
||||
wantSampleCount: 1,
|
||||
},
|
||||
{
|
||||
name: "cluster_ip: ClusterIP service with trigger annotation",
|
||||
services: []*Service{{ClusterIPs: []string{"10.0.0.1"}}},
|
||||
ttSet: true,
|
||||
wantLabel: "cluster_ip",
|
||||
wantSampleCount: 1,
|
||||
},
|
||||
{
|
||||
name: "no annotation on headless: TT zero means no observation",
|
||||
services: []*Service{{ClusterIPs: []string{api.ClusterIPNone}}},
|
||||
ttSet: false,
|
||||
wantLabel: "headless_with_selector",
|
||||
wantSampleCount: 0,
|
||||
},
|
||||
{
|
||||
name: "no annotation on ClusterIP: TT zero means no observation",
|
||||
services: []*Service{{ClusterIPs: []string{"10.0.0.1"}}},
|
||||
ttSet: false,
|
||||
wantLabel: "cluster_ip",
|
||||
wantSampleCount: 0,
|
||||
},
|
||||
{
|
||||
name: "informer lag: no backing service found, TT set, no observation",
|
||||
services: nil,
|
||||
ttSet: true,
|
||||
wantLabel: "cluster_ip",
|
||||
wantSampleCount: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
// Replace global metric with a fresh unregistered histogram for isolation.
|
||||
// Do NOT add t.Parallel() here — these subtests swap global package state.
|
||||
origMetric := DNSProgrammingLatency
|
||||
reg := prometheus.NewRegistry()
|
||||
DNSProgrammingLatency = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
|
||||
Name: "test_dns_programming_duration_seconds",
|
||||
Help: "test histogram",
|
||||
Buckets: prometheus.ExponentialBuckets(0.001, 2, 20),
|
||||
}, []string{"service_kind"})
|
||||
t.Cleanup(func() { DNSProgrammingLatency = origMetric })
|
||||
|
||||
origDurationSince := DurationSinceFunc
|
||||
DurationSinceFunc = func(time.Time) time.Duration { return time.Second }
|
||||
t.Cleanup(func() { DurationSinceFunc = origDurationSince })
|
||||
|
||||
rec := &EndpointLatencyRecorder{Services: tc.services}
|
||||
if tc.ttSet {
|
||||
rec.TT = time.Now().Add(-time.Second)
|
||||
}
|
||||
|
||||
rec.record()
|
||||
|
||||
got := histSampleCount(t, DNSProgrammingLatency, tc.wantLabel)
|
||||
if got != tc.wantSampleCount {
|
||||
t.Errorf("sample count for label %q = %d, want %d", tc.wantLabel, got, tc.wantSampleCount)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user