mirror of
https://github.com/coredns/coredns.git
synced 2026-04-05 11:45:33 -04:00
fix(kubernetes): record cluster_ip services in dns_programming_duration metric (#7951)
Signed-off-by: Azeez Syed <syedazeez337@gmail.com>
This commit is contained in:
@@ -267,7 +267,7 @@ The following are client level metrics to monitor apiserver request latency & st
|
|||||||
|
|
||||||
## Bugs
|
## Bugs
|
||||||
|
|
||||||
The duration metric only supports the "headless\_with\_selector" service currently.
|
The duration metric does not yet support the `headless_without_selector` service kind.
|
||||||
|
|
||||||
## See Also
|
## See Also
|
||||||
|
|
||||||
|
|||||||
@@ -62,22 +62,23 @@ func (l *EndpointLatencyRecorder) init(o meta.Object) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (l *EndpointLatencyRecorder) record() {
|
func (l *EndpointLatencyRecorder) record() {
|
||||||
// isHeadless indicates whether the endpoints object belongs to a headless
|
// Note: len(l.Services) != 1 can be a false negative if the service informer is lagging,
|
||||||
// service (i.e. clusterIp = None). Note that this can be a false negatives if the service
|
// i.e. we may not see a recently created service. Given that services don't change very
|
||||||
// informer is lagging, i.e. we may not see a recently created service. Given that the services
|
// often (compared to much more frequent endpoint changes), cases when this method skips
|
||||||
// don't change very often (comparing to much more frequent endpoints changes), cases when this method
|
// recording should be relatively rare. We intentionally accept this flaw to keep the
|
||||||
// will return wrong answer should be relatively rare. Because of that we intentionally accept this
|
// solution simple.
|
||||||
// flaw to keep the solution simple.
|
if l.TT.IsZero() || len(l.Services) != 1 {
|
||||||
isHeadless := len(l.Services) == 1 && l.Services[0].Headless()
|
|
||||||
|
|
||||||
if !isHeadless || l.TT.IsZero() {
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we're here it means that the Endpoints object is for a headless service and that
|
// If we're here it means that the Endpoints object was created by the endpoints-controller
|
||||||
// the Endpoints object was created by the endpoints-controller (because the
|
// (because the LastChangeTriggerTime annotation is set) and the backing Service is known.
|
||||||
// LastChangeTriggerTime annotation is set). It means that the corresponding service is a
|
// For headless services this means the service has a selector ("headless_with_selector").
|
||||||
// "headless service with selector".
|
// For non-headless services the service kind is "cluster_ip".
|
||||||
DNSProgrammingLatency.WithLabelValues("headless_with_selector").
|
serviceKind := "cluster_ip"
|
||||||
|
if l.Services[0].Headless() {
|
||||||
|
serviceKind = "headless_with_selector"
|
||||||
|
}
|
||||||
|
DNSProgrammingLatency.WithLabelValues(serviceKind).
|
||||||
Observe(DurationSinceFunc(l.TT).Seconds())
|
Observe(DurationSinceFunc(l.TT).Seconds())
|
||||||
}
|
}
|
||||||
|
|||||||
109
plugin/kubernetes/object/metrics_test.go
Normal file
109
plugin/kubernetes/object/metrics_test.go
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
package object
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||||
|
dto "github.com/prometheus/client_model/go"
|
||||||
|
api "k8s.io/api/core/v1"
|
||||||
|
)
|
||||||
|
|
||||||
|
// histSampleCount reads the sample count for a specific service_kind label
|
||||||
|
// from a HistogramVec.
|
||||||
|
func histSampleCount(t *testing.T, vec *prometheus.HistogramVec, label string) uint64 {
|
||||||
|
t.Helper()
|
||||||
|
obs, err := vec.GetMetricWithLabelValues(label)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetMetricWithLabelValues(%q): %v", label, err)
|
||||||
|
}
|
||||||
|
m, ok := obs.(prometheus.Metric)
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("observer for label %q does not implement prometheus.Metric", label)
|
||||||
|
}
|
||||||
|
pb := &dto.Metric{}
|
||||||
|
if err := m.Write(pb); err != nil {
|
||||||
|
t.Fatalf("Write metric for label %q: %v", label, err)
|
||||||
|
}
|
||||||
|
return pb.GetHistogram().GetSampleCount()
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: subtests in this function must NOT call t.Parallel() — they swap
|
||||||
|
// global package-level vars (DNSProgrammingLatency, DurationSinceFunc).
|
||||||
|
func TestEndpointLatencyRecorder_record(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
services []*Service
|
||||||
|
ttSet bool
|
||||||
|
wantLabel string
|
||||||
|
wantSampleCount uint64
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "headless_with_selector: headless service with trigger annotation",
|
||||||
|
services: []*Service{{ClusterIPs: []string{api.ClusterIPNone}}},
|
||||||
|
ttSet: true,
|
||||||
|
wantLabel: "headless_with_selector",
|
||||||
|
wantSampleCount: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "cluster_ip: ClusterIP service with trigger annotation",
|
||||||
|
services: []*Service{{ClusterIPs: []string{"10.0.0.1"}}},
|
||||||
|
ttSet: true,
|
||||||
|
wantLabel: "cluster_ip",
|
||||||
|
wantSampleCount: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "no annotation on headless: TT zero means no observation",
|
||||||
|
services: []*Service{{ClusterIPs: []string{api.ClusterIPNone}}},
|
||||||
|
ttSet: false,
|
||||||
|
wantLabel: "headless_with_selector",
|
||||||
|
wantSampleCount: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "no annotation on ClusterIP: TT zero means no observation",
|
||||||
|
services: []*Service{{ClusterIPs: []string{"10.0.0.1"}}},
|
||||||
|
ttSet: false,
|
||||||
|
wantLabel: "cluster_ip",
|
||||||
|
wantSampleCount: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "informer lag: no backing service found, TT set, no observation",
|
||||||
|
services: nil,
|
||||||
|
ttSet: true,
|
||||||
|
wantLabel: "cluster_ip",
|
||||||
|
wantSampleCount: 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
// Replace global metric with a fresh unregistered histogram for isolation.
|
||||||
|
// Do NOT add t.Parallel() here — these subtests swap global package state.
|
||||||
|
origMetric := DNSProgrammingLatency
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
DNSProgrammingLatency = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Name: "test_dns_programming_duration_seconds",
|
||||||
|
Help: "test histogram",
|
||||||
|
Buckets: prometheus.ExponentialBuckets(0.001, 2, 20),
|
||||||
|
}, []string{"service_kind"})
|
||||||
|
t.Cleanup(func() { DNSProgrammingLatency = origMetric })
|
||||||
|
|
||||||
|
origDurationSince := DurationSinceFunc
|
||||||
|
DurationSinceFunc = func(time.Time) time.Duration { return time.Second }
|
||||||
|
t.Cleanup(func() { DurationSinceFunc = origDurationSince })
|
||||||
|
|
||||||
|
rec := &EndpointLatencyRecorder{Services: tc.services}
|
||||||
|
if tc.ttSet {
|
||||||
|
rec.TT = time.Now().Add(-time.Second)
|
||||||
|
}
|
||||||
|
|
||||||
|
rec.record()
|
||||||
|
|
||||||
|
got := histSampleCount(t, DNSProgrammingLatency, tc.wantLabel)
|
||||||
|
if got != tc.wantSampleCount {
|
||||||
|
t.Errorf("sample count for label %q = %d, want %d", tc.wantLabel, got, tc.wantSampleCount)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user