mirror of
https://github.com/coredns/coredns.git
synced 2025-12-18 16:15:18 -05:00
Reduce the cardinality of health endpoint metrics (#4650)
The health endpoint histogram has a large amount of cardinality for a simple endpoint. Introduce a new "Slim" set of buckets for `/health` to reduce the metrics load on large deployments, especially those that have per-node DNS caching services. Add a metric to count internal health check failures rather than using the timeout value as a side-effect monitor of the check error. This avoids incorrectly recording the timeout value if there is an error that is not a timeout (e.g. refused). Signed-off-by: SuperQ <superq@gmail.com>
This commit is contained in:
@@ -26,7 +26,8 @@ func (h *health) overloaded() {
|
||||
start := time.Now()
|
||||
resp, err := client.Get(url)
|
||||
if err != nil {
|
||||
HealthDuration.Observe(timeout.Seconds())
|
||||
HealthDuration.Observe(time.Since(start).Seconds())
|
||||
HealthFailures.Inc()
|
||||
log.Warningf("Local health request to %q failed: %s", url, err)
|
||||
continue
|
||||
}
|
||||
@@ -49,7 +50,14 @@ var (
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "health",
|
||||
Name: "request_duration_seconds",
|
||||
Buckets: plugin.TimeBuckets,
|
||||
Buckets: plugin.SlimTimeBuckets,
|
||||
Help: "Histogram of the time (in seconds) each request took.",
|
||||
})
|
||||
// HealthFailures is the metric used to count how many times the health request failed.
|
||||
HealthFailures = promauto.NewCounter(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "health",
|
||||
Name: "request_failures_total",
|
||||
Help: "The number of times the health check failed.",
|
||||
})
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user