Cleanup metrics (#3776)

Cleanup a variety of metric issues.
* Eliminate department of redundancy "count_total" naming.
* Use the plural of the unit when appropriate. (ex, "requests")
* Remove label names from metric names where appropriate. (ex, "rcode")
* Simplify request metrics by consolidating the type label into the base
request counter.
* Re-generate man pages.

Signed-off-by: Ben Kochie <superq@gmail.com>

Co-authored-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
Miek Gieben
2020-03-26 09:17:33 +01:00
committed by GitHub
parent eb23cce1a7
commit 19cfa2960c
16 changed files with 39 additions and 50 deletions

View File

@@ -108,12 +108,12 @@ On each endpoint, the timeouts of the communication are set by default and autom
If monitoring is enabled (via the *prometheus* plugin) then the following metrics are exported:
* `coredns_forward_request_duration_seconds{to}` - duration per upstream interaction.
* `coredns_forward_request_count_total{to}` - query count per upstream.
* `coredns_forward_response_rcode_count_total{to, rcode}` - count of RCODEs per upstream.
* `coredns_forward_healthcheck_failure_count_total{to}` - number of failed health checks per upstream.
* `coredns_forward_healthcheck_broken_count_total{}` - counter of when all upstreams are unhealthy,
* `coredns_forward_requests_total{to}` - query count per upstream.
* `coredns_forward_responses_total{to, rcode}` - count of RCODEs per upstream.
* `coredns_forward_healthcheck_failures_total{to}` - number of failed health checks per upstream.
* `coredns_forward_healthcheck_broken_total{}` - counter of when all upstreams are unhealthy,
and we are randomly (this always uses the `random` policy) spraying to an upstream.
* `max_concurrent_reject_count_total{}` - counter of the number of queries rejected because the
* `max_concurrent_rejects_total{}` - counter of the number of queries rejected because the
number of concurrent queries was at the maximum.
Where `to` is one of the upstream servers (**TO** from the config), `rcode` is the returned RCODE
from the upstream.

View File

@@ -11,13 +11,13 @@ var (
RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "request_count_total",
Name: "requests_total",
Help: "Counter of requests made per upstream.",
}, []string{"to"})
RcodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "response_rcode_count_total",
Name: "responses_total",
Help: "Counter of requests made per upstream.",
}, []string{"rcode", "to"})
RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
@@ -30,13 +30,13 @@ var (
HealthcheckFailureCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "healthcheck_failure_count_total",
Name: "healthcheck_failures_total",
Help: "Counter of the number of failed healthchecks.",
}, []string{"to"})
HealthcheckBrokenCount = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "healthcheck_broken_count_total",
Name: "healthcheck_broken_total",
Help: "Counter of the number of complete failures of the healthchecks.",
})
SocketGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
@@ -48,7 +48,7 @@ var (
MaxConcurrentRejectCount = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "max_concurrent_reject_count_total",
Name: "max_concurrent_rejects_total",
Help: "Counter of the number of queries rejected because the concurrent queries were at maximum.",
})
)