middleware/metrics: cleanup (#355)

* middleware/metrics: add more metrics

middleware/cache:
Add metrics for number of elements in the cache. Also export the total
size. Update README to detail the new metrics.

middleware/metrics

Move metrics into subpackage called "vars". This breaks the import
cycle and is cleaner. This allows vars.Report to be used in the
dnsserver to log refused queries.

middleware/metrics: tests

Add tests to the metrics framework. The metrics/test subpackage allows
scraping of the local server. Do a few test scrapes of the metrics that
are defined in the metrics middleware.

This also allows metrics integration tests to check if the caching and
dnssec middleware export their metrics correctly.

* update README

* typos

* fix tests
This commit is contained in:
Miek Gieben
2016-10-26 10:01:52 +01:00
committed by GitHub
parent 6d9d60081d
commit 219bfd0493
39 changed files with 828 additions and 259 deletions

View File

@@ -35,24 +35,24 @@ There is a third category (`error`) but those responses are never cached.
The minimum TTL allowed on resource records is 5 seconds.
If monitoring is enabled (via the *prometheus* directive) then the following extra metrics are added:
## Metrics
* coredns_cache_hit_count_total, and
* coredns_cache_miss_count_total
If monitoring is enabled (via the *prometheus* directive) then the following metrics are exported:
They both work on a per-zone basis and just count the hit and miss counts for each query.
* coredns_cache_size_guage{type} - total elements in the cache, type is either "denial" or "success".
* coredns_cache_capacity_gauge{type} - total capacity of the cache, type is either "denial" or "success".
## Examples
Enable caching for all zones, but cap everything to a TTL of 10 seconds:
~~~
cache 10
~~~
Enable caching for all zones, but cap everything to a TTL of 10 seconds.
Proxy to Google Public DNS and only cache responses for example.org (or below).
~~~
proxy . 8.8.8.8:53
cache example.org
~~~
Proxy to Google Public DNS and only cache responses for example.org (or below).

View File

@@ -79,6 +79,9 @@ func (c *ResponseWriter) WriteMsg(res *dns.Msg) error {
if key != "" {
c.set(res, key, mt, duration)
cacheSize.WithLabelValues(Success).Set(float64(c.pcache.Len()))
cacheSize.WithLabelValues(Denial).Set(float64(c.ncache.Len()))
}
setMsgTTL(res, uint32(duration.Seconds()))
@@ -103,7 +106,6 @@ func (c *ResponseWriter) set(m *dns.Msg, key string, mt response.Type, duration
case response.OtherError:
// don't cache these
// TODO(miek): what do we do with these?
default:
log.Printf("[WARNING] Caching called with unknown classification: %d", mt)
}
@@ -122,4 +124,9 @@ const (
minTTL = 5 * time.Second
defaultCap = 10000 // default capacity of the cache.
// Success is the class defined for positive caching.
Success = "success"
// Denial is the class defined for negative caching.
Denial = "denial"
)

View File

@@ -30,17 +30,15 @@ func (c *Cache) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg)
state.SizeAndDo(resp)
w.WriteMsg(resp)
cacheHitCount.WithLabelValues(zone).Inc()
return dns.RcodeSuccess, nil
}
cacheMissCount.WithLabelValues(zone).Inc()
crr := &ResponseWriter{w, c}
return c.Next.ServeDNS(ctx, crr, r)
}
// Name returns the name of this middleware, "cache".
func (c *Cache) Name() string { return "cache" }
func (c *Cache) get(qname string, qtype uint16, do bool) (*item, bool, bool) {
k := rawKey(qname, qtype, do)
@@ -55,24 +53,24 @@ func (c *Cache) get(qname string, qtype uint16, do bool) (*item, bool, bool) {
}
var (
cacheHitCount = prometheus.NewCounterVec(prometheus.CounterOpts{
cacheSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: middleware.Namespace,
Subsystem: subsystem,
Name: "hit_count_total",
Help: "Counter of DNS requests that were found in the cache.",
}, []string{"zone"})
Name: "size_guage",
Help: "Gauge of number of elements in the cache.",
}, []string{"type"})
cacheMissCount = prometheus.NewCounterVec(prometheus.CounterOpts{
cacheCapacity = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: middleware.Namespace,
Subsystem: subsystem,
Name: "miss_count_total",
Help: "Counter of DNS requests that were not found in the cache.",
}, []string{"zone"})
Name: "capacity_gauge",
Help: "Gauge of cache's capacity.",
}, []string{"type"})
)
const subsystem = "cache"
// init registers the cache's metric collectors with Prometheus at package
// load time. MustRegister panics if a collector cannot be registered.
func init() {
prometheus.MustRegister(cacheHitCount)
prometheus.MustRegister(cacheMissCount)
prometheus.MustRegister(cacheSize)
prometheus.MustRegister(cacheCapacity)
}

View File

@@ -28,6 +28,10 @@ func setup(c *caddy.Controller) error {
return ca
})
// Export the capacity for the metrics. This only happens once, because this is a re-load change only.
cacheCapacity.WithLabelValues(Success).Set(float64(ca.pcap))
cacheCapacity.WithLabelValues(Denial).Set(float64(ca.ncap))
return nil
}
@@ -58,7 +62,7 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
for c.NextBlock() {
switch c.Val() {
// first number is cap, second is a new ttl
case "success":
case Success:
args := c.RemainingArgs()
if len(args) == 0 {
return nil, c.ArgErr()
@@ -75,7 +79,7 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
}
ca.pttl = time.Duration(pttl) * time.Second
}
case "denial":
case Denial:
args := c.RemainingArgs()
if len(args) == 0 {
return nil, c.ArgErr()