Overloaded (#1364)

* plugin/health: add 'overloaded metrics'

Query our own health endpoint and record (and export as a metric) the
time it takes. The Get has a 5s timeout that, when reached, will set
the metric duration to 5s. The actual decision, "am I overloaded?", is left
to an external entity.

* README

* golint and govet

* and the tests
This commit is contained in:
Miek Gieben
2018-01-10 11:41:22 +00:00
committed by GitHub
parent cced1a4c12
commit 48059a6c3e
5 changed files with 90 additions and 7 deletions

View File

@@ -25,6 +25,14 @@ supports health checks has a section "Health" in their README.
Any plugin that implements the Healther interface will be used to report health. Any plugin that implements the Healther interface will be used to report health.
## Metrics
If monitoring is enabled (via the *prometheus* directive) then the following metric is exported:
* `coredns_health_request_duration_seconds{}` - duration to process a /health query. As this should
be a local operation, it should be fast. A (large) increase in this duration indicates the
CoreDNS process is having trouble keeping up.
## Examples ## Examples
Run another health endpoint on http://localhost:8091. Run another health endpoint on http://localhost:8091.

View File

@@ -21,9 +21,11 @@ type health struct {
h []Healther h []Healther
sync.RWMutex sync.RWMutex
ok bool // ok is the global boolean indicating an all healthy plugin stack ok bool // ok is the global boolean indicating an all healthy plugin stack
stop chan bool
} }
func (h *health) Startup() error { func (h *health) OnStartup() error {
if h.Addr == "" { if h.Addr == "" {
h.Addr = defAddr h.Addr = defAddr
} }
@@ -51,14 +53,20 @@ func (h *health) Startup() error {
go func() { go func() {
http.Serve(h.ln, h.mux) http.Serve(h.ln, h.mux)
}() }()
go func() {
h.overloaded()
}()
}) })
return nil return nil
} }
func (h *health) Shutdown() error { func (h *health) OnShutdown() error {
if h.ln != nil { if h.ln != nil {
return h.ln.Close() return h.ln.Close()
} }
h.stop <- true
return nil return nil
} }

View File

@@ -13,10 +13,10 @@ func TestHealth(t *testing.T) {
h := health{Addr: ":0"} h := health{Addr: ":0"}
h.h = append(h.h, &erratic.Erratic{}) h.h = append(h.h, &erratic.Erratic{})
if err := h.Startup(); err != nil { if err := h.OnStartup(); err != nil {
t.Fatalf("Unable to startup the health server: %v", err) t.Fatalf("Unable to startup the health server: %v", err)
} }
defer h.Shutdown() defer h.OnShutdown()
// Reconstruct the http address based on the port allocated by operating system. // Reconstruct the http address based on the port allocated by operating system.
address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path) address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path)

View File

@@ -0,0 +1,52 @@
package health
import (
	"io"
	"io/ioutil"
	"net/http"
	"sync"
	"time"

	"github.com/coredns/coredns/plugin"
	"github.com/prometheus/client_golang/prometheus"
)
// overloaded queries the health endpoint at "http://" + h.Addr once per second and
// records how long each query took in the HealthDuration metric.
//
// A query that fails for any reason (including hitting the client timeout) is
// recorded as the full timeout of 5 seconds. The function blocks, and only returns
// once a value is received on h.stop.
func (h *health) overloaded() {
	const timeout = 5 * time.Second

	client := http.Client{Timeout: timeout}
	url := "http://" + h.Addr

	tick := time.NewTicker(1 * time.Second)
	defer tick.Stop()

	for {
		select {
		case <-tick.C:
			start := time.Now()
			resp, err := client.Get(url)
			if err != nil {
				HealthDuration.Observe(timeout.Seconds())
				continue
			}
			// Read the body to EOF before closing it; otherwise the transport cannot
			// reuse the connection and every tick would dial a new one. A failed
			// drain only costs us that reuse, so the error is deliberately ignored.
			_, _ = io.Copy(ioutil.Discard, resp.Body)
			resp.Body.Close()
			HealthDuration.Observe(time.Since(start).Seconds())

		case <-h.stop:
			return
		}
	}
}
// Package-level metric state for the health plugin.
var (
	// HealthDuration is the histogram that records, in seconds, how long it took to
	// retrieve the health endpoint.
	HealthDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:      "request_duration_seconds",
		Subsystem: "health",
		Namespace: plugin.Namespace,
		Help:      "Histogram of the time (in seconds) each request took.",
		Buckets:   plugin.TimeBuckets,
	})

	// onceMetric ensures the one-time metric registration step runs only once.
	onceMetric sync.Once
)

View File

@@ -6,6 +6,7 @@ import (
"github.com/coredns/coredns/core/dnsserver" "github.com/coredns/coredns/core/dnsserver"
"github.com/coredns/coredns/plugin" "github.com/coredns/coredns/plugin"
"github.com/coredns/coredns/plugin/metrics"
"github.com/mholt/caddy" "github.com/mholt/caddy"
) )
@@ -23,7 +24,7 @@ func setup(c *caddy.Controller) error {
return plugin.Error("health", err) return plugin.Error("health", err)
} }
h := &health{Addr: addr} h := &health{Addr: addr, stop: make(chan bool)}
c.OnStartup(func() error { c.OnStartup(func() error {
plugins := dnsserver.GetConfig(c).Handlers() plugins := dnsserver.GetConfig(c).Handlers()
@@ -36,6 +37,7 @@ func setup(c *caddy.Controller) error {
}) })
c.OnStartup(func() error { c.OnStartup(func() error {
// Poll all middleware every second.
h.poll() h.poll()
go func() { go func() {
for { for {
@@ -46,8 +48,21 @@ func setup(c *caddy.Controller) error {
return nil return nil
}) })
c.OnStartup(h.Startup) c.OnStartup(func() error {
c.OnFinalShutdown(h.Shutdown) onceMetric.Do(func() {
m := dnsserver.GetConfig(c).Handler("prometheus")
if m == nil {
return
}
if x, ok := m.(*metrics.Metrics); ok {
x.MustRegister(HealthDuration)
}
})
return nil
})
c.OnStartup(h.OnStartup)
c.OnFinalShutdown(h.OnShutdown)
// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running. // Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running.
return nil return nil