mirror of
https://github.com/coredns/coredns.git
synced 2025-10-27 08:14:18 -04:00
Overloaded (#1364)
* plugin/health: add 'overloaded' metrics. Query our own health endpoint and record (and export as a metric) the time it takes. The Get has a 5s timeout that, when reached, will set the metric duration to 5s. The actual call on "am I overloaded" is left to an external entity. * README * golint and govet * and the tests
This commit is contained in:
@@ -25,6 +25,14 @@ supports health checks has a section "Health" in their README.
|
||||
|
||||
Any plugin that implements the Healther interface will be used to report health.
|
||||
|
||||
## Metrics
|
||||
|
||||
If monitoring is enabled (via the *prometheus* directive) then the following metric is exported:
|
||||
|
||||
* `coredns_health_request_duration_seconds{}` - duration to process a /health query. As this should
|
||||
be a local operation it should be fast. A (large) increase in this duration indicates the
|
||||
CoreDNS process is having trouble keeping up.
|
||||
|
||||
## Examples
|
||||
|
||||
Run another health endpoint on http://localhost:8091.
|
||||
|
||||
@@ -21,9 +21,11 @@ type health struct {
|
||||
h []Healther
|
||||
sync.RWMutex
|
||||
ok bool // ok is the global boolean indicating an all healthy plugin stack
|
||||
|
||||
stop chan bool
|
||||
}
|
||||
|
||||
func (h *health) Startup() error {
|
||||
func (h *health) OnStartup() error {
|
||||
if h.Addr == "" {
|
||||
h.Addr = defAddr
|
||||
}
|
||||
@@ -51,14 +53,20 @@ func (h *health) Startup() error {
|
||||
go func() {
|
||||
http.Serve(h.ln, h.mux)
|
||||
}()
|
||||
go func() {
|
||||
h.overloaded()
|
||||
}()
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *health) Shutdown() error {
|
||||
func (h *health) OnShutdown() error {
|
||||
if h.ln != nil {
|
||||
return h.ln.Close()
|
||||
}
|
||||
|
||||
h.stop <- true
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -13,10 +13,10 @@ func TestHealth(t *testing.T) {
|
||||
h := health{Addr: ":0"}
|
||||
h.h = append(h.h, &erratic.Erratic{})
|
||||
|
||||
if err := h.Startup(); err != nil {
|
||||
if err := h.OnStartup(); err != nil {
|
||||
t.Fatalf("Unable to startup the health server: %v", err)
|
||||
}
|
||||
defer h.Shutdown()
|
||||
defer h.OnShutdown()
|
||||
|
||||
// Reconstruct the http address based on the port allocated by operating system.
|
||||
address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path)
|
||||
|
||||
52
plugin/health/overloaded.go
Normal file
52
plugin/health/overloaded.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package health
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/coredns/coredns/plugin"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// overloaded queries the health end point and updates a metrics showing how long it took.
|
||||
func (h *health) overloaded() {
|
||||
timeout := time.Duration(5 * time.Second)
|
||||
client := http.Client{
|
||||
Timeout: timeout,
|
||||
}
|
||||
url := "http://" + h.Addr
|
||||
tick := time.NewTicker(1 * time.Second)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-tick.C:
|
||||
start := time.Now()
|
||||
resp, err := client.Get(url)
|
||||
if err != nil {
|
||||
HealthDuration.Observe(timeout.Seconds())
|
||||
continue
|
||||
}
|
||||
resp.Body.Close()
|
||||
HealthDuration.Observe(time.Since(start).Seconds())
|
||||
|
||||
case <-h.stop:
|
||||
tick.Stop()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
// HealthDuration is the metric used for exporting how fast we can retrieve the /health endpoint.
|
||||
HealthDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "health",
|
||||
Name: "request_duration_seconds",
|
||||
Buckets: plugin.TimeBuckets,
|
||||
Help: "Histogram of the time (in seconds) each request took.",
|
||||
})
|
||||
)
|
||||
|
||||
// onceMetric guards the registration of HealthDuration with the prometheus handler
// in setup, so the registration attempt runs a single time.
var onceMetric sync.Once
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
|
||||
"github.com/coredns/coredns/core/dnsserver"
|
||||
"github.com/coredns/coredns/plugin"
|
||||
"github.com/coredns/coredns/plugin/metrics"
|
||||
|
||||
"github.com/mholt/caddy"
|
||||
)
|
||||
@@ -23,7 +24,7 @@ func setup(c *caddy.Controller) error {
|
||||
return plugin.Error("health", err)
|
||||
}
|
||||
|
||||
h := &health{Addr: addr}
|
||||
h := &health{Addr: addr, stop: make(chan bool)}
|
||||
|
||||
c.OnStartup(func() error {
|
||||
plugins := dnsserver.GetConfig(c).Handlers()
|
||||
@@ -36,6 +37,7 @@ func setup(c *caddy.Controller) error {
|
||||
})
|
||||
|
||||
c.OnStartup(func() error {
|
||||
// Poll all middleware every second.
|
||||
h.poll()
|
||||
go func() {
|
||||
for {
|
||||
@@ -46,8 +48,21 @@ func setup(c *caddy.Controller) error {
|
||||
return nil
|
||||
})
|
||||
|
||||
c.OnStartup(h.Startup)
|
||||
c.OnFinalShutdown(h.Shutdown)
|
||||
c.OnStartup(func() error {
|
||||
onceMetric.Do(func() {
|
||||
m := dnsserver.GetConfig(c).Handler("prometheus")
|
||||
if m == nil {
|
||||
return
|
||||
}
|
||||
if x, ok := m.(*metrics.Metrics); ok {
|
||||
x.MustRegister(HealthDuration)
|
||||
}
|
||||
})
|
||||
return nil
|
||||
})
|
||||
|
||||
c.OnStartup(h.OnStartup)
|
||||
c.OnFinalShutdown(h.OnShutdown)
|
||||
|
||||
// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running.
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user