mirror of
https://github.com/coredns/coredns.git
synced 2025-10-28 08:44:17 -04:00
Overloaded (#1364)
* plugin/health: add 'overloaded metrics' Query our own health endpoint and record (and export as a metric) the time it takes. The Get has a 5s timeout that, when reached, will set the metric duration to 5s. The actual call on "Am I overloaded?" is left to an external entity. * README * golint and govet * and the tests
This commit is contained in:
@@ -25,6 +25,14 @@ supports health checks has a section "Health" in their README.
|
|||||||
|
|
||||||
Any plugin that implements the Healther interface will be used to report health.
|
Any plugin that implements the Healther interface will be used to report health.
|
||||||
|
|
||||||
|
## Metrics
|
||||||
|
|
||||||
|
If monitoring is enabled (via the *prometheus* directive) then the following metric is exported:
|
||||||
|
|
||||||
|
* `coredns_health_request_duration_seconds{}` - duration to process a /health query. As this should
|
||||||
|
be a local operation it should be fast. A (large) increase in this duration indicates the
|
||||||
|
CoreDNS process is having trouble keeping up.
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
Run another health endpoint on http://localhost:8091.
|
Run another health endpoint on http://localhost:8091.
|
||||||
|
|||||||
@@ -21,9 +21,11 @@ type health struct {
|
|||||||
h []Healther
|
h []Healther
|
||||||
sync.RWMutex
|
sync.RWMutex
|
||||||
ok bool // ok is the global boolean indicating an all healthy plugin stack
|
ok bool // ok is the global boolean indicating an all healthy plugin stack
|
||||||
|
|
||||||
|
stop chan bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *health) Startup() error {
|
func (h *health) OnStartup() error {
|
||||||
if h.Addr == "" {
|
if h.Addr == "" {
|
||||||
h.Addr = defAddr
|
h.Addr = defAddr
|
||||||
}
|
}
|
||||||
@@ -51,14 +53,20 @@ func (h *health) Startup() error {
|
|||||||
go func() {
|
go func() {
|
||||||
http.Serve(h.ln, h.mux)
|
http.Serve(h.ln, h.mux)
|
||||||
}()
|
}()
|
||||||
|
go func() {
|
||||||
|
h.overloaded()
|
||||||
|
}()
|
||||||
})
|
})
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *health) Shutdown() error {
|
func (h *health) OnShutdown() error {
|
||||||
if h.ln != nil {
|
if h.ln != nil {
|
||||||
return h.ln.Close()
|
return h.ln.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
h.stop <- true
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -13,10 +13,10 @@ func TestHealth(t *testing.T) {
|
|||||||
h := health{Addr: ":0"}
|
h := health{Addr: ":0"}
|
||||||
h.h = append(h.h, &erratic.Erratic{})
|
h.h = append(h.h, &erratic.Erratic{})
|
||||||
|
|
||||||
if err := h.Startup(); err != nil {
|
if err := h.OnStartup(); err != nil {
|
||||||
t.Fatalf("Unable to startup the health server: %v", err)
|
t.Fatalf("Unable to startup the health server: %v", err)
|
||||||
}
|
}
|
||||||
defer h.Shutdown()
|
defer h.OnShutdown()
|
||||||
|
|
||||||
// Reconstruct the http address based on the port allocated by operating system.
|
// Reconstruct the http address based on the port allocated by operating system.
|
||||||
address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path)
|
address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path)
|
||||||
|
|||||||
52
plugin/health/overloaded.go
Normal file
52
plugin/health/overloaded.go
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
package health
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/coredns/coredns/plugin"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
// overloaded queries the health end point and updates a metrics showing how long it took.
|
||||||
|
func (h *health) overloaded() {
|
||||||
|
timeout := time.Duration(5 * time.Second)
|
||||||
|
client := http.Client{
|
||||||
|
Timeout: timeout,
|
||||||
|
}
|
||||||
|
url := "http://" + h.Addr
|
||||||
|
tick := time.NewTicker(1 * time.Second)
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-tick.C:
|
||||||
|
start := time.Now()
|
||||||
|
resp, err := client.Get(url)
|
||||||
|
if err != nil {
|
||||||
|
HealthDuration.Observe(timeout.Seconds())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
resp.Body.Close()
|
||||||
|
HealthDuration.Observe(time.Since(start).Seconds())
|
||||||
|
|
||||||
|
case <-h.stop:
|
||||||
|
tick.Stop()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
// HealthDuration is the metric used for exporting how fast we can retrieve the /health endpoint.
|
||||||
|
HealthDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
|
||||||
|
Namespace: plugin.Namespace,
|
||||||
|
Subsystem: "health",
|
||||||
|
Name: "request_duration_seconds",
|
||||||
|
Buckets: plugin.TimeBuckets,
|
||||||
|
Help: "Histogram of the time (in seconds) each request took.",
|
||||||
|
})
|
||||||
|
)
|
||||||
|
|
||||||
|
var onceMetric sync.Once
|
||||||
@@ -6,6 +6,7 @@ import (
|
|||||||
|
|
||||||
"github.com/coredns/coredns/core/dnsserver"
|
"github.com/coredns/coredns/core/dnsserver"
|
||||||
"github.com/coredns/coredns/plugin"
|
"github.com/coredns/coredns/plugin"
|
||||||
|
"github.com/coredns/coredns/plugin/metrics"
|
||||||
|
|
||||||
"github.com/mholt/caddy"
|
"github.com/mholt/caddy"
|
||||||
)
|
)
|
||||||
@@ -23,7 +24,7 @@ func setup(c *caddy.Controller) error {
|
|||||||
return plugin.Error("health", err)
|
return plugin.Error("health", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
h := &health{Addr: addr}
|
h := &health{Addr: addr, stop: make(chan bool)}
|
||||||
|
|
||||||
c.OnStartup(func() error {
|
c.OnStartup(func() error {
|
||||||
plugins := dnsserver.GetConfig(c).Handlers()
|
plugins := dnsserver.GetConfig(c).Handlers()
|
||||||
@@ -36,6 +37,7 @@ func setup(c *caddy.Controller) error {
|
|||||||
})
|
})
|
||||||
|
|
||||||
c.OnStartup(func() error {
|
c.OnStartup(func() error {
|
||||||
|
// Poll all middleware every second.
|
||||||
h.poll()
|
h.poll()
|
||||||
go func() {
|
go func() {
|
||||||
for {
|
for {
|
||||||
@@ -46,8 +48,21 @@ func setup(c *caddy.Controller) error {
|
|||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
|
|
||||||
c.OnStartup(h.Startup)
|
c.OnStartup(func() error {
|
||||||
c.OnFinalShutdown(h.Shutdown)
|
onceMetric.Do(func() {
|
||||||
|
m := dnsserver.GetConfig(c).Handler("prometheus")
|
||||||
|
if m == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if x, ok := m.(*metrics.Metrics); ok {
|
||||||
|
x.MustRegister(HealthDuration)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
|
||||||
|
c.OnStartup(h.OnStartup)
|
||||||
|
c.OnFinalShutdown(h.OnShutdown)
|
||||||
|
|
||||||
// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running.
|
// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running.
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
Reference in New Issue
Block a user