mirror of
				https://github.com/coredns/coredns.git
				synced 2025-10-27 08:14:18 -04:00 
			
		
		
		
	Overloaded (#1364)
* plugin/health: add 'overloaded metrics' Query our own health endpoint and record (and export as a metric) the time it takes. The Get has a 5s timeout that, when reached, will set the metric duration to 5s. The actual call of "am I overloaded" is left to an external entity. * README * golint and govet * and the tests
This commit is contained in:
		| @@ -25,6 +25,14 @@ supports health checks has a section "Health" in their README. | ||||
|  | ||||
| Any plugin that implements the Healther interface will be used to report health. | ||||
|  | ||||
| ## Metrics | ||||
|  | ||||
| If monitoring is enabled (via the *prometheus* directive) then the following metric is exported: | ||||
|  | ||||
| * `coredns_health_request_duration_seconds{}` - duration to process a /health query. As this should | ||||
|   be a local operation it should be fast. A (large) increase in this duration indicates the | ||||
|   CoreDNS process is having trouble keeping up. | ||||
|  | ||||
| ## Examples | ||||
|  | ||||
| Run another health endpoint on http://localhost:8091. | ||||
|   | ||||
| @@ -21,9 +21,11 @@ type health struct { | ||||
| 	h []Healther | ||||
| 	sync.RWMutex | ||||
| 	ok bool // ok is the global boolean indicating an all healthy plugin stack | ||||
|  | ||||
| 	stop chan bool | ||||
| } | ||||
|  | ||||
| func (h *health) Startup() error { | ||||
| func (h *health) OnStartup() error { | ||||
| 	if h.Addr == "" { | ||||
| 		h.Addr = defAddr | ||||
| 	} | ||||
| @@ -51,14 +53,20 @@ func (h *health) Startup() error { | ||||
| 		go func() { | ||||
| 			http.Serve(h.ln, h.mux) | ||||
| 		}() | ||||
| 		go func() { | ||||
| 			h.overloaded() | ||||
| 		}() | ||||
| 	}) | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func (h *health) Shutdown() error { | ||||
| func (h *health) OnShutdown() error { | ||||
| 	if h.ln != nil { | ||||
| 		return h.ln.Close() | ||||
| 	} | ||||
|  | ||||
| 	h.stop <- true | ||||
|  | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -13,10 +13,10 @@ func TestHealth(t *testing.T) { | ||||
| 	h := health{Addr: ":0"} | ||||
| 	h.h = append(h.h, &erratic.Erratic{}) | ||||
|  | ||||
| 	if err := h.Startup(); err != nil { | ||||
| 	if err := h.OnStartup(); err != nil { | ||||
| 		t.Fatalf("Unable to startup the health server: %v", err) | ||||
| 	} | ||||
| 	defer h.Shutdown() | ||||
| 	defer h.OnShutdown() | ||||
|  | ||||
| 	// Reconstruct the http address based on the port allocated by operating system. | ||||
| 	address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path) | ||||
|   | ||||
							
								
								
									
										52
									
								
								plugin/health/overloaded.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								plugin/health/overloaded.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,52 @@ | ||||
| package health | ||||
|  | ||||
| import ( | ||||
| 	"net/http" | ||||
| 	"sync" | ||||
| 	"time" | ||||
|  | ||||
| 	"github.com/coredns/coredns/plugin" | ||||
|  | ||||
| 	"github.com/prometheus/client_golang/prometheus" | ||||
| ) | ||||
|  | ||||
| // overloaded polls the health address once per second with an HTTP GET and records how long each request took in HealthDuration; a failed GET is recorded as the full 5s client timeout. It returns once a value is received on h.stop. | ||||
| func (h *health) overloaded() { | ||||
| 	timeout := time.Duration(5 * time.Second) | ||||
| 	client := http.Client{ | ||||
| 		Timeout: timeout, | ||||
| 	} | ||||
| 	url := "http://" + h.Addr | ||||
| 	tick := time.NewTicker(1 * time.Second) | ||||
|  | ||||
| 	for { | ||||
| 		select { | ||||
| 		case <-tick.C: | ||||
| 			start := time.Now() | ||||
| 			resp, err := client.Get(url) | ||||
| 			if err != nil { | ||||
| 				HealthDuration.Observe(timeout.Seconds()) | ||||
| 				continue | ||||
| 			} | ||||
| 			resp.Body.Close() | ||||
| 			HealthDuration.Observe(time.Since(start).Seconds()) | ||||
|  | ||||
| 		case <-h.stop: | ||||
| 			tick.Stop() | ||||
| 			return | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
|  | ||||
| var ( | ||||
| 	// HealthDuration is the histogram, in seconds, of how long each GET issued against the health endpoint took; it is exported under plugin.Namespace with subsystem "health" and uses the plugin.TimeBuckets buckets. | ||||
| 	HealthDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||||
| 		Namespace: plugin.Namespace, | ||||
| 		Subsystem: "health", | ||||
| 		Name:      "request_duration_seconds", | ||||
| 		Buckets:   plugin.TimeBuckets, | ||||
| 		Help:      "Histogram of the time (in seconds) each request took.", | ||||
| 	}) | ||||
| ) | ||||
|  | ||||
| var onceMetric sync.Once | ||||
| @@ -6,6 +6,7 @@ import ( | ||||
|  | ||||
| 	"github.com/coredns/coredns/core/dnsserver" | ||||
| 	"github.com/coredns/coredns/plugin" | ||||
| 	"github.com/coredns/coredns/plugin/metrics" | ||||
|  | ||||
| 	"github.com/mholt/caddy" | ||||
| ) | ||||
| @@ -23,7 +24,7 @@ func setup(c *caddy.Controller) error { | ||||
| 		return plugin.Error("health", err) | ||||
| 	} | ||||
|  | ||||
| 	h := &health{Addr: addr} | ||||
| 	h := &health{Addr: addr, stop: make(chan bool)} | ||||
|  | ||||
| 	c.OnStartup(func() error { | ||||
| 		plugins := dnsserver.GetConfig(c).Handlers() | ||||
| @@ -36,6 +37,7 @@ func setup(c *caddy.Controller) error { | ||||
| 	}) | ||||
|  | ||||
| 	c.OnStartup(func() error { | ||||
| 		// Poll all middleware every second. | ||||
| 		h.poll() | ||||
| 		go func() { | ||||
| 			for { | ||||
| @@ -46,8 +48,21 @@ func setup(c *caddy.Controller) error { | ||||
| 		return nil | ||||
| 	}) | ||||
|  | ||||
| 	c.OnStartup(h.Startup) | ||||
| 	c.OnFinalShutdown(h.Shutdown) | ||||
| 	c.OnStartup(func() error { | ||||
| 		onceMetric.Do(func() { | ||||
| 			m := dnsserver.GetConfig(c).Handler("prometheus") | ||||
| 			if m == nil { | ||||
| 				return | ||||
| 			} | ||||
| 			if x, ok := m.(*metrics.Metrics); ok { | ||||
| 				x.MustRegister(HealthDuration) | ||||
| 			} | ||||
| 		}) | ||||
| 		return nil | ||||
| 	}) | ||||
|  | ||||
| 	c.OnStartup(h.OnStartup) | ||||
| 	c.OnFinalShutdown(h.OnShutdown) | ||||
|  | ||||
| 	// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running. | ||||
| 	return nil | ||||
|   | ||||
		Reference in New Issue
	
	Block a user