package health import ( "context" "net/http" "time" "github.com/coredns/coredns/plugin" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) // overloaded queries the health end point and updates a metrics showing how long it took. func (h *health) overloaded(ctx context.Context) { timeout := 3 * time.Second client := http.Client{ Timeout: timeout, } url := "http://" + h.Addr + "/health" req, _ := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) tick := time.NewTicker(1 * time.Second) defer tick.Stop() for { select { case <-tick.C: start := time.Now() resp, err := client.Do(req) if err != nil && ctx.Err() == context.Canceled { // request was cancelled by parent goroutine return } if err != nil { HealthDuration.Observe(time.Since(start).Seconds()) HealthFailures.Inc() log.Warningf("Local health request to %q failed: %s", url, err) continue } resp.Body.Close() elapsed := time.Since(start) HealthDuration.Observe(elapsed.Seconds()) if elapsed > time.Second { // 1s is pretty random, but a *local* scrape taking that long isn't good log.Warningf("Local health request to %q took more than 1s: %s", url, elapsed) } case <-ctx.Done(): return } } } var ( // HealthDuration is the metric used for exporting how fast we can retrieve the /health endpoint. HealthDuration = promauto.NewHistogram(prometheus.HistogramOpts{ Namespace: plugin.Namespace, Subsystem: "health", Name: "request_duration_seconds", Buckets: plugin.SlimTimeBuckets, Help: "Histogram of the time (in seconds) each request took.", }) // HealthFailures is the metric used to count how many times the health request failed HealthFailures = promauto.NewCounter(prometheus.CounterOpts{ Namespace: plugin.Namespace, Subsystem: "health", Name: "request_failures_total", Help: "The number of times the health check failed.", }) )