Files
coredns/plugin/pkg/healthcheck/healthcheck.go
Miek Gieben e34e2c251f plugin/proxy: kick of HC on every 3rd failure (#1110)
* healthchecks: check on every 3rd failure

Check on every third failure and some cleanups to make this possible. A
failed healthcheck will never increase Fails, a successfull healthceck
will reset Fails to 0. This is a chance this counter now drops below 0,
making the upstream super? healthy.

This removes the okUntil smartness and condences everything back to 1
metrics: Fails; so it's simpler in that regard.

Timout errors are *not* attributed to the local upstream, and don't get
counted into the Fails anymore. Meaning the 'dig any isc.org' won't kill
your upstream.

Added extra test the see if the Fails counter gets reset after 3 failed
connection.

There is still a disconnect beween HTTP healthceck working the proxy (or
lookup) not being able to connect to the upstream.

* Fix tests
2017-10-15 19:38:39 +02:00

235 lines
5.6 KiB
Go

package healthcheck
import (
"io"
"io/ioutil"
"log"
"net"
"net/http"
"net/url"
"sync"
"sync/atomic"
"time"
)
// UpstreamHostDownFunc can be used to customize how Down behaves.
type UpstreamHostDownFunc func(*UpstreamHost) bool
// UpstreamHost represents a single proxy upstream
type UpstreamHost struct {
Conns int64 // must be first field to be 64-bit aligned on 32-bit systems
Name string // IP address (and port) of this upstream host
Fails int32
FailTimeout time.Duration
CheckDown UpstreamHostDownFunc
CheckURL string
Checking bool
sync.Mutex
}
// Down checks whether the upstream host is down or not.
// Down will try to use uh.CheckDown first, and will fall
// back to some default criteria if necessary.
func (uh *UpstreamHost) Down() bool {
if uh.CheckDown == nil {
fails := atomic.LoadInt32(&uh.Fails)
return fails > 0
}
return uh.CheckDown(uh)
}
// HostPool is a collection of UpstreamHosts.
type HostPool []*UpstreamHost
// HealthCheck is used for performing healthcheck
// on a collection of upstream hosts and select
// one based on the policy.
type HealthCheck struct {
wg sync.WaitGroup // Used to wait for running goroutines to stop.
stop chan struct{} // Signals running goroutines to stop.
Hosts HostPool
Policy Policy
Spray Policy
FailTimeout time.Duration
MaxFails int32
Path string
Port string
Interval time.Duration
}
// Start starts the healthcheck
func (u *HealthCheck) Start() {
for i, h := range u.Hosts {
u.Hosts[i].CheckURL = u.normalizeCheckURL(h.Name)
}
u.stop = make(chan struct{})
if u.Path != "" {
u.wg.Add(1)
go func() {
defer u.wg.Done()
u.healthCheckWorker(u.stop)
}()
}
}
// Stop sends a signal to all goroutines started by this staticUpstream to exit
// and waits for them to finish before returning.
func (u *HealthCheck) Stop() error {
close(u.stop)
u.wg.Wait()
return nil
}
// This was moved into a thread so that each host could throw a health
// check at the same time. The reason for this is that if we are checking
// 3 hosts, and the first one is gone, and we spend minutes timing out to
// fail it, we would not have been doing any other health checks in that
// time. So we now have a per-host lock and a threaded health check.
//
// We use the Checking bool to avoid concurrent checks against the same
// host; if one is taking a long time, the next one will find a check in
// progress and simply return before trying.
//
// We are carefully avoiding having the mutex locked while we check,
// otherwise checks will back up, potentially a lot of them if a host is
// absent for a long time. This arrangement makes checks quickly see if
// they are the only one running and abort otherwise.
// HealthCheckURL performs the http.Get that implements healthcheck.
func (uh *UpstreamHost) HealthCheckURL() {
// Lock for our bool check. We don't just defer the unlock because
// we don't want the lock held while http.Get runs.
uh.Lock()
// We call HealthCheckURL from proxy.go and lookup.go, bail out when nothing
// is configured to healthcheck. Or we mid check? Don't run another one.
if uh.CheckURL == "" || uh.Checking { // nothing configured
uh.Unlock()
return
}
uh.Checking = true
uh.Unlock()
// default timeout (5s)
r, err := healthClient.Get(uh.CheckURL)
defer func() {
uh.Lock()
uh.Checking = false
uh.Unlock()
}()
if err != nil {
log.Printf("[WARNING] Host %s health check probe failed: %v", uh.Name, err)
return
}
if err == nil {
io.Copy(ioutil.Discard, r.Body)
r.Body.Close()
if r.StatusCode < 200 || r.StatusCode >= 400 {
log.Printf("[WARNING] Host %s health check returned HTTP code %d", uh.Name, r.StatusCode)
return
}
// We are healthy again, reset fails.
atomic.StoreInt32(&uh.Fails, 0)
return
}
}
func (u *HealthCheck) healthCheck() {
for _, host := range u.Hosts {
// locks/bools should prevent requests backing up
go host.HealthCheckURL()
}
}
func (u *HealthCheck) healthCheckWorker(stop chan struct{}) {
ticker := time.NewTicker(u.Interval)
u.healthCheck()
for {
select {
case <-ticker.C:
u.healthCheck()
case <-stop:
ticker.Stop()
return
}
}
}
// Select selects an upstream host based on the policy
// and the healthcheck result.
func (u *HealthCheck) Select() *UpstreamHost {
pool := u.Hosts
if len(pool) == 1 {
if pool[0].Down() && u.Spray == nil {
return nil
}
return pool[0]
}
allDown := true
for _, host := range pool {
if !host.Down() {
allDown = false
break
}
}
if allDown {
if u.Spray == nil {
return nil
}
return u.Spray.Select(pool)
}
if u.Policy == nil {
h := (&Random{}).Select(pool)
if h != nil {
return h
}
if h == nil && u.Spray == nil {
return nil
}
return u.Spray.Select(pool)
}
h := u.Policy.Select(pool)
if h != nil {
return h
}
if u.Spray == nil {
return nil
}
return u.Spray.Select(pool)
}
// normalizeCheckURL creates a proper URL for the health check.
func (u *HealthCheck) normalizeCheckURL(name string) string {
// The DNS server might be an HTTP server. If so, extract its name.
hostName := name
ret, err := url.Parse(name)
if err == nil && len(ret.Host) > 0 {
hostName = ret.Host
}
// Extract the port number from the parsed server name.
checkHostName, checkPort, err := net.SplitHostPort(hostName)
if err != nil {
checkHostName = hostName
}
if u.Port != "" {
checkPort = u.Port
}
checkURL := "http://" + net.JoinHostPort(checkHostName, checkPort) + u.Path
return checkURL
}
var healthClient = func() *http.Client { return &http.Client{Timeout: 5 * time.Second} }()