plugin/proxy: kick of HC on every 3rd failure (#1110)

* healthchecks: check on every 3rd failure

Check on every third failure and some cleanups to make this possible. A
failed healthcheck will never increase Fails, a successfull healthceck
will reset Fails to 0. This is a chance this counter now drops below 0,
making the upstream super? healthy.

This removes the okUntil smartness and condences everything back to 1
metrics: Fails; so it's simpler in that regard.

Timout errors are *not* attributed to the local upstream, and don't get
counted into the Fails anymore. Meaning the 'dig any isc.org' won't kill
your upstream.

Added extra test the see if the Fails counter gets reset after 3 failed
connection.

There is still a disconnect beween HTTP healthceck working the proxy (or
lookup) not being able to connect to the upstream.

* Fix tests
This commit is contained in:
Miek Gieben
2017-10-15 19:38:39 +02:00
committed by GitHub
parent c7ff44fb3a
commit e34e2c251f
13 changed files with 180 additions and 190 deletions

View File

@@ -21,7 +21,6 @@ type UpstreamHost struct {
Name string // IP address (and port) of this upstream host
Fails int32
FailTimeout time.Duration
OkUntil time.Time
CheckDown UpstreamHostDownFunc
CheckURL string
Checking bool
@@ -33,19 +32,8 @@ type UpstreamHost struct {
// back to some default criteria if necessary.
func (uh *UpstreamHost) Down() bool {
if uh.CheckDown == nil {
// Default settings
fails := atomic.LoadInt32(&uh.Fails)
after := false
uh.Lock()
until := uh.OkUntil
uh.Unlock()
if !until.IsZero() && time.Now().After(until) {
after = true
}
return after || fails > 0
return fails > 0
}
return uh.CheckDown(uh)
}
@@ -64,7 +52,6 @@ type HealthCheck struct {
Spray Policy
FailTimeout time.Duration
MaxFails int32
Future time.Duration
Path string
Port string
Interval time.Duration
@@ -72,6 +59,10 @@ type HealthCheck struct {
// Start starts the healthcheck
func (u *HealthCheck) Start() {
for i, h := range u.Hosts {
u.Hosts[i].CheckURL = u.normalizeCheckURL(h.Name)
}
u.stop = make(chan struct{})
if u.Path != "" {
u.wg.Add(1)
@@ -104,14 +95,16 @@ func (u *HealthCheck) Stop() error {
// otherwise checks will back up, potentially a lot of them if a host is
// absent for a long time. This arrangement makes checks quickly see if
// they are the only one running and abort otherwise.
func (uh *UpstreamHost) healthCheckURL(nextTs time.Time) {
// lock for our bool check. We don't just defer the unlock because
// we don't want the lock held while http.Get runs
// HealthCheckURL performs the http.Get that implements healthcheck.
func (uh *UpstreamHost) HealthCheckURL() {
// Lock for our bool check. We don't just defer the unlock because
// we don't want the lock held while http.Get runs.
uh.Lock()
// are we mid check? Don't run another one
if uh.Checking {
// We call HealthCheckURL from proxy.go and lookup.go, bail out when nothing
// is configured to healthcheck. Or we mid check? Don't run another one.
if uh.CheckURL == "" || uh.Checking { // nothing configured
uh.Unlock()
return
}
@@ -119,64 +112,39 @@ func (uh *UpstreamHost) healthCheckURL(nextTs time.Time) {
uh.Checking = true
uh.Unlock()
// fetch that url. This has been moved into a go func because
// when the remote host is not merely not serving, but actually
// absent, then tcp syn timeouts can be very long, and so one
// fetch could last several check intervals
if r, err := http.Get(uh.CheckURL); err == nil {
// default timeout (5s)
r, err := healthClient.Get(uh.CheckURL)
defer func() {
uh.Lock()
uh.Checking = false
uh.Unlock()
}()
if err != nil {
log.Printf("[WARNING] Host %s health check probe failed: %v", uh.Name, err)
return
}
if err == nil {
io.Copy(ioutil.Discard, r.Body)
r.Body.Close()
if r.StatusCode < 200 || r.StatusCode >= 400 {
log.Printf("[WARNING] Host %s health check returned HTTP code %d", uh.Name, r.StatusCode)
nextTs = time.Unix(0, 0)
} else {
// We are healthy again, reset fails
atomic.StoreInt32(&uh.Fails, 0)
return
}
} else {
log.Printf("[WARNING] Host %s health check probe failed: %v", uh.Name, err)
nextTs = time.Unix(0, 0)
}
uh.Lock()
uh.Checking = false
uh.OkUntil = nextTs
uh.Unlock()
// We are healthy again, reset fails.
atomic.StoreInt32(&uh.Fails, 0)
return
}
}
func (u *HealthCheck) healthCheck() {
for _, host := range u.Hosts {
if host.CheckURL == "" {
var hostName, checkPort string
// The DNS server might be an HTTP server. If so, extract its name.
ret, err := url.Parse(host.Name)
if err == nil && len(ret.Host) > 0 {
hostName = ret.Host
} else {
hostName = host.Name
}
// Extract the port number from the parsed server name.
checkHostName, checkPort, err := net.SplitHostPort(hostName)
if err != nil {
checkHostName = hostName
}
if u.Port != "" {
checkPort = u.Port
}
host.CheckURL = "http://" + net.JoinHostPort(checkHostName, checkPort) + u.Path
}
// calculate next timestamp before the get
nextTs := time.Now().Add(u.Future)
// locks/bools should prevent requests backing up
go host.healthCheckURL(nextTs)
go host.HealthCheckURL()
}
}
@@ -239,3 +207,28 @@ func (u *HealthCheck) Select() *UpstreamHost {
}
return u.Spray.Select(pool)
}
// normalizeCheckURL creates a proper URL for the health check.
func (u *HealthCheck) normalizeCheckURL(name string) string {
// The DNS server might be an HTTP server. If so, extract its name.
hostName := name
ret, err := url.Parse(name)
if err == nil && len(ret.Host) > 0 {
hostName = ret.Host
}
// Extract the port number from the parsed server name.
checkHostName, checkPort, err := net.SplitHostPort(hostName)
if err != nil {
checkHostName = hostName
}
if u.Port != "" {
checkPort = u.Port
}
checkURL := "http://" + net.JoinHostPort(checkHostName, checkPort) + u.Path
return checkURL
}
var healthClient = func() *http.Client { return &http.Client{Timeout: 5 * time.Second} }()

View File

@@ -13,6 +13,8 @@ import (
var workableServer *httptest.Server
func TestMain(m *testing.M) {
log.SetOutput(ioutil.Discard)
workableServer = httptest.NewServer(http.HandlerFunc(
func(w http.ResponseWriter, r *http.Request) {
// do nothing
@@ -30,15 +32,9 @@ func (r *customPolicy) Select(pool HostPool) *UpstreamHost {
func testPool() HostPool {
pool := []*UpstreamHost{
{
Name: workableServer.URL, // this should resolve (healthcheck test)
},
{
Name: "http://shouldnot.resolve", // this shouldn't
},
{
Name: "http://C",
},
{Name: workableServer.URL}, // this should resolve (healthcheck test)
{Name: "http://shouldnot.resolve"}, // this shouldn't
{Name: "http://C"},
}
return HostPool(pool)
}
@@ -53,21 +49,21 @@ func TestRegisterPolicy(t *testing.T) {
}
// TODO(miek): Disabled for now, we should get out of the habit of using
// realtime in these tests .
func testHealthCheck(t *testing.T) {
log.SetOutput(ioutil.Discard)
func TestHealthCheck(t *testing.T) {
u := &HealthCheck{
Hosts: testPool(),
FailTimeout: 10 * time.Second,
Future: 60 * time.Second,
MaxFails: 1,
}
for i, h := range u.Hosts {
u.Hosts[i].Fails = 1
u.Hosts[i].CheckURL = u.normalizeCheckURL(h.Name)
}
u.healthCheck()
// sleep a bit, it's async now
time.Sleep(time.Duration(2 * time.Second))
time.Sleep(time.Duration(1 * time.Second)) // sleep a bit, it's async now
if u.Hosts[0].Down() {
t.Error("Expected first host in testpool to not fail healthcheck.")
@@ -77,25 +73,6 @@ func testHealthCheck(t *testing.T) {
}
}
func TestSelect(t *testing.T) {
u := &HealthCheck{
Hosts: testPool()[:3],
FailTimeout: 10 * time.Second,
Future: 60 * time.Second,
MaxFails: 1,
}
u.Hosts[0].OkUntil = time.Unix(0, 0)
u.Hosts[1].OkUntil = time.Unix(0, 0)
u.Hosts[2].OkUntil = time.Unix(0, 0)
if h := u.Select(); h != nil {
t.Error("Expected select to return nil as all host are down")
}
u.Hosts[2].OkUntil = time.Time{}
if h := u.Select(); h == nil {
t.Error("Expected select to not return nil")
}
}
func TestRoundRobinPolicy(t *testing.T) {
pool := testPool()
rrPolicy := &RoundRobin{}
@@ -109,10 +86,8 @@ func TestRoundRobinPolicy(t *testing.T) {
if h != pool[2] {
t.Error("Expected second round robin host to be third host in the pool.")
}
// mark host as down
pool[0].OkUntil = time.Unix(0, 0)
h = rrPolicy.Select(pool)
if h != pool[1] {
if h != pool[0] {
t.Error("Expected third round robin host to be first host in the pool.")
}
}