mirror of
https://github.com/coredns/coredns.git
synced 2025-11-11 22:42:21 -05:00
plugin/proxy: kick of HC on every 3rd failure (#1110)
* healthchecks: check on every 3rd failure Check on every third failure and some cleanups to make this possible. A failed healthcheck will never increase Fails, a successfull healthceck will reset Fails to 0. This is a chance this counter now drops below 0, making the upstream super? healthy. This removes the okUntil smartness and condences everything back to 1 metrics: Fails; so it's simpler in that regard. Timout errors are *not* attributed to the local upstream, and don't get counted into the Fails anymore. Meaning the 'dig any isc.org' won't kill your upstream. Added extra test the see if the Fails counter gets reset after 3 failed connection. There is still a disconnect beween HTTP healthceck working the proxy (or lookup) not being able to connect to the upstream. * Fix tests
This commit is contained in:
@@ -21,7 +21,6 @@ type UpstreamHost struct {
|
||||
Name string // IP address (and port) of this upstream host
|
||||
Fails int32
|
||||
FailTimeout time.Duration
|
||||
OkUntil time.Time
|
||||
CheckDown UpstreamHostDownFunc
|
||||
CheckURL string
|
||||
Checking bool
|
||||
@@ -33,19 +32,8 @@ type UpstreamHost struct {
|
||||
// back to some default criteria if necessary.
|
||||
func (uh *UpstreamHost) Down() bool {
|
||||
if uh.CheckDown == nil {
|
||||
// Default settings
|
||||
fails := atomic.LoadInt32(&uh.Fails)
|
||||
after := false
|
||||
|
||||
uh.Lock()
|
||||
until := uh.OkUntil
|
||||
uh.Unlock()
|
||||
|
||||
if !until.IsZero() && time.Now().After(until) {
|
||||
after = true
|
||||
}
|
||||
|
||||
return after || fails > 0
|
||||
return fails > 0
|
||||
}
|
||||
return uh.CheckDown(uh)
|
||||
}
|
||||
@@ -64,7 +52,6 @@ type HealthCheck struct {
|
||||
Spray Policy
|
||||
FailTimeout time.Duration
|
||||
MaxFails int32
|
||||
Future time.Duration
|
||||
Path string
|
||||
Port string
|
||||
Interval time.Duration
|
||||
@@ -72,6 +59,10 @@ type HealthCheck struct {
|
||||
|
||||
// Start starts the healthcheck
|
||||
func (u *HealthCheck) Start() {
|
||||
for i, h := range u.Hosts {
|
||||
u.Hosts[i].CheckURL = u.normalizeCheckURL(h.Name)
|
||||
}
|
||||
|
||||
u.stop = make(chan struct{})
|
||||
if u.Path != "" {
|
||||
u.wg.Add(1)
|
||||
@@ -104,14 +95,16 @@ func (u *HealthCheck) Stop() error {
|
||||
// otherwise checks will back up, potentially a lot of them if a host is
|
||||
// absent for a long time. This arrangement makes checks quickly see if
|
||||
// they are the only one running and abort otherwise.
|
||||
func (uh *UpstreamHost) healthCheckURL(nextTs time.Time) {
|
||||
|
||||
// lock for our bool check. We don't just defer the unlock because
|
||||
// we don't want the lock held while http.Get runs
|
||||
// HealthCheckURL performs the http.Get that implements healthcheck.
|
||||
func (uh *UpstreamHost) HealthCheckURL() {
|
||||
// Lock for our bool check. We don't just defer the unlock because
|
||||
// we don't want the lock held while http.Get runs.
|
||||
uh.Lock()
|
||||
|
||||
// are we mid check? Don't run another one
|
||||
if uh.Checking {
|
||||
// We call HealthCheckURL from proxy.go and lookup.go, bail out when nothing
|
||||
// is configured to healthcheck. Or we mid check? Don't run another one.
|
||||
if uh.CheckURL == "" || uh.Checking { // nothing configured
|
||||
uh.Unlock()
|
||||
return
|
||||
}
|
||||
@@ -119,64 +112,39 @@ func (uh *UpstreamHost) healthCheckURL(nextTs time.Time) {
|
||||
uh.Checking = true
|
||||
uh.Unlock()
|
||||
|
||||
// fetch that url. This has been moved into a go func because
|
||||
// when the remote host is not merely not serving, but actually
|
||||
// absent, then tcp syn timeouts can be very long, and so one
|
||||
// fetch could last several check intervals
|
||||
if r, err := http.Get(uh.CheckURL); err == nil {
|
||||
// default timeout (5s)
|
||||
r, err := healthClient.Get(uh.CheckURL)
|
||||
|
||||
defer func() {
|
||||
uh.Lock()
|
||||
uh.Checking = false
|
||||
uh.Unlock()
|
||||
}()
|
||||
|
||||
if err != nil {
|
||||
log.Printf("[WARNING] Host %s health check probe failed: %v", uh.Name, err)
|
||||
return
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
io.Copy(ioutil.Discard, r.Body)
|
||||
r.Body.Close()
|
||||
|
||||
if r.StatusCode < 200 || r.StatusCode >= 400 {
|
||||
log.Printf("[WARNING] Host %s health check returned HTTP code %d", uh.Name, r.StatusCode)
|
||||
nextTs = time.Unix(0, 0)
|
||||
} else {
|
||||
// We are healthy again, reset fails
|
||||
atomic.StoreInt32(&uh.Fails, 0)
|
||||
return
|
||||
}
|
||||
} else {
|
||||
log.Printf("[WARNING] Host %s health check probe failed: %v", uh.Name, err)
|
||||
nextTs = time.Unix(0, 0)
|
||||
}
|
||||
|
||||
uh.Lock()
|
||||
uh.Checking = false
|
||||
uh.OkUntil = nextTs
|
||||
uh.Unlock()
|
||||
// We are healthy again, reset fails.
|
||||
atomic.StoreInt32(&uh.Fails, 0)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func (u *HealthCheck) healthCheck() {
|
||||
for _, host := range u.Hosts {
|
||||
|
||||
if host.CheckURL == "" {
|
||||
var hostName, checkPort string
|
||||
|
||||
// The DNS server might be an HTTP server. If so, extract its name.
|
||||
ret, err := url.Parse(host.Name)
|
||||
if err == nil && len(ret.Host) > 0 {
|
||||
hostName = ret.Host
|
||||
} else {
|
||||
hostName = host.Name
|
||||
}
|
||||
|
||||
// Extract the port number from the parsed server name.
|
||||
checkHostName, checkPort, err := net.SplitHostPort(hostName)
|
||||
if err != nil {
|
||||
checkHostName = hostName
|
||||
}
|
||||
|
||||
if u.Port != "" {
|
||||
checkPort = u.Port
|
||||
}
|
||||
|
||||
host.CheckURL = "http://" + net.JoinHostPort(checkHostName, checkPort) + u.Path
|
||||
}
|
||||
|
||||
// calculate next timestamp before the get
|
||||
nextTs := time.Now().Add(u.Future)
|
||||
|
||||
// locks/bools should prevent requests backing up
|
||||
go host.healthCheckURL(nextTs)
|
||||
go host.HealthCheckURL()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -239,3 +207,28 @@ func (u *HealthCheck) Select() *UpstreamHost {
|
||||
}
|
||||
return u.Spray.Select(pool)
|
||||
}
|
||||
|
||||
// normalizeCheckURL creates a proper URL for the health check.
|
||||
func (u *HealthCheck) normalizeCheckURL(name string) string {
|
||||
// The DNS server might be an HTTP server. If so, extract its name.
|
||||
hostName := name
|
||||
ret, err := url.Parse(name)
|
||||
if err == nil && len(ret.Host) > 0 {
|
||||
hostName = ret.Host
|
||||
}
|
||||
|
||||
// Extract the port number from the parsed server name.
|
||||
checkHostName, checkPort, err := net.SplitHostPort(hostName)
|
||||
if err != nil {
|
||||
checkHostName = hostName
|
||||
}
|
||||
|
||||
if u.Port != "" {
|
||||
checkPort = u.Port
|
||||
}
|
||||
|
||||
checkURL := "http://" + net.JoinHostPort(checkHostName, checkPort) + u.Path
|
||||
return checkURL
|
||||
}
|
||||
|
||||
var healthClient = func() *http.Client { return &http.Client{Timeout: 5 * time.Second} }()
|
||||
|
||||
@@ -13,6 +13,8 @@ import (
|
||||
var workableServer *httptest.Server
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
log.SetOutput(ioutil.Discard)
|
||||
|
||||
workableServer = httptest.NewServer(http.HandlerFunc(
|
||||
func(w http.ResponseWriter, r *http.Request) {
|
||||
// do nothing
|
||||
@@ -30,15 +32,9 @@ func (r *customPolicy) Select(pool HostPool) *UpstreamHost {
|
||||
|
||||
func testPool() HostPool {
|
||||
pool := []*UpstreamHost{
|
||||
{
|
||||
Name: workableServer.URL, // this should resolve (healthcheck test)
|
||||
},
|
||||
{
|
||||
Name: "http://shouldnot.resolve", // this shouldn't
|
||||
},
|
||||
{
|
||||
Name: "http://C",
|
||||
},
|
||||
{Name: workableServer.URL}, // this should resolve (healthcheck test)
|
||||
{Name: "http://shouldnot.resolve"}, // this shouldn't
|
||||
{Name: "http://C"},
|
||||
}
|
||||
return HostPool(pool)
|
||||
}
|
||||
@@ -53,21 +49,21 @@ func TestRegisterPolicy(t *testing.T) {
|
||||
|
||||
}
|
||||
|
||||
// TODO(miek): Disabled for now, we should get out of the habit of using
|
||||
// realtime in these tests .
|
||||
func testHealthCheck(t *testing.T) {
|
||||
log.SetOutput(ioutil.Discard)
|
||||
|
||||
func TestHealthCheck(t *testing.T) {
|
||||
u := &HealthCheck{
|
||||
Hosts: testPool(),
|
||||
FailTimeout: 10 * time.Second,
|
||||
Future: 60 * time.Second,
|
||||
MaxFails: 1,
|
||||
}
|
||||
|
||||
for i, h := range u.Hosts {
|
||||
u.Hosts[i].Fails = 1
|
||||
u.Hosts[i].CheckURL = u.normalizeCheckURL(h.Name)
|
||||
}
|
||||
|
||||
u.healthCheck()
|
||||
// sleep a bit, it's async now
|
||||
time.Sleep(time.Duration(2 * time.Second))
|
||||
|
||||
time.Sleep(time.Duration(1 * time.Second)) // sleep a bit, it's async now
|
||||
|
||||
if u.Hosts[0].Down() {
|
||||
t.Error("Expected first host in testpool to not fail healthcheck.")
|
||||
@@ -77,25 +73,6 @@ func testHealthCheck(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestSelect(t *testing.T) {
|
||||
u := &HealthCheck{
|
||||
Hosts: testPool()[:3],
|
||||
FailTimeout: 10 * time.Second,
|
||||
Future: 60 * time.Second,
|
||||
MaxFails: 1,
|
||||
}
|
||||
u.Hosts[0].OkUntil = time.Unix(0, 0)
|
||||
u.Hosts[1].OkUntil = time.Unix(0, 0)
|
||||
u.Hosts[2].OkUntil = time.Unix(0, 0)
|
||||
if h := u.Select(); h != nil {
|
||||
t.Error("Expected select to return nil as all host are down")
|
||||
}
|
||||
u.Hosts[2].OkUntil = time.Time{}
|
||||
if h := u.Select(); h == nil {
|
||||
t.Error("Expected select to not return nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRoundRobinPolicy(t *testing.T) {
|
||||
pool := testPool()
|
||||
rrPolicy := &RoundRobin{}
|
||||
@@ -109,10 +86,8 @@ func TestRoundRobinPolicy(t *testing.T) {
|
||||
if h != pool[2] {
|
||||
t.Error("Expected second round robin host to be third host in the pool.")
|
||||
}
|
||||
// mark host as down
|
||||
pool[0].OkUntil = time.Unix(0, 0)
|
||||
h = rrPolicy.Select(pool)
|
||||
if h != pool[1] {
|
||||
if h != pool[0] {
|
||||
t.Error("Expected third round robin host to be first host in the pool.")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user