Implement debugging with TXT records

Signed-off-by: Miek Gieben <miek@miek.nl>
This commit is contained in:
Miek Gieben
2020-03-06 09:13:27 +01:00
parent deb582259a
commit b00ff13ca0
6 changed files with 64 additions and 38 deletions

View File

@@ -27,13 +27,21 @@ the *management cluster* (see `cluster` below in "Syntax"). By default the name
When bootstrapping *traffic* tries to retrieve the cluster endpoints for the management cluster, When bootstrapping *traffic* tries to retrieve the cluster endpoints for the management cluster,
when the cluster is not found *traffic* will return a fatal error. when the cluster is not found *traffic* will return a fatal error.
The *traffic* plugin handles A, AAAA and SRV queries. Queries for non-existent clusters get a The *traffic* plugin handles A, AAAA, SRV and TXT queries. TXT queries are purely used for debugging
NXDOMAIN, where the minimal TTL is also set to 5s. as health status of the endpoints is ignored in that case.
Queries for non-existent clusters get a NXDOMAIN, where the minimal TTL is also set to 5s.
For A and AAAA queries each DNS response contains a single IP address that's considered the best For A and AAAA queries each DNS response contains a single IP address that's considered the best
one. The TTL on these answers is set to 5s. It will only return successful responses either with an one. The TTL on these answers is set to 5s. It will only return successful responses either with an
answer or, otherwise, a NODATA response. answer or, otherwise, a NODATA response.
TXT replies will use the SRV record format augmented with the health status of each backend, as this
is useful for debugging.
~~~
web.lb.example.org. 5 IN TXT "100" "100" "18008" "endpoint-0.web.lb.example.org." "HEALTHY"
~~~
For SRV queries *all* healthy backends will be returned - assuming the client doing the query For SRV queries *all* healthy backends will be returned - assuming the client doing the query
is smart enough to select the best one. When SRV records are returned, the endpoint DNS names is smart enough to select the best one. When SRV records are returned, the endpoint DNS names
are synthesized `endpoint-<N>.<cluster>.<zone>` that carries the IP address. Querying for these are synthesized `endpoint-<N>.<cluster>.<zone>` that carries the IP address. Querying for these
@@ -55,8 +63,7 @@ traffic TO...
This enables the *traffic* plugin, with a default node ID of `coredns` and no TLS. This enables the *traffic* plugin, with a default node ID of `coredns` and no TLS.
* **TO...** are the control plane endpoints to bootstrap from. These must start with `grpc://`. The * **TO...** are the control plane endpoints to bootstrap from. These must start with `grpc://`. The
port number defaults to 443, if not specified. These endpoint will be tried in the order given. port number defaults to 443, if not specified. These endpoints will be tried in the order given.
The first successful connection will be used to resolve the management cluster `xds`.
The extended syntax is available if you want more control. The extended syntax is available if you want more control.
@@ -66,7 +73,6 @@ traffic TO... {
id ID id ID
tls CERT KEY CA tls CERT KEY CA
tls_servername NAME tls_servername NAME
ignore_health
} }
~~~ ~~~
@@ -93,8 +99,6 @@ traffic TO... {
* `tls_servername` **NAME** allows you to set a server name in the TLS configuration. This is * `tls_servername` **NAME** allows you to set a server name in the TLS configuration. This is
needed because *traffic* connects to an IP address, so it can't infer the server name from it. needed because *traffic* connects to an IP address, so it can't infer the server name from it.
* `ignore_health` can be enabled to ignore endpoint health status, this can aid in debugging.
## Naming Clusters ## Naming Clusters
When a cluster is named, the name usually consists of a single word, e.g. "cluster-v0" or "web". When a cluster is named, the name usually consists of a single word, e.g. "cluster-v0" or "web".

View File

@@ -61,8 +61,6 @@ func setup(c *caddy.Controller) error {
i++ i++
goto redo goto redo
} }
// err == nil, we are connected
break
} }
}() }()
metrics.MustRegister(c, xds.ClusterGauge) metrics.MustRegister(c, xds.ClusterGauge)
@@ -132,8 +130,6 @@ func parseTraffic(c *caddy.Controller) (*Traffic, error) {
return nil, c.ArgErr() return nil, c.ArgErr()
} }
tlsServerName = c.Val() tlsServerName = c.Val()
case "ignore_health":
t.health = true
default: default:
return nil, c.Errf("unknown property '%s'", c.Val()) return nil, c.Errf("unknown property '%s'", c.Val())
} }

View File

@@ -13,6 +13,7 @@ import (
"github.com/coredns/coredns/plugin/traffic/xds" "github.com/coredns/coredns/plugin/traffic/xds"
"github.com/coredns/coredns/request" "github.com/coredns/coredns/request"
corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
"github.com/miekg/dns" "github.com/miekg/dns"
) )
@@ -25,7 +26,6 @@ type Traffic struct {
hosts []string hosts []string
id string id string
health bool
origins []string origins []string
Next plugin.Handler Next plugin.Handler
@@ -48,7 +48,8 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
m.SetReply(r) m.SetReply(r)
m.Authoritative = true m.Authoritative = true
sockaddr, ok := t.c.Select(cluster, t.health) healthy := state.QType() == dns.TypeTXT
sockaddr, ok := t.c.Select(cluster, healthy)
if !ok { if !ok {
// ok this cluster doesn't exist, potentially due to extra labels, which may be garbage or legit queries: // ok this cluster doesn't exist, potentially due to extra labels, which may be garbage or legit queries:
// legit is: // legit is:
@@ -69,14 +70,14 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
if strings.HasPrefix(strings.ToLower(labels[0]), "endpoint-") { if strings.HasPrefix(strings.ToLower(labels[0]), "endpoint-") {
// recheck if the cluster exists. // recheck if the cluster exists.
cluster = labels[1] cluster = labels[1]
sockaddr, ok = t.c.Select(cluster, t.health) sockaddr, ok = t.c.Select(cluster, healthy)
if !ok { if !ok {
m.Ns = soa(state.Zone) m.Ns = soa(state.Zone)
m.Rcode = dns.RcodeNameError m.Rcode = dns.RcodeNameError
w.WriteMsg(m) w.WriteMsg(m)
return 0, nil return 0, nil
} }
return t.serveEndpoint(ctx, state, labels[0], cluster) return t.serveEndpoint(ctx, state, labels[0], cluster, healthy)
} }
case 3: case 3:
if strings.ToLower(labels[0]) != "_grpclb" || strings.ToLower(labels[1]) != "_tcp" { if strings.ToLower(labels[0]) != "_grpclb" || strings.ToLower(labels[1]) != "_tcp" {
@@ -88,7 +89,7 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
// OK, _grpclb._tcp query; we need to return the endpoint for the mgmt cluster *NOT* the cluster // OK, _grpclb._tcp query; we need to return the endpoint for the mgmt cluster *NOT* the cluster
// we got the query for. This should exist, but we'll check later anyway. // we got the query for. This should exist, but we'll check later anyway.
cluster = t.mgmt cluster = t.mgmt
sockaddr, _ = t.c.Select(cluster, t.health) sockaddr, _ = t.c.Select(cluster, healthy)
break break
default: default:
m.Ns = soa(state.Zone) m.Ns = soa(state.Zone)
@@ -120,7 +121,7 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
} }
m.Answer = []dns.RR{&dns.AAAA{Hdr: dns.RR_Header{Name: state.QName(), Rrtype: dns.TypeAAAA, Class: dns.ClassINET, Ttl: 5}, AAAA: sockaddr.Address()}} m.Answer = []dns.RR{&dns.AAAA{Hdr: dns.RR_Header{Name: state.QName(), Rrtype: dns.TypeAAAA, Class: dns.ClassINET, Ttl: 5}, AAAA: sockaddr.Address()}}
case dns.TypeSRV: case dns.TypeSRV:
sockaddrs, _ := t.c.All(cluster, t.health) sockaddrs, _ := t.c.All(cluster, true)
m.Answer = make([]dns.RR, 0, len(sockaddrs)) m.Answer = make([]dns.RR, 0, len(sockaddrs))
m.Extra = make([]dns.RR, 0, len(sockaddrs)) m.Extra = make([]dns.RR, 0, len(sockaddrs))
for i, sa := range sockaddrs { for i, sa := range sockaddrs {
@@ -136,6 +137,18 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
m.Extra = append(m.Extra, &dns.A{Hdr: dns.RR_Header{Name: target, Rrtype: dns.TypeA, Class: dns.ClassINET, Ttl: 5}, A: sa.Address()}) m.Extra = append(m.Extra, &dns.A{Hdr: dns.RR_Header{Name: target, Rrtype: dns.TypeA, Class: dns.ClassINET, Ttl: 5}, A: sa.Address()})
} }
} }
case dns.TypeTXT:
sockaddrs, _ := t.c.All(cluster, false)
m.Answer = make([]dns.RR, 0, len(sockaddrs))
m.Extra = make([]dns.RR, 0, len(sockaddrs))
for i, sa := range sockaddrs {
target := fmt.Sprintf("endpoint-%d.%s.%s", i, cluster, state.Zone)
m.Answer = append(m.Answer, &dns.TXT{
Hdr: dns.RR_Header{Name: state.QName(), Rrtype: dns.TypeTXT, Class: dns.ClassINET, Ttl: 5},
Txt: []string{"100", "100", strconv.Itoa(int(sa.Port())), target, corepb.HealthStatus_name[int32(sa.Health)]}})
m.Extra = append(m.Extra, &dns.TXT{Hdr: dns.RR_Header{Name: target, Rrtype: dns.TypeTXT, Class: dns.ClassINET, Ttl: 5}, Txt: []string{sa.Address().String()}})
}
default: default:
m.Ns = soa(state.Zone) m.Ns = soa(state.Zone)
} }
@@ -144,7 +157,7 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
return 0, nil return 0, nil
} }
func (t *Traffic) serveEndpoint(ctx context.Context, state request.Request, endpoint, cluster string) (int, error) { func (t *Traffic) serveEndpoint(ctx context.Context, state request.Request, endpoint, cluster string, healthy bool) (int, error) {
m := new(dns.Msg) m := new(dns.Msg)
m.SetReply(state.Req) m.SetReply(state.Req)
m.Authoritative = true m.Authoritative = true
@@ -167,7 +180,7 @@ func (t *Traffic) serveEndpoint(ctx context.Context, state request.Request, endp
return 0, nil return 0, nil
} }
sockaddrs, _ := t.c.All(cluster, t.health) sockaddrs, _ := t.c.All(cluster, healthy)
if len(sockaddrs) < nr { if len(sockaddrs) < nr {
m.Ns = soa(state.Zone) m.Ns = soa(state.Zone)
m.Rcode = dns.RcodeNameError m.Rcode = dns.RcodeNameError

View File

@@ -76,6 +76,16 @@ func TestTraffic(t *testing.T) {
}, },
cluster: "web", qtype: dns.TypeA, rcode: dns.RcodeSuccess, answer: "127.0.0.2", cluster: "web", qtype: dns.TypeA, rcode: dns.RcodeSuccess, answer: "127.0.0.2",
}, },
// single endpoint with unknown health status, TXT query
{
cla: &endpointpb.ClusterLoadAssignment{
ClusterName: "web",
Endpoints: endpoints([]EndpointHealth{
{"127.0.0.1", 18008, corepb.HealthStatus_UNKNOWN},
}),
},
cluster: "web", qtype: dns.TypeTXT, rcode: dns.RcodeSuccess, answer: "endpoint-0.web.lb.example.org.",
},
// SRV query healthy endpoint // SRV query healthy endpoint
{ {
cla: &endpointpb.ClusterLoadAssignment{ cla: &endpointpb.ClusterLoadAssignment{
@@ -144,6 +154,8 @@ func TestTraffic(t *testing.T) {
addr = x.AAAA.String() addr = x.AAAA.String()
case *dns.SRV: case *dns.SRV:
addr = x.Target addr = x.Target
case *dns.TXT:
addr = x.Txt[3]
} }
if tc.answer != addr { if tc.answer != addr {
t.Errorf("Test %d: Expected answer %s, but got %s", i, tc.answer, addr) t.Errorf("Test %d: Expected answer %s, but got %s", i, tc.answer, addr)

View File

@@ -9,9 +9,10 @@ import (
endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
) )
// SocketAddress holds a corepb.SocketAddress. // SocketAddress holds a corepb.SocketAddress and a health status.
type SocketAddress struct { type SocketAddress struct {
*corepb.SocketAddress *corepb.SocketAddress
Health corepb.HealthStatus
} }
// Address returns the address from s. // Address returns the address from s.
@@ -71,38 +72,38 @@ func (a *assignment) clusters() []string {
} }
// Select selects an endpoint from cluster load assignments, using weighted random selection. It only selects endpoints that are reporting healthy. // Select selects an endpoint from cluster load assignments, using weighted random selection. It only selects endpoints that are reporting healthy.
func (a *assignment) Select(cluster string, ignore bool) (*SocketAddress, bool) { func (a *assignment) Select(cluster string, healthy bool) (*SocketAddress, bool) {
cla := a.ClusterLoadAssignment(cluster) cla := a.ClusterLoadAssignment(cluster)
if cla == nil { if cla == nil {
return nil, false return nil, false
} }
weight := 0 weight := 0
healthy := 0 health := 0
for _, ep := range cla.Endpoints { for _, ep := range cla.Endpoints {
for _, lb := range ep.GetLbEndpoints() { for _, lb := range ep.GetLbEndpoints() {
if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
continue continue
} }
weight += int(lb.GetLoadBalancingWeight().GetValue()) weight += int(lb.GetLoadBalancingWeight().GetValue())
healthy++ health++
} }
} }
if healthy == 0 { if health == 0 {
return nil, true return nil, true
} }
// all weights are 0, randomly select one of the endpoints, // all weights are 0, randomly select one of the endpoints,
if weight == 0 { if weight == 0 {
r := rand.Intn(healthy) r := rand.Intn(health)
i := 0 i := 0
for _, ep := range cla.Endpoints { for _, ep := range cla.Endpoints {
for _, lb := range ep.GetLbEndpoints() { for _, lb := range ep.GetLbEndpoints() {
if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
continue continue
} }
if r == i { if r == i {
return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress()}, true return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress(), lb.GetHealthStatus()}, true
} }
i++ i++
} }
@@ -110,15 +111,15 @@ func (a *assignment) Select(cluster string, ignore bool) (*SocketAddress, bool)
return nil, true return nil, true
} }
r := rand.Intn(healthy) + 1 r := rand.Intn(health) + 1
for _, ep := range cla.Endpoints { for _, ep := range cla.Endpoints {
for _, lb := range ep.GetLbEndpoints() { for _, lb := range ep.GetLbEndpoints() {
if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
continue continue
} }
r -= int(lb.GetLoadBalancingWeight().GetValue()) r -= int(lb.GetLoadBalancingWeight().GetValue())
if r <= 0 { if r <= 0 {
return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress()}, true return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress(), lb.GetHealthStatus()}, true
} }
} }
} }
@@ -126,7 +127,7 @@ func (a *assignment) Select(cluster string, ignore bool) (*SocketAddress, bool)
} }
// All returns all healthy endpoints. // All returns all healthy endpoints.
func (a *assignment) All(cluster string, ignore bool) ([]*SocketAddress, bool) { func (a *assignment) All(cluster string, healthy bool) ([]*SocketAddress, bool) {
cla := a.ClusterLoadAssignment(cluster) cla := a.ClusterLoadAssignment(cluster)
if cla == nil { if cla == nil {
return nil, false return nil, false
@@ -135,10 +136,10 @@ func (a *assignment) All(cluster string, ignore bool) ([]*SocketAddress, bool) {
sa := []*SocketAddress{} sa := []*SocketAddress{}
for _, ep := range cla.Endpoints { for _, ep := range cla.Endpoints {
for _, lb := range ep.GetLbEndpoints() { for _, lb := range ep.GetLbEndpoints() {
if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
continue continue
} }
sa = append(sa, &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress()}) sa = append(sa, &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress(), lb.GetHealthStatus()})
} }
} }
return sa, true return sa, true

View File

@@ -199,19 +199,19 @@ func (c *Client) receive(stream adsStream) error {
// Select returns an address that is deemed to be the correct one for this cluster. The returned // Select returns an address that is deemed to be the correct one for this cluster. The returned
// boolean indicates if the cluster exists. // boolean indicates if the cluster exists.
func (c *Client) Select(cluster string, ignore bool) (*SocketAddress, bool) { func (c *Client) Select(cluster string, healty bool) (*SocketAddress, bool) {
if cluster == "" { if cluster == "" {
return nil, false return nil, false
} }
return c.assignments.Select(cluster, ignore) return c.assignments.Select(cluster, healty)
} }
// All returns all endpoints. // All returns all endpoints.
func (c *Client) All(cluster string, ignore bool) ([]*SocketAddress, bool) { func (c *Client) All(cluster string, healty bool) ([]*SocketAddress, bool) {
if cluster == "" { if cluster == "" {
return nil, false return nil, false
} }
return c.assignments.All(cluster, ignore) return c.assignments.All(cluster, healty)
} }
// Locality holds the locality for this server. It contains a Region, Zone and SubZone. // Locality holds the locality for this server. It contains a Region, Zone and SubZone.