From b00ff13ca0254a6bceba7a785ce92b1548f0a143 Mon Sep 17 00:00:00 2001 From: Miek Gieben Date: Fri, 6 Mar 2020 09:13:27 +0100 Subject: [PATCH] Implement debugging with TXT records Signed-off-by: Miek Gieben --- plugin/traffic/README.md | 18 +++++++++++------- plugin/traffic/setup.go | 4 ---- plugin/traffic/traffic.go | 29 +++++++++++++++++++++-------- plugin/traffic/traffic_test.go | 12 ++++++++++++ plugin/traffic/xds/assignment.go | 31 ++++++++++++++++--------------- plugin/traffic/xds/client.go | 8 ++++---- 6 files changed, 64 insertions(+), 38 deletions(-) diff --git a/plugin/traffic/README.md b/plugin/traffic/README.md index 0e269ae24..98297be4d 100644 --- a/plugin/traffic/README.md +++ b/plugin/traffic/README.md @@ -27,13 +27,21 @@ the *management cluster* (see `cluster` below in "Syntax"). By default the name When bootstrapping *traffic* tries to retrieve the cluster endpoints for the management cluster, when the cluster is not found *traffic* will return a fatal error. -The *traffic* plugin handles A, AAAA and SRV queries. Queries for non-existent clusters get a -NXDOMAIN, where the minimal TTL is also set to 5s. +The *traffic* plugin handles A, AAAA, SRV and TXT queries. TXT queries are purely used for debugging +as health status of the endpoints is ignored in that case. +Queries for non-existent clusters get a NXDOMAIN, where the minimal TTL is also set to 5s. For A and AAAA queries each DNS response contains a single IP address that's considered the best one. The TTL on these answer is set to 5s. It will only return successful responses either with an answer or, otherwise, a NODATA response. +TXT replies will use the SRV record format augmented with the health status of each backend, as this +is useful for debugging. + +~~~ +web.lb.example.org. 5 IN TXT "100" "100" "18008" "endpoint-0.web.lb.example.org." "HEALTHY" +~~~ + For SRV queries *all* healthy backends will be returned - assuming the client doing the query is smart enough to select the best one. When SRV records are returned, the endpoint DNS names are synthesized `endpoint-..` that carries the IP address. Querying for these @@ -55,8 +63,7 @@ traffic TO... This enabled the *traffic* plugin, with a default node ID of `coredns` and no TLS. * **TO...** are the control plane endpoints to bootstrap from. These must start with `grpc://`. The - port number defaults to 443, if not specified. These endpoint will be tried in the order given. - First successful connection will be used to resolve the management cluster `xds`. + port number defaults to 443, if not specified. These endpoints will be tried in the order given. The extended syntax is available if you want more control. @@ -66,7 +73,6 @@ traffic TO... { id ID tls CERT KEY CA tls_servername NAME - ignore_health } ~~~ @@ -93,8 +99,6 @@ traffic TO... { * `tls_servername` **NAME** allows you to set a server name in the TLS configuration. This is needed because *traffic* connects to an IP address, so it can't infer the server name from it. - * `ignore_health` can be enabled to ignore endpoint health status, this can aid in debugging. - ## Naming Clusters When a cluster is named this usually consists out of a single word, i.e. "cluster-v0", or "web". diff --git a/plugin/traffic/setup.go b/plugin/traffic/setup.go index 91cfaf0ba..dfe6b0ccb 100644 --- a/plugin/traffic/setup.go +++ b/plugin/traffic/setup.go @@ -61,8 +61,6 @@ func setup(c *caddy.Controller) error { i++ goto redo } - // err == nil, we are connected - break } }() metrics.MustRegister(c, xds.ClusterGauge) @@ -132,8 +130,6 @@ func parseTraffic(c *caddy.Controller) (*Traffic, error) { return nil, c.ArgErr() } tlsServerName = c.Val() - case "ignore_health": - t.health = true default: return nil, c.Errf("unknown property '%s'", c.Val()) } diff --git a/plugin/traffic/traffic.go b/plugin/traffic/traffic.go index f5a8407ca..fe866ed24 100644 --- a/plugin/traffic/traffic.go +++ b/plugin/traffic/traffic.go @@ -13,6 +13,7 @@ import ( "github.com/coredns/coredns/plugin/traffic/xds" "github.com/coredns/coredns/request" + corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" "github.com/miekg/dns" ) @@ -25,7 +26,6 @@ type Traffic struct { hosts []string id string - health bool origins []string Next plugin.Handler @@ -48,7 +48,8 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg m.SetReply(r) m.Authoritative = true - sockaddr, ok := t.c.Select(cluster, t.health) + healthy := state.QType() == dns.TypeTXT + sockaddr, ok := t.c.Select(cluster, healthy) if !ok { // ok this cluster doesn't exist, potentially due to extra labels, which may be garbage or legit queries: // legit is: @@ -69,14 +70,14 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg if strings.HasPrefix(strings.ToLower(labels[0]), "endpoint-") { // recheck if the cluster exist. cluster = labels[1] - sockaddr, ok = t.c.Select(cluster, t.health) + sockaddr, ok = t.c.Select(cluster, healthy) if !ok { m.Ns = soa(state.Zone) m.Rcode = dns.RcodeNameError w.WriteMsg(m) return 0, nil } - return t.serveEndpoint(ctx, state, labels[0], cluster) + return t.serveEndpoint(ctx, state, labels[0], cluster, healthy) } case 3: if strings.ToLower(labels[0]) != "_grpclb" || strings.ToLower(labels[1]) != "_tcp" { @@ -88,7 +89,7 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg // OK, _grcplb._tcp query; we need to return the endpoint for the mgmt cluster *NOT* the cluster // we got the query for. This should exist, but we'll check later anyway. cluster = t.mgmt - sockaddr, _ = t.c.Select(cluster, t.health) + sockaddr, _ = t.c.Select(cluster, healthy) break default: m.Ns = soa(state.Zone) @@ -120,7 +121,7 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg } m.Answer = []dns.RR{&dns.AAAA{Hdr: dns.RR_Header{Name: state.QName(), Rrtype: dns.TypeAAAA, Class: dns.ClassINET, Ttl: 5}, AAAA: sockaddr.Address()}} case dns.TypeSRV: - sockaddrs, _ := t.c.All(cluster, t.health) + sockaddrs, _ := t.c.All(cluster, true) m.Answer = make([]dns.RR, 0, len(sockaddrs)) m.Extra = make([]dns.RR, 0, len(sockaddrs)) for i, sa := range sockaddrs { @@ -136,6 +137,18 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg m.Extra = append(m.Extra, &dns.A{Hdr: dns.RR_Header{Name: target, Rrtype: dns.TypeA, Class: dns.ClassINET, Ttl: 5}, A: sa.Address()}) } } + case dns.TypeTXT: + sockaddrs, _ := t.c.All(cluster, false) + m.Answer = make([]dns.RR, 0, len(sockaddrs)) + m.Extra = make([]dns.RR, 0, len(sockaddrs)) + for i, sa := range sockaddrs { + target := fmt.Sprintf("endpoint-%d.%s.%s", i, cluster, state.Zone) + + m.Answer = append(m.Answer, &dns.TXT{ + Hdr: dns.RR_Header{Name: state.QName(), Rrtype: dns.TypeTXT, Class: dns.ClassINET, Ttl: 5}, + Txt: []string{"100", "100", strconv.Itoa(int(sa.Port())), target, corepb.HealthStatus_name[int32(sa.Health)]}}) + m.Extra = append(m.Extra, &dns.TXT{Hdr: dns.RR_Header{Name: target, Rrtype: dns.TypeTXT, Class: dns.ClassINET, Ttl: 5}, Txt: []string{sa.Address().String()}}) + } default: m.Ns = soa(state.Zone) } @@ -144,7 +157,7 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg return 0, nil } -func (t *Traffic) serveEndpoint(ctx context.Context, state request.Request, endpoint, cluster string) (int, error) { +func (t *Traffic) serveEndpoint(ctx context.Context, state request.Request, endpoint, cluster string, healthy bool) (int, error) { m := new(dns.Msg) m.SetReply(state.Req) m.Authoritative = true @@ -167,7 +180,7 @@ func (t *Traffic) serveEndpoint(ctx context.Context, state request.Request, endp return 0, nil } - sockaddrs, _ := t.c.All(cluster, t.health) + sockaddrs, _ := t.c.All(cluster, healthy) if len(sockaddrs) < nr { m.Ns = soa(state.Zone) m.Rcode = dns.RcodeNameError diff --git a/plugin/traffic/traffic_test.go b/plugin/traffic/traffic_test.go index a0c566161..cb9595d36 100644 --- a/plugin/traffic/traffic_test.go +++ b/plugin/traffic/traffic_test.go @@ -76,6 +76,16 @@ func TestTraffic(t *testing.T) { }, cluster: "web", qtype: dns.TypeA, rcode: dns.RcodeSuccess, answer: "127.0.0.2", }, + // unknown endpoint and healthy endpoint, TXT query + { + cla: &endpointpb.ClusterLoadAssignment{ + ClusterName: "web", + Endpoints: endpoints([]EndpointHealth{ + {"127.0.0.1", 18008, corepb.HealthStatus_UNKNOWN}, + }), + }, + cluster: "web", qtype: dns.TypeTXT, rcode: dns.RcodeSuccess, answer: "endpoint-0.web.lb.example.org.", + }, // SRV query healthy endpoint { cla: &endpointpb.ClusterLoadAssignment{ @@ -144,6 +154,8 @@ func TestTraffic(t *testing.T) { addr = x.AAAA.String() case *dns.SRV: addr = x.Target + case *dns.TXT: + addr = x.Txt[3] } if tc.answer != addr { t.Errorf("Test %d: Expected answer %s, but got %s", i, tc.answer, addr) diff --git a/plugin/traffic/xds/assignment.go b/plugin/traffic/xds/assignment.go index 5d127a0e9..cb80d9df4 100644 --- a/plugin/traffic/xds/assignment.go +++ b/plugin/traffic/xds/assignment.go @@ -9,9 +9,10 @@ import ( endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" ) -// SocketAddress holds a corepb.SocketAddress. +// SocketAddress holds a corepb.SocketAddress and a health status type SocketAddress struct { *corepb.SocketAddress + Health corepb.HealthStatus } // Address returns the address from s. @@ -71,38 +72,38 @@ func (a *assignment) clusters() []string { } // Select selects a endpoint from cluster load assignments, using weighted random selection. It only selects endpoints that are reporting healthy. -func (a *assignment) Select(cluster string, ignore bool) (*SocketAddress, bool) { +func (a *assignment) Select(cluster string, healthy bool) (*SocketAddress, bool) { cla := a.ClusterLoadAssignment(cluster) if cla == nil { return nil, false } weight := 0 - healthy := 0 + health := 0 for _, ep := range cla.Endpoints { for _, lb := range ep.GetLbEndpoints() { - if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { + if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { continue } weight += int(lb.GetLoadBalancingWeight().GetValue()) - healthy++ + health++ } } - if healthy == 0 { + if health == 0 { return nil, true } // all weights are 0, randomly select one of the endpoints, if weight == 0 { - r := rand.Intn(healthy) + r := rand.Intn(health) i := 0 for _, ep := range cla.Endpoints { for _, lb := range ep.GetLbEndpoints() { - if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { + if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { continue } if r == i { - return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress()}, true + return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress(), lb.GetHealthStatus()}, true } i++ } @@ -110,15 +111,15 @@ func (a *assignment) Select(cluster string, ignore bool) (*SocketAddress, bool) return nil, true } - r := rand.Intn(healthy) + 1 + r := rand.Intn(health) + 1 for _, ep := range cla.Endpoints { for _, lb := range ep.GetLbEndpoints() { - if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { + if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { continue } r -= int(lb.GetLoadBalancingWeight().GetValue()) if r <= 0 { - return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress()}, true + return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress(), lb.GetHealthStatus()}, true } } } @@ -126,7 +127,7 @@ func (a *assignment) Select(cluster string, ignore bool) (*SocketAddress, bool) } // All returns all healthy endpoints. -func (a *assignment) All(cluster string, ignore bool) ([]*SocketAddress, bool) { +func (a *assignment) All(cluster string, healthy bool) ([]*SocketAddress, bool) { cla := a.ClusterLoadAssignment(cluster) if cla == nil { return nil, false @@ -135,10 +136,10 @@ func (a *assignment) All(cluster string, ignore bool) ([]*SocketAddress, bool) { sa := []*SocketAddress{} for _, ep := range cla.Endpoints { for _, lb := range ep.GetLbEndpoints() { - if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { + if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY { continue } - sa = append(sa, &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress()}) + sa = append(sa, &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress(), lb.GetHealthStatus()}) } } return sa, true diff --git a/plugin/traffic/xds/client.go b/plugin/traffic/xds/client.go index 071bf8d08..46e980504 100644 --- a/plugin/traffic/xds/client.go +++ b/plugin/traffic/xds/client.go @@ -199,19 +199,19 @@ func (c *Client) receive(stream adsStream) error { // Select returns an address that is deemed to be the correct one for this cluster. The returned // boolean indicates if the cluster exists. -func (c *Client) Select(cluster string, ignore bool) (*SocketAddress, bool) { +func (c *Client) Select(cluster string, healty bool) (*SocketAddress, bool) { if cluster == "" { return nil, false } - return c.assignments.Select(cluster, ignore) + return c.assignments.Select(cluster, healty) } // All returns all endpoints. -func (c *Client) All(cluster string, ignore bool) ([]*SocketAddress, bool) { +func (c *Client) All(cluster string, healty bool) ([]*SocketAddress, bool) { if cluster == "" { return nil, false } - return c.assignments.All(cluster, ignore) + return c.assignments.All(cluster, healty) } // Locality holds the locality for this server. It contains a Region, Zone and SubZone.