Implement debugging with TXT records

Signed-off-by: Miek Gieben <miek@miek.nl>
This commit is contained in:
Miek Gieben
2020-03-06 09:13:27 +01:00
parent deb582259a
commit b00ff13ca0
6 changed files with 64 additions and 38 deletions

View File

@@ -27,13 +27,21 @@ the *management cluster* (see `cluster` below in "Syntax"). By default the name
When bootstrapping *traffic* tries to retrieve the cluster endpoints for the management cluster,
when the cluster is not found *traffic* will return a fatal error.
The *traffic* plugin handles A, AAAA and SRV queries. Queries for non-existent clusters get a
NXDOMAIN, where the minimal TTL is also set to 5s.
The *traffic* plugin handles A, AAAA, SRV and TXT queries. TXT queries are purely used for debugging
as health status of the endpoints is ignored in that case.
Queries for non-existent clusters get a NXDOMAIN, where the minimal TTL is also set to 5s.
For A and AAAA queries each DNS response contains a single IP address that's considered the best
one. The TTL on these answer is set to 5s. It will only return successful responses either with an
answer or, otherwise, a NODATA response.
TXT replies will use the SRV record format augmented with the health status of each backend, as this
is useful for debugging.
~~~
web.lb.example.org. 5 IN TXT "100" "100" "18008" "endpoint-0.web.lb.example.org." "HEALTHY"
~~~
For SRV queries *all* healthy backends will be returned - assuming the client doing the query
is smart enough to select the best one. When SRV records are returned, the endpoint DNS names
are synthesized `endpoint-<N>.<cluster>.<zone>` that carries the IP address. Querying for these
@@ -55,8 +63,7 @@ traffic TO...
This enabled the *traffic* plugin, with a default node ID of `coredns` and no TLS.
* **TO...** are the control plane endpoints to bootstrap from. These must start with `grpc://`. The
port number defaults to 443, if not specified. These endpoint will be tried in the order given.
First successful connection will be used to resolve the management cluster `xds`.
port number defaults to 443, if not specified. These endpoints will be tried in the order given.
The extended syntax is available if you want more control.
@@ -66,7 +73,6 @@ traffic TO... {
id ID
tls CERT KEY CA
tls_servername NAME
ignore_health
}
~~~
@@ -93,8 +99,6 @@ traffic TO... {
* `tls_servername` **NAME** allows you to set a server name in the TLS configuration. This is
needed because *traffic* connects to an IP address, so it can't infer the server name from it.
* `ignore_health` can be enabled to ignore endpoint health status, this can aid in debugging.
## Naming Clusters
When a cluster is named this usually consists out of a single word, i.e. "cluster-v0", or "web".

View File

@@ -61,8 +61,6 @@ func setup(c *caddy.Controller) error {
i++
goto redo
}
// err == nil, we are connected
break
}
}()
metrics.MustRegister(c, xds.ClusterGauge)
@@ -132,8 +130,6 @@ func parseTraffic(c *caddy.Controller) (*Traffic, error) {
return nil, c.ArgErr()
}
tlsServerName = c.Val()
case "ignore_health":
t.health = true
default:
return nil, c.Errf("unknown property '%s'", c.Val())
}

View File

@@ -13,6 +13,7 @@ import (
"github.com/coredns/coredns/plugin/traffic/xds"
"github.com/coredns/coredns/request"
corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
"github.com/miekg/dns"
)
@@ -25,7 +26,6 @@ type Traffic struct {
hosts []string
id string
health bool
origins []string
Next plugin.Handler
@@ -48,7 +48,8 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
m.SetReply(r)
m.Authoritative = true
sockaddr, ok := t.c.Select(cluster, t.health)
healthy := state.QType() == dns.TypeTXT
sockaddr, ok := t.c.Select(cluster, healthy)
if !ok {
// ok this cluster doesn't exist, potentially due to extra labels, which may be garbage or legit queries:
// legit is:
@@ -69,14 +70,14 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
if strings.HasPrefix(strings.ToLower(labels[0]), "endpoint-") {
// recheck if the cluster exist.
cluster = labels[1]
sockaddr, ok = t.c.Select(cluster, t.health)
sockaddr, ok = t.c.Select(cluster, healthy)
if !ok {
m.Ns = soa(state.Zone)
m.Rcode = dns.RcodeNameError
w.WriteMsg(m)
return 0, nil
}
return t.serveEndpoint(ctx, state, labels[0], cluster)
return t.serveEndpoint(ctx, state, labels[0], cluster, healthy)
}
case 3:
if strings.ToLower(labels[0]) != "_grpclb" || strings.ToLower(labels[1]) != "_tcp" {
@@ -88,7 +89,7 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
// OK, _grcplb._tcp query; we need to return the endpoint for the mgmt cluster *NOT* the cluster
// we got the query for. This should exist, but we'll check later anyway.
cluster = t.mgmt
sockaddr, _ = t.c.Select(cluster, t.health)
sockaddr, _ = t.c.Select(cluster, healthy)
break
default:
m.Ns = soa(state.Zone)
@@ -120,7 +121,7 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
}
m.Answer = []dns.RR{&dns.AAAA{Hdr: dns.RR_Header{Name: state.QName(), Rrtype: dns.TypeAAAA, Class: dns.ClassINET, Ttl: 5}, AAAA: sockaddr.Address()}}
case dns.TypeSRV:
sockaddrs, _ := t.c.All(cluster, t.health)
sockaddrs, _ := t.c.All(cluster, true)
m.Answer = make([]dns.RR, 0, len(sockaddrs))
m.Extra = make([]dns.RR, 0, len(sockaddrs))
for i, sa := range sockaddrs {
@@ -136,6 +137,18 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
m.Extra = append(m.Extra, &dns.A{Hdr: dns.RR_Header{Name: target, Rrtype: dns.TypeA, Class: dns.ClassINET, Ttl: 5}, A: sa.Address()})
}
}
case dns.TypeTXT:
sockaddrs, _ := t.c.All(cluster, false)
m.Answer = make([]dns.RR, 0, len(sockaddrs))
m.Extra = make([]dns.RR, 0, len(sockaddrs))
for i, sa := range sockaddrs {
target := fmt.Sprintf("endpoint-%d.%s.%s", i, cluster, state.Zone)
m.Answer = append(m.Answer, &dns.TXT{
Hdr: dns.RR_Header{Name: state.QName(), Rrtype: dns.TypeTXT, Class: dns.ClassINET, Ttl: 5},
Txt: []string{"100", "100", strconv.Itoa(int(sa.Port())), target, corepb.HealthStatus_name[int32(sa.Health)]}})
m.Extra = append(m.Extra, &dns.TXT{Hdr: dns.RR_Header{Name: target, Rrtype: dns.TypeTXT, Class: dns.ClassINET, Ttl: 5}, Txt: []string{sa.Address().String()}})
}
default:
m.Ns = soa(state.Zone)
}
@@ -144,7 +157,7 @@ func (t *Traffic) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
return 0, nil
}
func (t *Traffic) serveEndpoint(ctx context.Context, state request.Request, endpoint, cluster string) (int, error) {
func (t *Traffic) serveEndpoint(ctx context.Context, state request.Request, endpoint, cluster string, healthy bool) (int, error) {
m := new(dns.Msg)
m.SetReply(state.Req)
m.Authoritative = true
@@ -167,7 +180,7 @@ func (t *Traffic) serveEndpoint(ctx context.Context, state request.Request, endp
return 0, nil
}
sockaddrs, _ := t.c.All(cluster, t.health)
sockaddrs, _ := t.c.All(cluster, healthy)
if len(sockaddrs) < nr {
m.Ns = soa(state.Zone)
m.Rcode = dns.RcodeNameError

View File

@@ -76,6 +76,16 @@ func TestTraffic(t *testing.T) {
},
cluster: "web", qtype: dns.TypeA, rcode: dns.RcodeSuccess, answer: "127.0.0.2",
},
// unknown endpoint and healthy endpoint, TXT query
{
cla: &endpointpb.ClusterLoadAssignment{
ClusterName: "web",
Endpoints: endpoints([]EndpointHealth{
{"127.0.0.1", 18008, corepb.HealthStatus_UNKNOWN},
}),
},
cluster: "web", qtype: dns.TypeTXT, rcode: dns.RcodeSuccess, answer: "endpoint-0.web.lb.example.org.",
},
// SRV query healthy endpoint
{
cla: &endpointpb.ClusterLoadAssignment{
@@ -144,6 +154,8 @@ func TestTraffic(t *testing.T) {
addr = x.AAAA.String()
case *dns.SRV:
addr = x.Target
case *dns.TXT:
addr = x.Txt[3]
}
if tc.answer != addr {
t.Errorf("Test %d: Expected answer %s, but got %s", i, tc.answer, addr)

View File

@@ -9,9 +9,10 @@ import (
endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
)
// SocketAddress holds a corepb.SocketAddress.
// SocketAddress holds a corepb.SocketAddress and a health status
type SocketAddress struct {
*corepb.SocketAddress
Health corepb.HealthStatus
}
// Address returns the address from s.
@@ -71,38 +72,38 @@ func (a *assignment) clusters() []string {
}
// Select selects a endpoint from cluster load assignments, using weighted random selection. It only selects endpoints that are reporting healthy.
func (a *assignment) Select(cluster string, ignore bool) (*SocketAddress, bool) {
func (a *assignment) Select(cluster string, healthy bool) (*SocketAddress, bool) {
cla := a.ClusterLoadAssignment(cluster)
if cla == nil {
return nil, false
}
weight := 0
healthy := 0
health := 0
for _, ep := range cla.Endpoints {
for _, lb := range ep.GetLbEndpoints() {
if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
continue
}
weight += int(lb.GetLoadBalancingWeight().GetValue())
healthy++
health++
}
}
if healthy == 0 {
if health == 0 {
return nil, true
}
// all weights are 0, randomly select one of the endpoints,
if weight == 0 {
r := rand.Intn(healthy)
r := rand.Intn(health)
i := 0
for _, ep := range cla.Endpoints {
for _, lb := range ep.GetLbEndpoints() {
if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
continue
}
if r == i {
return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress()}, true
return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress(), lb.GetHealthStatus()}, true
}
i++
}
@@ -110,15 +111,15 @@ func (a *assignment) Select(cluster string, ignore bool) (*SocketAddress, bool)
return nil, true
}
r := rand.Intn(healthy) + 1
r := rand.Intn(health) + 1
for _, ep := range cla.Endpoints {
for _, lb := range ep.GetLbEndpoints() {
if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
continue
}
r -= int(lb.GetLoadBalancingWeight().GetValue())
if r <= 0 {
return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress()}, true
return &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress(), lb.GetHealthStatus()}, true
}
}
}
@@ -126,7 +127,7 @@ func (a *assignment) Select(cluster string, ignore bool) (*SocketAddress, bool)
}
// All returns all healthy endpoints.
func (a *assignment) All(cluster string, ignore bool) ([]*SocketAddress, bool) {
func (a *assignment) All(cluster string, healthy bool) ([]*SocketAddress, bool) {
cla := a.ClusterLoadAssignment(cluster)
if cla == nil {
return nil, false
@@ -135,10 +136,10 @@ func (a *assignment) All(cluster string, ignore bool) ([]*SocketAddress, bool) {
sa := []*SocketAddress{}
for _, ep := range cla.Endpoints {
for _, lb := range ep.GetLbEndpoints() {
if !ignore && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
if healthy && lb.GetHealthStatus() != corepb.HealthStatus_HEALTHY {
continue
}
sa = append(sa, &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress()})
sa = append(sa, &SocketAddress{lb.GetEndpoint().GetAddress().GetSocketAddress(), lb.GetHealthStatus()})
}
}
return sa, true

View File

@@ -199,19 +199,19 @@ func (c *Client) receive(stream adsStream) error {
// Select returns an address that is deemed to be the correct one for this cluster. The returned
// boolean indicates if the cluster exists.
func (c *Client) Select(cluster string, ignore bool) (*SocketAddress, bool) {
func (c *Client) Select(cluster string, healty bool) (*SocketAddress, bool) {
if cluster == "" {
return nil, false
}
return c.assignments.Select(cluster, ignore)
return c.assignments.Select(cluster, healty)
}
// All returns all endpoints.
func (c *Client) All(cluster string, ignore bool) ([]*SocketAddress, bool) {
func (c *Client) All(cluster string, healty bool) ([]*SocketAddress, bool) {
if cluster == "" {
return nil, false
}
return c.assignments.All(cluster, ignore)
return c.assignments.All(cluster, healty)
}
// Locality holds the locality for this server. It contains a Region, Zone and SubZone.