fix(dnstap): Better error handling (redial & logging) when Dnstap is busy (#7619)

* Fix dnstap redial & improve logging

Signed-off-by: xyang378 <xyang378@bloomberg.net>

* fix CR comments

Signed-off-by: xyang378 <xyang378@bloomberg.net>

* redial at interval

Signed-off-by: xyang378 <xyang378@bloomberg.net>

* CR comments & lint
Signed-off-by: xyang378 <xyang378@bloomberg.net>

CR comment

* fix lint

Signed-off-by: xyang378 <xyang378@bloomberg.net>

---------

Signed-off-by: xyang378 <xyang378@bloomberg.net>
This commit is contained in:
Alicia Y
2025-11-06 21:11:08 +00:00
committed by GitHub
parent 18e70fcde6
commit 59afd4b65e
2 changed files with 198 additions and 57 deletions

View File

@@ -2,6 +2,7 @@ package dnstap
import (
"crypto/tls"
"errors"
"net"
"sync/atomic"
"time"
@@ -15,6 +16,7 @@ const (
tcpTimeout = 4 * time.Second
flushTimeout = 1 * time.Second
errorCheckInterval = 10 * time.Second
skipVerify = false // by default, every tls connection is verified to be secure
)
@@ -24,6 +26,10 @@ type tapper interface {
Dnstap(*tap.Dnstap)
}
type WarnLogger interface {
Warningf(format string, v ...any)
}
// dio implements the Tapper interface.
type dio struct {
endpoint string
@@ -36,8 +42,12 @@ type dio struct {
tcpTimeout time.Duration
skipVerify bool
tcpWriteBufSize int
logger WarnLogger
errorCheckInterval time.Duration
}
var errNoOutput = errors.New("dnstap not connected to output socket")
// newIO returns a new and initialized pointer to a dio.
func newIO(proto, endpoint string, multipleQueue int, multipleTcpWriteBuf int) *dio {
return &dio{
@@ -49,6 +59,8 @@ func newIO(proto, endpoint string, multipleQueue int, multipleTcpWriteBuf int) *
tcpTimeout: tcpTimeout,
skipVerify: skipVerify,
tcpWriteBufSize: multipleTcpWriteBuf * tcpWriteBufSize,
logger: log,
errorCheckInterval: errorCheckInterval,
}
}
@@ -104,21 +116,21 @@ func (d *dio) close() { close(d.quit) }
func (d *dio) write(payload *tap.Dnstap) error {
if d.enc == nil {
atomic.AddUint32(&d.dropped, 1)
return nil
return errNoOutput
}
if err := d.enc.writeMsg(payload); err != nil {
atomic.AddUint32(&d.dropped, 1)
return err
}
return nil
}
func (d *dio) serve() {
timeout := time.NewTimer(d.flushTimeout)
defer timeout.Stop()
flushTicker := time.NewTicker(d.flushTimeout)
errorCheckTicker := time.NewTicker(d.errorCheckInterval)
defer flushTicker.Stop()
defer errorCheckTicker.Stop()
for {
timeout.Reset(d.flushTimeout)
select {
case <-d.quit:
if d.enc == nil {
@@ -129,16 +141,22 @@ func (d *dio) serve() {
return
case payload := <-d.queue:
if err := d.write(payload); err != nil {
atomic.AddUint32(&d.dropped, 1)
if !errors.Is(err, errNoOutput) {
// Redial immediately if it's not an output connection error
d.dial()
}
case <-timeout.C:
}
case <-flushTicker.C:
if d.enc != nil {
d.enc.flush()
}
case <-errorCheckTicker.C:
if dropped := atomic.SwapUint32(&d.dropped, 0); dropped > 0 {
log.Warningf("Dropped dnstap messages: %d", dropped)
d.logger.Warningf("Dropped dnstap messages: %d\n", dropped)
}
if d.enc == nil {
d.dial()
} else {
d.enc.flush()
}
}
}

View File

@@ -1,6 +1,7 @@
package dnstap
import (
"fmt"
"net"
"sync"
"testing"
@@ -10,6 +11,7 @@ import (
tap "github.com/dnstap/golang-dnstap"
fs "github.com/farsightsec/golang-framestream"
"github.com/stretchr/testify/require"
)
var (
@@ -17,6 +19,16 @@ var (
tmsg = tap.Dnstap{Type: &msgType}
)
type MockLogger struct {
WarnCount int
WarnLog string
}
func (l *MockLogger) Warningf(format string, v ...any) {
l.WarnCount++
l.WarnLog += fmt.Sprintf(format, v...)
}
func accept(t *testing.T, l net.Listener, count int) {
t.Helper()
server, err := l.Accept()
@@ -64,6 +76,7 @@ func TestTransport(t *testing.T) {
dio := newIO(param[0], l.Addr().String(), 1, 1)
dio.tcpTimeout = 10 * time.Millisecond
dio.flushTimeout = 30 * time.Millisecond
dio.errorCheckInterval = 50 * time.Millisecond
dio.connect()
dio.Dnstap(&tmsg)
@@ -93,6 +106,7 @@ func TestRace(t *testing.T) {
dio := newIO("tcp", l.Addr().String(), 1, 1)
dio.tcpTimeout = 10 * time.Millisecond
dio.flushTimeout = 30 * time.Millisecond
dio.errorCheckInterval = 50 * time.Millisecond
dio.connect()
defer dio.close()
@@ -108,8 +122,10 @@ func TestRace(t *testing.T) {
}
func TestReconnect(t *testing.T) {
count := 5
t.Run("ConnectedOnStart", func(t *testing.T) {
// GIVEN
// TCP connection available before DnsTap start up
// DnsTap successfully established output connection on start up
l, err := reuseport.Listen("tcp", ":0")
if err != nil {
t.Fatalf("Cannot start listener: %s", err)
@@ -123,14 +139,20 @@ func TestReconnect(t *testing.T) {
}()
addr := l.Addr().String()
logger := MockLogger{}
dio := newIO("tcp", addr, 1, 1)
dio.tcpTimeout = 10 * time.Millisecond
dio.flushTimeout = 30 * time.Millisecond
dio.errorCheckInterval = 50 * time.Millisecond
dio.logger = &logger
dio.connect()
defer dio.close()
// WHEN
// TCP connection closed when DnsTap is still running
// TCP listener starts again on the same port
// DnsTap send multiple messages
dio.Dnstap(&tmsg)
wg.Wait()
// Close listener
@@ -148,9 +170,110 @@ func TestReconnect(t *testing.T) {
wg.Done()
}()
for range count {
messageCount := 5
for range messageCount {
time.Sleep(100 * time.Millisecond)
dio.Dnstap(&tmsg)
}
wg.Wait()
// THEN
// DnsTap is able to reconnect
// Messages can be sent eventually
require.NotNil(t, dio.enc)
require.Equal(t, 0, len(dio.queue))
require.Less(t, logger.WarnCount, messageCount)
})
t.Run("NotConnectedOnStart", func(t *testing.T) {
// GIVEN
// No TCP connection established at DnsTap start up
l, err := reuseport.Listen("tcp", ":0")
if err != nil {
t.Fatalf("Cannot start listener: %s", err)
}
l.Close()
logger := MockLogger{}
addr := l.Addr().String()
dio := newIO("tcp", addr, 1, 1)
dio.tcpTimeout = 10 * time.Millisecond
dio.flushTimeout = 30 * time.Millisecond
dio.errorCheckInterval = 50 * time.Millisecond
dio.logger = &logger
dio.connect()
defer dio.close()
// WHEN
// DnsTap is already running
// TCP listener starts on DnsTap's configured port
// DnsTap send multiple messages
dio.Dnstap(&tmsg)
l, err = reuseport.Listen("tcp", addr)
if err != nil {
t.Fatalf("Cannot start listener: %s", err)
}
defer l.Close()
var wg sync.WaitGroup
wg.Add(1)
messageCount := 5
go func() {
accept(t, l, messageCount)
wg.Done()
}()
for range messageCount {
time.Sleep(100 * time.Millisecond)
dio.Dnstap(&tmsg)
}
wg.Wait()
// THEN
// DnsTap is able to reconnect
// Messages can be sent eventually
require.NotNil(t, dio.enc)
require.Equal(t, 0, len(dio.queue))
require.Less(t, logger.WarnCount, messageCount)
})
}
func TestFullQueueWriteFail(t *testing.T) {
// GIVEN
// DnsTap I/O with a small queue
l, err := reuseport.Listen("unix", "dn2stap.sock")
if err != nil {
t.Fatalf("Cannot start listener: %s", err)
}
defer l.Close()
var wg sync.WaitGroup
wg.Add(1)
go func() {
accept(t, l, 1)
wg.Done()
}()
logger := MockLogger{}
dio := newIO("unix", l.Addr().String(), 1, 1)
dio.flushTimeout = 500 * time.Millisecond
dio.errorCheckInterval = 50 * time.Millisecond
dio.logger = &logger
dio.queue = make(chan *tap.Dnstap, 1)
dio.connect()
defer dio.close()
// WHEN
// messages overwhelms the queue
count := 100
for range count {
dio.Dnstap(&tmsg)
}
wg.Wait()
// THEN
// Dropped messages are logged
require.NotEqual(t, 0, logger.WarnCount)
require.Contains(t, logger.WarnLog, "Dropped dnstap messages")
}