fix(dnstap): Better error handling (redial & logging) when Dnstap is busy (#7619)

* Fix dnstap redial & improve logging

Signed-off-by: xyang378 <xyang378@bloomberg.net>

* fix CR comments

Signed-off-by: xyang378 <xyang378@bloomberg.net>

* redial at interval

Signed-off-by: xyang378 <xyang378@bloomberg.net>

* CR comments & lint
Signed-off-by: xyang378 <xyang378@bloomberg.net>

CR comment

* fix lint

Signed-off-by: xyang378 <xyang378@bloomberg.net>

---------

Signed-off-by: xyang378 <xyang378@bloomberg.net>
This commit is contained in:
Alicia Y
2025-11-06 21:11:08 +00:00
committed by GitHub
parent 18e70fcde6
commit 59afd4b65e
2 changed files with 198 additions and 57 deletions

View File

@@ -1,6 +1,7 @@
package dnstap
import (
"fmt"
"net"
"sync"
"testing"
@@ -10,6 +11,7 @@ import (
tap "github.com/dnstap/golang-dnstap"
fs "github.com/farsightsec/golang-framestream"
"github.com/stretchr/testify/require"
)
var (
@@ -17,6 +19,16 @@ var (
tmsg = tap.Dnstap{Type: &msgType}
)
// MockLogger is a test double that records warnings instead of emitting them.
// WarnCount is the number of Warningf calls made so far; WarnLog is the
// concatenation of every formatted message, in call order, with no separator.
type MockLogger struct {
	WarnCount int
	WarnLog   string
}

// Warningf formats the message with fmt.Sprintf, appends the result to
// WarnLog and increments WarnCount by one.
func (m *MockLogger) Warningf(format string, v ...any) {
	msg := fmt.Sprintf(format, v...)
	m.WarnLog = m.WarnLog + msg
	m.WarnCount++
}
func accept(t *testing.T, l net.Listener, count int) {
t.Helper()
server, err := l.Accept()
@@ -64,6 +76,7 @@ func TestTransport(t *testing.T) {
dio := newIO(param[0], l.Addr().String(), 1, 1)
dio.tcpTimeout = 10 * time.Millisecond
dio.flushTimeout = 30 * time.Millisecond
dio.errorCheckInterval = 50 * time.Millisecond
dio.connect()
dio.Dnstap(&tmsg)
@@ -93,6 +106,7 @@ func TestRace(t *testing.T) {
dio := newIO("tcp", l.Addr().String(), 1, 1)
dio.tcpTimeout = 10 * time.Millisecond
dio.flushTimeout = 30 * time.Millisecond
dio.errorCheckInterval = 50 * time.Millisecond
dio.connect()
defer dio.close()
@@ -108,12 +122,131 @@ func TestRace(t *testing.T) {
}
// TestReconnect exercises the dnstap I/O against a TCP listener that is
// either restarted after the first message or only started after connect()
// has already been called. Each subtest sends messageCount messages and then
// asserts that an encoder is set, that the queue is empty, and that fewer
// warnings than messages were recorded by the mock logger.
func TestReconnect(t *testing.T) {
	t.Run("ConnectedOnStart", func(t *testing.T) {
		// GIVEN
		// - a TCP listener is available before DnsTap starts up
		// - DnsTap successfully establishes its output connection on start up
		l, err := reuseport.Listen("tcp", ":0")
		if err != nil {
			t.Fatalf("Cannot start listener: %s", err)
		}

		var wg sync.WaitGroup
		wg.Add(1)
		go func() {
			accept(t, l, 1)
			wg.Done()
		}()

		addr := l.Addr().String()
		logger := MockLogger{}
		dio := newIO("tcp", addr, 1, 1)
		dio.tcpTimeout = 10 * time.Millisecond
		dio.flushTimeout = 30 * time.Millisecond
		dio.errorCheckInterval = 50 * time.Millisecond
		dio.logger = &logger
		dio.connect()
		defer dio.close()

		// WHEN
		// - the TCP listener is closed while DnsTap is still running
		// - a TCP listener starts again on the same port
		// - DnsTap sends multiple messages
		dio.Dnstap(&tmsg)
		wg.Wait()

		// Close listener
		l.Close()
		// And start TCP listener again on the same port
		l, err = reuseport.Listen("tcp", addr)
		if err != nil {
			t.Fatalf("Cannot start listener: %s", err)
		}
		defer l.Close()

		wg.Add(1)
		go func() {
			accept(t, l, 1)
			wg.Done()
		}()

		messageCount := 5
		for range messageCount {
			// Space the messages out so they span several
			// errorCheckInterval periods (50ms each).
			time.Sleep(100 * time.Millisecond)
			dio.Dnstap(&tmsg)
		}
		wg.Wait()

		// THEN
		// - DnsTap is able to reconnect
		// - messages can be sent eventually
		require.NotNil(t, dio.enc)
		require.Equal(t, 0, len(dio.queue))
		require.Less(t, logger.WarnCount, messageCount)
	})

	t.Run("NotConnectedOnStart", func(t *testing.T) {
		// GIVEN
		// - no TCP listener is available when DnsTap starts up
		l, err := reuseport.Listen("tcp", ":0")
		if err != nil {
			t.Fatalf("Cannot start listener: %s", err)
		}
		// Close right away: the listener is only opened to obtain an
		// address, which is reused for the second Listen below.
		l.Close()

		logger := MockLogger{}
		addr := l.Addr().String()
		dio := newIO("tcp", addr, 1, 1)
		dio.tcpTimeout = 10 * time.Millisecond
		dio.flushTimeout = 30 * time.Millisecond
		dio.errorCheckInterval = 50 * time.Millisecond
		dio.logger = &logger
		dio.connect()
		defer dio.close()

		// WHEN
		// - DnsTap is already running
		// - a TCP listener starts on DnsTap's configured port
		// - DnsTap sends multiple messages
		dio.Dnstap(&tmsg)

		l, err = reuseport.Listen("tcp", addr)
		if err != nil {
			t.Fatalf("Cannot start listener: %s", err)
		}
		defer l.Close()

		var wg sync.WaitGroup
		wg.Add(1)
		messageCount := 5
		go func() {
			accept(t, l, messageCount)
			wg.Done()
		}()

		for range messageCount {
			time.Sleep(100 * time.Millisecond)
			dio.Dnstap(&tmsg)
		}
		wg.Wait()

		// THEN
		// - DnsTap is able to reconnect
		// - messages can be sent eventually
		require.NotNil(t, dio.enc)
		require.Equal(t, 0, len(dio.queue))
		require.Less(t, logger.WarnCount, messageCount)
	})
}
func TestFullQueueWriteFail(t *testing.T) {
// GIVEN
// DnsTap I/O with a small queue
l, err := reuseport.Listen("unix", "dn2stap.sock")
if err != nil {
t.Fatalf("Cannot start listener: %s", err)
}
defer l.Close()
var wg sync.WaitGroup
wg.Add(1)
@@ -122,35 +255,25 @@ func TestReconnect(t *testing.T) {
wg.Done()
}()
addr := l.Addr().String()
dio := newIO("tcp", addr, 1, 1)
dio.tcpTimeout = 10 * time.Millisecond
dio.flushTimeout = 30 * time.Millisecond
logger := MockLogger{}
dio := newIO("unix", l.Addr().String(), 1, 1)
dio.flushTimeout = 500 * time.Millisecond
dio.errorCheckInterval = 50 * time.Millisecond
dio.logger = &logger
dio.queue = make(chan *tap.Dnstap, 1)
dio.connect()
defer dio.close()
dio.Dnstap(&tmsg)
wg.Wait()
// Close listener
l.Close()
// And start TCP listener again on the same port
l, err = reuseport.Listen("tcp", addr)
if err != nil {
t.Fatalf("Cannot start listener: %s", err)
}
defer l.Close()
wg.Add(1)
go func() {
accept(t, l, 1)
wg.Done()
}()
// WHEN
// messages overwhelm the queue
count := 100
for range count {
time.Sleep(100 * time.Millisecond)
dio.Dnstap(&tmsg)
}
wg.Wait()
// THEN
// Dropped messages are logged
require.NotEqual(t, 0, logger.WarnCount)
require.Contains(t, logger.WarnLog, "Dropped dnstap messages")
}