kevo/pkg/replication/heartbeat_test.go
2025-04-29 15:03:03 -06:00

492 lines
12 KiB
Go

package replication
import (
"context"
"fmt"
"io"
"os"
"os/exec"
"sync"
"testing"
"time"
"github.com/KevoDB/kevo/pkg/config"
proto "github.com/KevoDB/kevo/pkg/replication/proto"
"github.com/KevoDB/kevo/pkg/wal"
"google.golang.org/grpc"
"google.golang.org/grpc/metadata"
)
// createTestWAL builds a WAL for use in tests, rooted at the relative
// directory "test-data-wal". It panics if either the directory or the WAL
// cannot be created.
func createTestWAL() *wal.WAL {
	const walDir = "test-data-wal"

	// Start from the default configuration and point it at the test
	// directory. SyncNone is used for faster tests.
	walCfg := config.NewDefaultConfig("test-data")
	walCfg.WALDir = walDir
	walCfg.WALSyncMode = config.SyncNone

	// Ensure the directory exists before opening the WAL.
	if mkErr := os.MkdirAll(walDir, 0755); mkErr != nil {
		panic(fmt.Sprintf("Failed to create test directory: %v", mkErr))
	}

	testWAL, openErr := wal.NewWAL(walCfg, walDir)
	if openErr != nil {
		panic(fmt.Sprintf("Failed to create test WAL: %v", openErr))
	}
	return testWAL
}
// mockStreamServer implements WALReplicationService_StreamWALServer for
// testing. Every response passed to Send is recorded so tests can inspect
// what was sent.
type mockStreamServer struct {
	grpc.ServerStream

	// ctx is the value returned by Context.
	ctx context.Context

	// mu guards sentMsgs and closed.
	mu       sync.Mutex
	sentMsgs []*proto.WALStreamResponse // responses recorded by Send, in order
	closed   bool                       // once true, Send returns context.Canceled

	// sendChannel receives a best-effort signal after each recorded Send so
	// that waitForMessages can wake up early.
	sendChannel chan struct{}
}
// newMockStream returns an open mock stream with a background context, an
// empty message log, and a notification channel buffered for 100 signals.
func newMockStream() *mockStreamServer {
	stream := new(mockStreamServer)
	stream.ctx = context.Background()
	stream.sentMsgs = []*proto.WALStreamResponse{}
	stream.sendChannel = make(chan struct{}, 100)
	return stream
}
// Send records response in the message log and signals sendChannel.
// It returns context.Canceled, without recording anything, once the stream
// has been closed via close.
func (m *mockStreamServer) Send(response *proto.WALStreamResponse) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	if m.closed {
		return context.Canceled
	}

	m.sentMsgs = append(m.sentMsgs, response)

	// Best-effort wake-up for waiters: if the channel buffer is full the
	// signal is dropped rather than blocking the sender.
	select {
	case m.sendChannel <- struct{}{}:
		return nil
	default:
		return nil
	}
}
// Context returns the context stored on the mock stream (m.ctx).
func (m *mockStreamServer) Context() context.Context {
return m.ctx
}
// Additional methods to satisfy the gRPC stream interfaces.

// SendMsg forwards msg to Send when it is a *proto.WALStreamResponse.
// Values of any other type are ignored and nil is returned.
func (m *mockStreamServer) SendMsg(msg interface{}) error {
	resp, isResponse := msg.(*proto.WALStreamResponse)
	if !isResponse {
		return nil
	}
	return m.Send(resp)
}
// RecvMsg always returns io.EOF; the mock never delivers an incoming message
// and leaves msg untouched.
func (m *mockStreamServer) RecvMsg(msg interface{}) error {
return io.EOF
}
// SetHeader is a no-op on the mock: the metadata is discarded and nil is
// returned.
func (m *mockStreamServer) SetHeader(metadata.MD) error {
return nil
}
// SendHeader is a no-op on the mock: the metadata is discarded and nil is
// returned.
func (m *mockStreamServer) SendHeader(metadata.MD) error {
return nil
}
// SetTrailer is a no-op on the mock: the metadata is discarded.
func (m *mockStreamServer) SetTrailer(metadata.MD) {
}
// getSentMessages returns a snapshot of the responses recorded by Send so
// far, in the order they were sent.
//
// The returned slice is a copy with its own backing array: callers may
// iterate over it after the lock is released without sharing storage with
// the log that Send keeps appending to. The response pointers themselves are
// shared, not deep-copied.
func (m *mockStreamServer) getSentMessages() []*proto.WALStreamResponse {
	m.mu.Lock()
	defer m.mu.Unlock()

	snapshot := make([]*proto.WALStreamResponse, len(m.sentMsgs))
	copy(snapshot, m.sentMsgs)
	return snapshot
}
// getMessageCount reports how many responses Send has recorded so far.
func (m *mockStreamServer) getMessageCount() int {
	m.mu.Lock()
	recorded := len(m.sentMsgs)
	m.mu.Unlock()
	return recorded
}
// close marks the stream as closed; every later Send returns
// context.Canceled.
func (m *mockStreamServer) close() {
	m.mu.Lock()
	m.closed = true
	m.mu.Unlock()
}
// waitForMessages blocks until at least count responses have been recorded
// or timeout has elapsed, and reports whether the count was reached.
//
// The count is checked before every wait, including once up front and once
// after the final wake-up, so a message that arrives just before the
// deadline is still observed and a zero or negative timeout still performs a
// single check. Each wait is capped at the time remaining, so the call does
// not overshoot the deadline by a full polling interval.
func (m *mockStreamServer) waitForMessages(count int, timeout time.Duration) bool {
	const pollInterval = 10 * time.Millisecond

	deadline := time.Now().Add(timeout)
	for {
		if m.getMessageCount() >= count {
			return true
		}

		remaining := time.Until(deadline)
		if remaining <= 0 {
			return false
		}

		wait := pollInterval
		if remaining < wait {
			wait = remaining
		}

		select {
		case <-m.sendChannel:
			// A message was recorded; loop around and re-check the count.
		case <-time.After(wait):
			// Periodic re-check in case a send signal was dropped.
		}
	}
}
// TestHeartbeatSend verifies that heartbeats are sent at the configured
// interval and that each heartbeat is an empty, uncompressed response using
// the NONE codec.
//
// The test is currently skipped unconditionally (see the t.Skip call); the
// body is kept so it can be re-enabled.
func TestHeartbeatSend(t *testing.T) {
	t.Skip("Skipping due to timing issues in CI environment")

	// Create a test WAL.
	mockWal := createTestWAL()
	// Deferred calls run last-in-first-out, so the directory cleanup is
	// registered first: it then runs only after the WAL has been closed.
	defer cleanupTestData(t)
	defer mockWal.Close()

	// Use a fast heartbeat interval with a longer timeout. The local is named
	// cfg so it does not shadow the imported config package.
	cfg := DefaultPrimaryConfig()
	cfg.HeartbeatConfig = &HeartbeatConfig{
		Interval:           50 * time.Millisecond,  // Very fast interval for tests
		Timeout:            500 * time.Millisecond, // Longer timeout
		SendEmptyResponses: true,
	}

	// Create the primary.
	primary, err := NewPrimary(mockWal, cfg)
	if err != nil {
		t.Fatalf("Failed to create primary: %v", err)
	}
	defer primary.Close()

	// Register a session whose last activity is slightly stale.
	mockStream := newMockStream()
	session := &ReplicaSession{
		ID:              "test-session",
		StartSequence:   0,
		Stream:          mockStream,
		LastAckSequence: 0,
		SupportedCodecs: []proto.CompressionCodec{proto.CompressionCodec_NONE},
		Connected:       true,
		Active:          true,
		LastActivity:    time.Now().Add(-100 * time.Millisecond), // Set as slightly stale
	}
	primary.registerReplicaSession(session)

	// Wait for at least one heartbeat to arrive on the mock stream.
	if !mockStream.waitForMessages(1, 1*time.Second) {
		t.Fatalf("Expected at least 1 heartbeat, got %d", mockStream.getMessageCount())
	}

	// Every recorded message must look like a heartbeat.
	messages := mockStream.getSentMessages()
	for i, msg := range messages {
		if len(msg.Entries) != 0 {
			t.Errorf("Expected empty entries in heartbeat %d, got %d entries", i, len(msg.Entries))
		}
		if msg.Compressed {
			t.Errorf("Expected uncompressed heartbeat %d", i)
		}
		if msg.Codec != proto.CompressionCodec_NONE {
			t.Errorf("Expected NONE codec in heartbeat %d, got %v", i, msg.Codec)
		}
	}
}
// TestHeartbeatTimeout verifies that a session whose stream rejects sends and
// whose last activity is older than the heartbeat timeout is removed from the
// primary.
func TestHeartbeatTimeout(t *testing.T) {
	// Create a test WAL.
	mockWal := createTestWAL()
	// Deferred calls run last-in-first-out, so the directory cleanup is
	// registered first: it then runs only after the WAL has been closed.
	defer cleanupTestData(t)
	defer mockWal.Close()

	// Use a fast interval and a short timeout. The local is named cfg so it
	// does not shadow the imported config package.
	cfg := DefaultPrimaryConfig()
	cfg.HeartbeatConfig = &HeartbeatConfig{
		Interval:           50 * time.Millisecond,  // Fast interval for tests
		Timeout:            150 * time.Millisecond, // Short timeout for tests
		SendEmptyResponses: true,
	}

	// Create the primary.
	primary, err := NewPrimary(mockWal, cfg)
	if err != nil {
		t.Fatalf("Failed to create primary: %v", err)
	}
	defer primary.Close()

	// Create a mock stream that rejects messages: after close, Send returns
	// an error.
	mockStream := newMockStream()
	mockStream.close()

	// Register a session with a very old activity timestamp.
	staleTimestamp := time.Now().Add(-time.Second)
	session := &ReplicaSession{
		ID:              "stale-session",
		StartSequence:   0,
		Stream:          mockStream,
		LastAckSequence: 0,
		SupportedCodecs: []proto.CompressionCodec{proto.CompressionCodec_NONE},
		Connected:       true,
		Active:          true,
		LastActivity:    staleTimestamp,
	}
	primary.registerReplicaSession(session)

	// Poll for the removal instead of sleeping a fixed amount: the test
	// finishes as soon as the session is gone and tolerates a slow scheduler
	// up to the deadline.
	const (
		removalDeadline = 2 * time.Second
		pollInterval    = 10 * time.Millisecond
	)
	deadline := time.Now().Add(removalDeadline)
	for primary.getSession("stale-session") != nil {
		if time.Now().After(deadline) {
			t.Errorf("Expected stale session to be removed, but it still exists")
			return
		}
		time.Sleep(pollInterval)
	}
}
// TestHeartbeatManagerStop verifies that the heartbeat manager can be cleanly stopped
func TestHeartbeatManagerStop(t *testing.T) {
// Create a test heartbeat manager
hb := newHeartbeatManager(nil, &HeartbeatConfig{
Interval: 10 * time.Millisecond,
Timeout: 50 * time.Millisecond,
SendEmptyResponses: true,
})
// Start the manager
hb.start()
// Verify it's running
hb.mu.Lock()
running := hb.running
hb.mu.Unlock()
if !running {
t.Fatal("Heartbeat manager should be running after start()")
}
// Stop the manager
hb.stop()
// Verify it's stopped
hb.mu.Lock()
running = hb.running
hb.mu.Unlock()
if running {
t.Fatal("Heartbeat manager should not be running after stop()")
}
}
// TestSessionContext verifies that the context returned by
// heartbeat.sessionContext is live for a connected session and is canceled
// after the session is marked as disconnected.
func TestSessionContext(t *testing.T) {
	// Create a test WAL.
	mockWal := createTestWAL()
	// Deferred calls run last-in-first-out, so the directory cleanup is
	// registered first: it then runs only after the WAL has been closed.
	defer cleanupTestData(t)
	defer mockWal.Close()

	// Use a faster heartbeat configuration. The local is named cfg so it does
	// not shadow the imported config package.
	cfg := DefaultPrimaryConfig()
	cfg.HeartbeatConfig = &HeartbeatConfig{
		Interval:           50 * time.Millisecond,
		Timeout:            150 * time.Millisecond,
		SendEmptyResponses: true,
	}

	// Create the primary.
	primary, err := NewPrimary(mockWal, cfg)
	if err != nil {
		t.Fatalf("Failed to create primary: %v", err)
	}
	defer primary.Close()

	// Register a freshly active session backed by a mock stream.
	mockStream := newMockStream()
	session := &ReplicaSession{
		ID:              "context-test-session",
		StartSequence:   0,
		Stream:          mockStream,
		LastAckSequence: 0,
		SupportedCodecs: []proto.CompressionCodec{proto.CompressionCodec_NONE},
		Connected:       true,
		Active:          true,
		LastActivity:    time.Now(),
	}
	primary.registerReplicaSession(session)

	// Get a session context; cancel is deferred so the watcher goroutine
	// below is released even if the test fails early.
	ctx, cancel := primary.heartbeat.sessionContext(session.ID)
	defer cancel()

	// The context must still be active at this point.
	select {
	case <-ctx.Done():
		t.Fatalf("Context should not be done yet")
	default:
		// This is expected.
	}

	// Signal on doneCh once the context is done.
	doneCh := make(chan struct{})
	go func() {
		<-ctx.Done()
		close(doneCh)
	}()

	// Wait a bit to make sure the goroutine is running.
	time.Sleep(50 * time.Millisecond)

	// Mark the session as disconnected.
	session.mu.Lock()
	session.Connected = false
	session.mu.Unlock()

	// The context should now be canceled within the allowed window.
	select {
	case <-doneCh:
		// This is expected.
	case <-time.After(300 * time.Millisecond):
		t.Fatalf("Context was not canceled after session disconnected")
	}
}
// TestPingSession verifies heartbeat.pingSession in three situations:
//   - an active session: the ping succeeds, LastActivity is refreshed and a
//     message is sent on the stream;
//   - an unknown session ID: the ping fails;
//   - a session whose stream rejects sends: the ping fails and the session is
//     marked as neither connected nor active.
func TestPingSession(t *testing.T) {
	// Create a test WAL.
	mockWal := createTestWAL()
	// Deferred calls run last-in-first-out, so the directory cleanup is
	// registered first: it then runs only after the WAL has been closed.
	defer cleanupTestData(t)
	defer mockWal.Close()

	// Use a faster heartbeat configuration. The local is named cfg so it does
	// not shadow the imported config package.
	cfg := DefaultPrimaryConfig()
	cfg.HeartbeatConfig = &HeartbeatConfig{
		Interval:           500 * time.Millisecond,
		Timeout:            1 * time.Second,
		SendEmptyResponses: true,
	}

	// Create the primary.
	primary, err := NewPrimary(mockWal, cfg)
	if err != nil {
		t.Fatalf("Failed to create primary: %v", err)
	}
	defer primary.Close()

	// Register a session with an older activity time.
	mockStream := newMockStream()
	session := &ReplicaSession{
		ID:              "ping-test-session",
		StartSequence:   0,
		Stream:          mockStream,
		LastAckSequence: 0,
		SupportedCodecs: []proto.CompressionCodec{proto.CompressionCodec_NONE},
		Connected:       true,
		Active:          true,
		LastActivity:    time.Now().Add(-800 * time.Millisecond), // Older activity time
	}
	primary.registerReplicaSession(session)

	// Manually ping the session.
	result := primary.heartbeat.pingSession(session.ID)
	if !result {
		t.Fatalf("Ping should succeed for active session")
	}

	// Verify that LastActivity was updated.
	session.mu.Lock()
	lastActivity := session.LastActivity
	session.mu.Unlock()
	if time.Since(lastActivity) > 100*time.Millisecond {
		t.Errorf("LastActivity should have been updated recently, but it's %v old",
			time.Since(lastActivity))
	}

	// Verify a heartbeat was sent.
	if mockStream.getMessageCount() < 1 {
		t.Fatalf("Expected at least 1 message after ping, got %d",
			mockStream.getMessageCount())
	}

	// Try to ping a non-existent session.
	result = primary.heartbeat.pingSession("non-existent-session")
	if result {
		t.Fatalf("Ping should fail for non-existent session")
	}

	// Try to ping a session whose stream rejects the ping: after close, Send
	// returns an error.
	mockStream.close()
	result = primary.heartbeat.pingSession(session.ID)
	if result {
		t.Fatalf("Ping should fail when stream has errors")
	}

	// Verify the session was marked as disconnected.
	session.mu.Lock()
	connected := session.Connected
	active := session.Active
	session.mu.Unlock()
	if connected || active {
		t.Errorf("Session should be marked as disconnected after failed ping")
	}
}
// Implementation of test teardown helpers.

// cleanupTestData removes the "test-data-wal" directory, resolved relative to
// the current working directory, together with everything inside it.
//
// Cleanup is best-effort: a failure is logged on t, including whatever the
// command printed, but does not fail the test. The function is marked as a
// test helper so the log line is attributed to the calling test.
func cleanupTestData(t *testing.T) {
	t.Helper()

	// rm is invoked directly: the path is fixed, so no shell is needed.
	out, err := exec.Command("rm", "-rf", "test-data-wal").CombinedOutput()
	if err != nil {
		t.Logf("Error cleaning up test data: %v (output: %q)", err, out)
	}
}
// TestHeartbeatWithTLSKeepalive briefly verifies integration with TLS
// keepalive.
//
// This test only checks that a primary created with heartbeats configured has
// a heartbeat manager and that the manager is running. A full integration
// test would require setting up actual TLS connections.
func TestHeartbeatWithTLSKeepalive(t *testing.T) {
	// Create a test WAL.
	mockWal := createTestWAL()
	// Deferred calls run last-in-first-out, so the directory cleanup is
	// registered first: it then runs only after the WAL has been closed.
	defer cleanupTestData(t)
	defer mockWal.Close()

	// Create a configuration with heartbeats enabled. The local is named cfg
	// so it does not shadow the imported config package.
	cfg := DefaultPrimaryConfig()
	cfg.HeartbeatConfig = &HeartbeatConfig{
		Interval:           500 * time.Millisecond,
		Timeout:            2 * time.Second,
		SendEmptyResponses: true,
	}

	// Create the primary.
	primary, err := NewPrimary(mockWal, cfg)
	if err != nil {
		t.Fatalf("Failed to create primary: %v", err)
	}
	defer primary.Close()

	// Verify the heartbeat manager exists and is running.
	if primary.heartbeat == nil {
		t.Fatal("Heartbeat manager should be created")
	}
	primary.heartbeat.mu.Lock()
	running := primary.heartbeat.running
	primary.heartbeat.mu.Unlock()
	if !running {
		t.Fatal("Heartbeat manager should be running")
	}
}