kevo/pkg/replication/replica_test.go

package replication

import (
	"context"
	"fmt"
	"io/ioutil"
	"net"
	"os"
	"path/filepath"
	"sync"
	"testing"
	"time"

	"github.com/KevoDB/kevo/pkg/config"
	"github.com/KevoDB/kevo/pkg/wal"
	replication_proto "github.com/KevoDB/kevo/proto/kevo/replication"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/test/bufconn"
)

const bufSize = 1024 * 1024

// testWALEntryApplier implements WALEntryApplier for testing
type testWALEntryApplier struct {
	entries      []*wal.Entry
	appliedCount int
	syncCount    int
	mu           sync.Mutex
	shouldFail   bool
	wal          *wal.WAL
}

func newTestWALEntryApplier(walDir string) (*testWALEntryApplier, error) {
	// Create a WAL for the applier to write to
	cfg := &config.Config{
		WALDir:      walDir,
		WALSyncMode: config.SyncImmediate,
		WALMaxSize:  64 * 1024 * 1024, // 64MB
	}
	testWal, err := wal.NewWAL(cfg, walDir)
	if err != nil {
		return nil, fmt.Errorf("failed to create WAL for applier: %w", err)
	}

	return &testWALEntryApplier{
		entries: make([]*wal.Entry, 0),
		wal:     testWal,
	}, nil
}

func (a *testWALEntryApplier) Apply(entry *wal.Entry) error {
	a.mu.Lock()
	defer a.mu.Unlock()

	if a.shouldFail {
		return fmt.Errorf("simulated apply failure")
	}

	// Store the entry in our list
	a.entries = append(a.entries, entry)
	a.appliedCount++

	return nil
}

func (a *testWALEntryApplier) Sync() error {
	a.mu.Lock()
	defer a.mu.Unlock()

	if a.shouldFail {
		return fmt.Errorf("simulated sync failure")
	}

	// Sync the WAL
	if err := a.wal.Sync(); err != nil {
		return err
	}

	a.syncCount++
	return nil
}

func (a *testWALEntryApplier) Close() error {
	return a.wal.Close()
}

func (a *testWALEntryApplier) GetAppliedEntries() []*wal.Entry {
	a.mu.Lock()
	defer a.mu.Unlock()

	result := make([]*wal.Entry, len(a.entries))
	copy(result, a.entries)
	return result
}

func (a *testWALEntryApplier) GetAppliedCount() int {
	a.mu.Lock()
	defer a.mu.Unlock()
	return a.appliedCount
}

func (a *testWALEntryApplier) GetSyncCount() int {
	a.mu.Lock()
	defer a.mu.Unlock()
	return a.syncCount
}

func (a *testWALEntryApplier) SetShouldFail(shouldFail bool) {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.shouldFail = shouldFail
}

// bufConnServerConnector is a connector that uses bufconn for testing
type bufConnServerConnector struct {
	client replication_proto.WALReplicationServiceClient
}

func (c *bufConnServerConnector) Connect(r *Replica) error {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.client = c.client
	return nil
}

// setupTestEnvironment sets up a complete test environment with WAL, Primary, and gRPC server
func setupTestEnvironment(t *testing.T) (string, *wal.WAL, *Primary, replication_proto.WALReplicationServiceClient, func()) {
	// Create a temporary directory for the WAL files
	tempDir, err := ioutil.TempDir("", "wal_replication_test")
	if err != nil {
		t.Fatalf("Failed to create temporary directory: %v", err)
	}

	// Create primary WAL directory
	primaryWalDir := filepath.Join(tempDir, "primary_wal")
	if err := os.MkdirAll(primaryWalDir, 0755); err != nil {
		t.Fatalf("Failed to create primary WAL directory: %v", err)
	}

	// Create replica WAL directory
	replicaWalDir := filepath.Join(tempDir, "replica_wal")
	if err := os.MkdirAll(replicaWalDir, 0755); err != nil {
		t.Fatalf("Failed to create replica WAL directory: %v", err)
	}

	// Create the primary WAL
	primaryCfg := &config.Config{
		WALDir:      primaryWalDir,
		WALSyncMode: config.SyncImmediate,
		WALMaxSize:  64 * 1024 * 1024, // 64MB
	}
	primaryWAL, err := wal.NewWAL(primaryCfg, primaryWalDir)
	if err != nil {
		t.Fatalf("Failed to create primary WAL: %v", err)
	}

	// Create a Primary with the WAL
	primary, err := NewPrimary(primaryWAL, &PrimaryConfig{
		MaxBatchSizeKB:    256, // 256 KB
		EnableCompression: false,
		CompressionCodec:  replication_proto.CompressionCodec_NONE,
		RetentionConfig: WALRetentionConfig{
			MaxAgeHours: 1, // 1 hour retention
		},
	})
	if err != nil {
		t.Fatalf("Failed to create primary: %v", err)
	}

	// Setup gRPC server over bufconn
	listener := bufconn.Listen(bufSize)
	server := grpc.NewServer()
	replication_proto.RegisterWALReplicationServiceServer(server, primary)

	go func() {
		if err := server.Serve(listener); err != nil {
			t.Logf("Server error: %v", err)
		}
	}()

	// Create a client connection
	dialer := func(context.Context, string) (net.Conn, error) {
		return listener.Dial()
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	conn, err := grpc.DialContext(ctx, "bufnet",
		grpc.WithContextDialer(dialer),
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithBlock())
	if err != nil {
		t.Fatalf("Failed to dial bufnet: %v", err)
	}

	client := replication_proto.NewWALReplicationServiceClient(conn)

	// Return a cleanup function
	cleanup := func() {
		conn.Close()
		server.Stop()
		listener.Close()
		primaryWAL.Close()
		os.RemoveAll(tempDir)
	}

	return replicaWalDir, primaryWAL, primary, client, cleanup
}

// Test creating a new replica
func TestNewReplica(t *testing.T) {
	// Create a temporary directory for the test
	tempDir, err := ioutil.TempDir("", "replica_test")
	if err != nil {
		t.Fatalf("Failed to create temporary directory: %v", err)
	}
	defer os.RemoveAll(tempDir)

	// Create an applier
	applier, err := newTestWALEntryApplier(tempDir)
	if err != nil {
		t.Fatalf("Failed to create test applier: %v", err)
	}
	defer applier.Close()

	// Create a replica
	config := DefaultReplicaConfig()
	replica, err := NewReplica(0, applier, config)
	if err != nil {
		t.Fatalf("Failed to create replica: %v", err)
	}

	// Check initial state
	if got, want := replica.GetLastAppliedSequence(), uint64(0); got != want {
		t.Errorf("GetLastAppliedSequence() = %d, want %d", got, want)
	}
	if got, want := replica.GetCurrentState(), StateConnecting; got != want {
		t.Errorf("GetCurrentState() = %v, want %v", got, want)
	}

	// Clean up
	if err := replica.Stop(); err != nil {
		t.Errorf("Failed to stop replica: %v", err)
	}
}

// Test connection and streaming with real WAL entries
func TestReplicaStreamingWithRealWAL(t *testing.T) {
	// Setup test environment
	replicaWalDir, primaryWAL, _, client, cleanup := setupTestEnvironment(t)
	defer cleanup()

	// Create test applier for the replica
	applier, err := newTestWALEntryApplier(replicaWalDir)
	if err != nil {
		t.Fatalf("Failed to create test applier: %v", err)
	}
	defer applier.Close()

	// Write some entries to the primary WAL
	numEntries := 10
	for i := 0; i < numEntries; i++ {
		key := []byte(fmt.Sprintf("key%d", i+1))
		value := []byte(fmt.Sprintf("value%d", i+1))
		if _, err := primaryWAL.Append(wal.OpTypePut, key, value); err != nil {
			t.Fatalf("Failed to append to primary WAL: %v", err)
		}
	}

	// Sync the primary WAL to ensure entries are persisted
	if err := primaryWAL.Sync(); err != nil {
		t.Fatalf("Failed to sync primary WAL: %v", err)
	}

	// Create replica config
	config := DefaultReplicaConfig()
	config.Connection.PrimaryAddress = "bufnet" // This will be ignored with our custom connector

	// Create replica
	replica, err := NewReplica(0, applier, config)
	if err != nil {
		t.Fatalf("Failed to create replica: %v", err)
	}

	// Set custom connector for testing
	replica.SetConnector(&bufConnServerConnector{client: client})

	// Start the replica
	if err := replica.Start(); err != nil {
		t.Fatalf("Failed to start replica: %v", err)
	}

	// Wait for replication to complete
	deadline := time.Now().Add(10 * time.Second)
	for time.Now().Before(deadline) {
		// Check if entries were applied
		appliedEntries := applier.GetAppliedEntries()
		t.Logf("Waiting for replication, current applied entries: %d/%d", len(appliedEntries), numEntries)

		// Log the state of the replica for debugging
		t.Logf("Replica state: %s", replica.GetStateString())

		// Also check sync count
		syncCount := applier.GetSyncCount()
		t.Logf("Current sync count: %d", syncCount)

		// Success condition: all entries applied and at least one sync
		if len(appliedEntries) == numEntries && syncCount > 0 {
			break
		}
		time.Sleep(500 * time.Millisecond)
	}

	// Verify entries were applied with more specific messages
	appliedEntries := applier.GetAppliedEntries()
	if len(appliedEntries) != numEntries {
		for i, entry := range appliedEntries {
			t.Logf("Applied entry %d: sequence=%d, key=%s, value=%s",
				i, entry.SequenceNumber, string(entry.Key), string(entry.Value))
		}
		t.Errorf("Expected %d entries to be applied, got %d", numEntries, len(appliedEntries))
	} else {
		t.Logf("All %d entries were successfully applied", numEntries)
	}

	// Verify sync was called
	syncCount := applier.GetSyncCount()
	if syncCount == 0 {
		t.Error("Sync was not called")
	} else {
		t.Logf("Sync was called %d times", syncCount)
	}

	// Verify last applied sequence matches the expected sequence
	lastSeq := replica.GetLastAppliedSequence()
	if lastSeq != uint64(numEntries) {
		t.Errorf("Expected last applied sequence to be %d, got %d", numEntries, lastSeq)
	} else {
		t.Logf("Last applied sequence is correct: %d", lastSeq)
	}

	// Stop the replica
	if err := replica.Stop(); err != nil {
		t.Errorf("Failed to stop replica: %v", err)
	}
}

// Test state transitions
func TestReplicaStateTransitions(t *testing.T) {
	// Setup test environment
	replicaWalDir, _, _, client, cleanup := setupTestEnvironment(t)
	defer cleanup()

	// Create test applier for the replica
	applier, err := newTestWALEntryApplier(replicaWalDir)
	if err != nil {
		t.Fatalf("Failed to create test applier: %v", err)
	}
	defer applier.Close()

	// Create replica
	config := DefaultReplicaConfig()
	replica, err := NewReplica(0, applier, config)
	if err != nil {
		t.Fatalf("Failed to create replica: %v", err)
	}

	// Set custom connector for testing
	replica.SetConnector(&bufConnServerConnector{client: client})

	// Test initial state
	if got, want := replica.GetCurrentState(), StateConnecting; got != want {
		t.Errorf("Initial state = %v, want %v", got, want)
	}

	// Test connecting state transition
	err = replica.handleConnectingState()
	if err != nil {
		t.Errorf("handleConnectingState() error = %v", err)
	}
	if got, want := replica.GetCurrentState(), StateStreamingEntries; got != want {
		t.Errorf("State after connecting = %v, want %v", got, want)
	}

	// Test error state transition
	err = replica.stateTracker.SetError(fmt.Errorf("test error"))
	if err != nil {
		t.Errorf("SetError() error = %v", err)
	}
	if got, want := replica.GetCurrentState(), StateError; got != want {
		t.Errorf("State after error = %v, want %v", got, want)
	}

	// Clean up
	if err := replica.Stop(); err != nil {
		t.Errorf("Failed to stop replica: %v", err)
	}
}

// Test error handling and recovery
func TestReplicaErrorRecovery(t *testing.T) {
	// Setup test environment
	replicaWalDir, primaryWAL, _, client, cleanup := setupTestEnvironment(t)
	defer cleanup()

	// Create test applier for the replica
	applier, err := newTestWALEntryApplier(replicaWalDir)
	if err != nil {
		t.Fatalf("Failed to create test applier: %v", err)
	}
	defer applier.Close()

	// Create replica with fast retry settings
	config := DefaultReplicaConfig()
	config.Connection.RetryBaseDelay = 50 * time.Millisecond
	config.Connection.RetryMaxDelay = 200 * time.Millisecond
	replica, err := NewReplica(0, applier, config)
	if err != nil {
		t.Fatalf("Failed to create replica: %v", err)
	}

	// Set custom connector for testing
	replica.SetConnector(&bufConnServerConnector{client: client})

	// Start the replica
	if err := replica.Start(); err != nil {
		t.Fatalf("Failed to start replica: %v", err)
	}

	// Write some initial entries to the primary WAL
	for i := 0; i < 5; i++ {
		key := []byte(fmt.Sprintf("key%d", i+1))
		value := []byte(fmt.Sprintf("value%d", i+1))
		if _, err := primaryWAL.Append(wal.OpTypePut, key, value); err != nil {
			t.Fatalf("Failed to append to primary WAL: %v", err)
		}
	}
	if err := primaryWAL.Sync(); err != nil {
		t.Fatalf("Failed to sync primary WAL: %v", err)
	}

	// Wait for initial replication
	time.Sleep(500 * time.Millisecond)

	// Simulate an applier failure
	applier.SetShouldFail(true)

	// Write more entries that will cause errors
	for i := 5; i < 10; i++ {
		key := []byte(fmt.Sprintf("key%d", i+1))
		value := []byte(fmt.Sprintf("value%d", i+1))
		if _, err := primaryWAL.Append(wal.OpTypePut, key, value); err != nil {
			t.Fatalf("Failed to append to primary WAL: %v", err)
		}
	}
	if err := primaryWAL.Sync(); err != nil {
		t.Fatalf("Failed to sync primary WAL: %v", err)
	}

	// Wait for error to occur
	time.Sleep(200 * time.Millisecond)

	// Fix the applier and allow recovery
	applier.SetShouldFail(false)

	// Wait for recovery to complete
	time.Sleep(1 * time.Second)

	// Verify that at least some entries were applied
	appliedEntries := applier.GetAppliedEntries()
	if len(appliedEntries) == 0 {
		t.Error("No entries were applied")
	}

	// Stop the replica
	if err := replica.Stop(); err != nil {
		t.Errorf("Failed to stop replica: %v", err)
	}
}