This commit adds comprehensive reliability features to the replication transport layer:

- Add retry logic with exponential backoff for all network operations (see the configuration sketch below)
- Implement circuit breaker pattern to prevent cascading failures
- Add reconnection handling with automatic recovery
- Implement proper timeout handling for all network operations
- Add comprehensive logging for connection issues
- Improve error handling with temporary error classification
- Enhance stream processing with automatic recovery

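The exponential backoff listed above is driven by the transport.RetryPolicy fields this file reads (MaxRetries, InitialBackoff, MaxBackoff, BackoffFactor, Jitter). A minimal configuration sketch with illustrative values only (the project's actual defaults are not shown in this change):

	policy := transport.RetryPolicy{
		MaxRetries:     5,                      // after 5 failed attempts the reconnect loop trips the circuit breaker; 0 means retry forever
		InitialBackoff: 100 * time.Millisecond, // delay before the first reconnect attempt
		MaxBackoff:     5 * time.Second,        // cap on the exponentially growing delay
		BackoffFactor:  1.0,                    // extra multiplier applied on top of the doubling
		Jitter:         0.2,                    // randomizes each delay by roughly ±10% to avoid reconnect storms
	}
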
package transport

import (
	"context"
	"math"
	"time"

	"github.com/KevoDB/kevo/pkg/transport"
)

// reconnectLoop continuously attempts to reconnect the client
func (c *ReplicationGRPCClient) reconnectLoop(initialDelay time.Duration) {
	// If we're shutting down, don't attempt to reconnect
	if c.shuttingDown {
		return
	}

	// Start with initial delay
	delay := initialDelay

	// Reset reconnect attempt counter on first try
	c.reconnectAttempt = 0

	for {
		// Check if we're shutting down
		if c.shuttingDown {
			return
		}

		// Wait for the delay
		time.Sleep(delay)

		// Attempt to reconnect
		c.reconnectAttempt++
		maxAttempts := c.options.RetryPolicy.MaxRetries

		c.logger.Info("Attempting to reconnect (%d/%d)", c.reconnectAttempt, maxAttempts)

		// Create context with timeout
		ctx, cancel := context.WithTimeout(context.Background(), c.options.Timeout)

		// Attempt connection
		err := c.Connect(ctx)
		cancel()

		if err == nil {
			// Connection successful
			c.logger.Info("Successfully reconnected after %d attempts", c.reconnectAttempt)

			// Reset circuit breaker
			c.circuitBreaker.Reset()

			// Register with primary if we have a replica ID
			if c.replicaID != "" {
				ctx, cancel := context.WithTimeout(context.Background(), c.options.Timeout)
				defer cancel()

				err := c.RegisterAsReplica(ctx, c.replicaID)
				if err != nil {
					c.logger.Error("Failed to re-register as replica: %v", err)
				} else {
					c.logger.Info("Successfully re-registered as replica %s", c.replicaID)
				}
			}

			return
		}

		// Log the reconnection failure
		c.logger.Error("Failed to reconnect (attempt %d/%d): %v",
			c.reconnectAttempt, maxAttempts, err)

		// Check if we've exceeded the maximum number of reconnection attempts
		if maxAttempts > 0 && c.reconnectAttempt >= maxAttempts {
			c.logger.Error("Maximum reconnection attempts (%d) exceeded", maxAttempts)
			// Trip the circuit breaker to prevent further attempts for a while
			c.circuitBreaker.Trip()
			return
		}

		// Increase delay for next attempt (with jitter)
		delay = calculateBackoff(c.reconnectAttempt, c.options.RetryPolicy)
	}
}

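// Note on the circuit breaker as used in this file: IsOpen gates new
// reconnection attempts (see maybeReconnect below), Trip forces the breaker
// open once the retry budget is exhausted, and Reset closes it again after a
// successful reconnect. The concrete breaker implementation is defined
// elsewhere and not shown here.
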
// calculateBackoff calculates the backoff duration for the next reconnection attempt
func calculateBackoff(attempt int, policy transport.RetryPolicy) time.Duration {
	// Calculate base backoff using exponential formula
	backoff := float64(policy.InitialBackoff) *
		math.Pow(2, float64(attempt-1)) // 2^(attempt-1)

	// Apply backoff factor if specified
	if policy.BackoffFactor > 0 {
		backoff *= policy.BackoffFactor
	}

	// Apply jitter if specified
	if policy.Jitter > 0 {
		jitter := 1.0 - policy.Jitter/2 + policy.Jitter*float64(time.Now().UnixNano()%1000)/1000.0
		backoff *= jitter
	}

	// Cap at max backoff
	if policy.MaxBackoff > 0 && time.Duration(backoff) > policy.MaxBackoff {
		return policy.MaxBackoff
	}

	return time.Duration(backoff)
}

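// Worked example (illustrative values, not taken from this change): with
// InitialBackoff = 100ms, BackoffFactor = 0 and Jitter = 0, successive attempts
// wait roughly 100ms, 200ms, 400ms, 800ms, ... (InitialBackoff * 2^(attempt-1)),
// until the computed delay exceeds MaxBackoff, after which every attempt waits
// MaxBackoff. A non-zero Jitter scales each delay by a pseudo-random factor in
// [1 - Jitter/2, 1 + Jitter/2) so that replicas do not reconnect in lockstep.
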
// maybeReconnect checks if the connection is alive, and starts a reconnection
// loop if it's not
func (c *ReplicationGRPCClient) maybeReconnect() {
	// Check if we're connected
	if c.IsConnected() {
		return
	}

	// Check if the circuit breaker is open
	if c.circuitBreaker.IsOpen() {
		c.logger.Warn("Circuit breaker is open, not attempting to reconnect")
		return
	}

	// Start reconnection loop in a new goroutine
	go c.reconnectLoop(c.options.RetryPolicy.InitialBackoff)
}

// handleConnectionError processes a connection error and triggers reconnection if needed
func (c *ReplicationGRPCClient) handleConnectionError(err error) error {
	if err == nil {
		return nil
	}

	// Update status
	c.mu.Lock()
	c.status.LastError = err
	wasConnected := c.status.Connected
	c.status.Connected = false
	c.mu.Unlock()

	// Log the error
	c.logger.Error("Connection error: %v", err)

	// Check if we should attempt to reconnect
	if wasConnected && !c.shuttingDown {
		c.logger.Info("Connection lost, attempting to reconnect")
		go c.reconnectLoop(c.options.RetryPolicy.InitialBackoff)
	}

	return err
}

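// withConnectionRecovery is an illustrative sketch, not part of the original
// change: it shows how a call site might route failures through the machinery
// above. The op parameter is hypothetical, standing in for an actual gRPC call.
func (c *ReplicationGRPCClient) withConnectionRecovery(ctx context.Context, op func(context.Context) error) error {
	// Kick off a reconnect loop if the connection has dropped (no-op while the
	// circuit breaker is open or we are already connected).
	c.maybeReconnect()

	// Bound the operation with the client's configured timeout.
	opCtx, cancel := context.WithTimeout(ctx, c.options.Timeout)
	defer cancel()

	if err := op(opCtx); err != nil {
		// Record the failure and trigger reconnection if the link was just lost.
		return c.handleConnectionError(err)
	}
	return nil
}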