kevo/pkg/grpc/transport/reconnect.go
Jeremy Tregunna 61858f595e
feat: implement reliability features for replication transport
This commit adds comprehensive reliability features to the replication transport layer:

- Add retry logic with exponential backoff for all network operations
- Implement circuit breaker pattern to prevent cascading failures
- Add reconnection handling with automatic recovery
- Implement proper timeout handling for all network operations
- Add comprehensive logging for connection issues
- Improve error handling with temporary error classification
- Enhance stream processing with automatic recovery
2025-04-26 13:32:23 -06:00

package transport

import (
	"context"
	"math"
	"time"

	"github.com/KevoDB/kevo/pkg/transport"
)
// reconnectLoop continuously attempts to reconnect the client
func (c *ReplicationGRPCClient) reconnectLoop(initialDelay time.Duration) {
	// If we're shutting down, don't attempt to reconnect
	if c.shuttingDown {
		return
	}

	// Start with the initial delay
	delay := initialDelay

	// Reset the reconnect attempt counter on the first try
	c.reconnectAttempt = 0

	for {
		// Check if we're shutting down
		if c.shuttingDown {
			return
		}

		// Wait for the delay
		time.Sleep(delay)

		// Attempt to reconnect
		c.reconnectAttempt++
		maxAttempts := c.options.RetryPolicy.MaxRetries
		c.logger.Info("Attempting to reconnect (%d/%d)", c.reconnectAttempt, maxAttempts)

		// Create a context with a timeout for the connection attempt
		ctx, cancel := context.WithTimeout(context.Background(), c.options.Timeout)

		// Attempt the connection
		err := c.Connect(ctx)
		cancel()

		if err == nil {
			// Connection successful
			c.logger.Info("Successfully reconnected after %d attempts", c.reconnectAttempt)

			// Reset the circuit breaker
			c.circuitBreaker.Reset()

			// Re-register with the primary if we have a replica ID
			if c.replicaID != "" {
				ctx, cancel := context.WithTimeout(context.Background(), c.options.Timeout)
				defer cancel()

				if err := c.RegisterAsReplica(ctx, c.replicaID); err != nil {
					c.logger.Error("Failed to re-register as replica: %v", err)
				} else {
					c.logger.Info("Successfully re-registered as replica %s", c.replicaID)
				}
			}
			return
		}

		// Log the reconnection failure
		c.logger.Error("Failed to reconnect (attempt %d/%d): %v",
			c.reconnectAttempt, maxAttempts, err)

		// Check if we've exceeded the maximum number of reconnection attempts
		if maxAttempts > 0 && c.reconnectAttempt >= maxAttempts {
			c.logger.Error("Maximum reconnection attempts (%d) exceeded", maxAttempts)

			// Trip the circuit breaker to prevent further attempts for a while
			c.circuitBreaker.Trip()
			return
		}

		// Increase the delay for the next attempt (with jitter)
		delay = calculateBackoff(c.reconnectAttempt, c.options.RetryPolicy)
	}
}
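
// Note: calculateBackoff below only assumes the transport.RetryPolicy fields
// it actually reads. A minimal shape consistent with that usage (a sketch;
// the real definition lives in pkg/transport and may differ) would be:
//
//	type RetryPolicy struct {
//		MaxRetries     int           // <= 0 means retry indefinitely
//		InitialBackoff time.Duration // delay before the first retry
//		MaxBackoff     time.Duration // upper bound on any single delay
//		BackoffFactor  float64       // extra multiplier on the base backoff
//		Jitter         float64       // relative jitter width, e.g. 0.2 for ±10%
//	}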
// calculateBackoff calculates the backoff duration for the next reconnection
// attempt using exponential backoff with optional jitter.
func calculateBackoff(attempt int, policy transport.RetryPolicy) time.Duration {
	// Base backoff grows exponentially: InitialBackoff * 2^(attempt-1)
	backoff := float64(policy.InitialBackoff) * math.Pow(2, float64(attempt-1))

	// Apply the backoff factor if specified
	if policy.BackoffFactor > 0 {
		backoff *= policy.BackoffFactor
	}

	// Apply jitter if specified: scale the backoff by a pseudo-random
	// multiplier in [1-Jitter/2, 1+Jitter/2), derived from the wall clock
	if policy.Jitter > 0 {
		jitter := 1.0 - policy.Jitter/2 + policy.Jitter*float64(time.Now().UnixNano()%1000)/1000.0
		backoff *= jitter
	}

	// Cap at the maximum backoff
	if policy.MaxBackoff > 0 && time.Duration(backoff) > policy.MaxBackoff {
		return policy.MaxBackoff
	}
	return time.Duration(backoff)
}
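
// Illustrative progression (assuming InitialBackoff=100ms, BackoffFactor=1,
// Jitter=0, MaxBackoff=1s): attempt 1 -> 100ms, 2 -> 200ms, 3 -> 400ms,
// 4 -> 800ms, 5 onward -> capped at 1s.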
// maybeReconnect checks if the connection is alive, and starts a reconnection
// loop if it's not
func (c *ReplicationGRPCClient) maybeReconnect() {
	// Nothing to do if we're still connected
	if c.IsConnected() {
		return
	}

	// Don't attempt to reconnect while the circuit breaker is open
	if c.circuitBreaker.IsOpen() {
		c.logger.Warn("Circuit breaker is open, not attempting to reconnect")
		return
	}

	// Start the reconnection loop in a new goroutine
	go c.reconnectLoop(c.options.RetryPolicy.InitialBackoff)
}
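
// The circuit breaker itself is defined elsewhere; this file only depends on
// the three methods it calls. A minimal interface consistent with that usage
// (an assumption, not the actual declaration) would be:
//
//	type breaker interface {
//		IsOpen() bool // reports whether the breaker is blocking attempts
//		Trip()        // opens the breaker after repeated failures
//		Reset()       // closes the breaker after a successful reconnect
//	}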
// handleConnectionError processes a connection error and triggers reconnection
// if needed
func (c *ReplicationGRPCClient) handleConnectionError(err error) error {
	if err == nil {
		return nil
	}

	// Update the connection status under the lock
	c.mu.Lock()
	c.status.LastError = err
	wasConnected := c.status.Connected
	c.status.Connected = false
	c.mu.Unlock()

	// Log the error
	c.logger.Error("Connection error: %v", err)

	// Start a reconnection loop if we just lost an established connection
	if wasConnected && !c.shuttingDown {
		c.logger.Info("Connection lost, attempting to reconnect")
		go c.reconnectLoop(c.options.RetryPolicy.InitialBackoff)
	}

	return err
}
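
// Usage sketch (hypothetical call site, not part of this file): wrap each
// outbound RPC so that failures update status and kick off reconnection,
// checking connectivity first via maybeReconnect. sendHeartbeat and
// heartbeatRPC are illustrative names, not methods defined in this package:
//
//	func (c *ReplicationGRPCClient) sendHeartbeat(ctx context.Context) error {
//		c.maybeReconnect()
//		if err := c.heartbeatRPC(ctx); err != nil {
//			return c.handleConnectionError(err)
//		}
//		return nil
//	}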