This commit adds comprehensive reliability features to the replication transport layer:

- Add retry logic with exponential backoff for all network operations (see the configuration sketch below)
- Implement circuit breaker pattern to prevent cascading failures
- Add reconnection handling with automatic recovery
- Implement proper timeout handling for all network operations
- Add comprehensive logging for connection issues
- Improve error handling with temporary error classification
- Enhance stream processing with automatic recovery

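The exponential backoff listed above is driven by the transport.RetryPolicy fields this file reads (MaxRetries, InitialBackoff, MaxBackoff, BackoffFactor, Jitter). A minimal configuration sketch with illustrative values only (the project's actual defaults are not shown in this change):

	policy := transport.RetryPolicy{
		MaxRetries:     5,                      // after 5 failed attempts the reconnect loop trips the circuit breaker; 0 means retry forever
		InitialBackoff: 100 * time.Millisecond, // delay before the first reconnect attempt
		MaxBackoff:     5 * time.Second,        // cap on the exponentially growing delay
		BackoffFactor:  1.0,                    // extra multiplier applied on top of the doubling
		Jitter:         0.2,                    // randomizes each delay by roughly ±10% to avoid reconnect storms
	}
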
package transport

import (
	"context"
	"math"
	"time"

	"github.com/KevoDB/kevo/pkg/transport"
)

// reconnectLoop continuously attempts to reconnect the client
func (c *ReplicationGRPCClient) reconnectLoop(initialDelay time.Duration) {
	// If we're shutting down, don't attempt to reconnect
	if c.shuttingDown {
		return
	}

	// Start with initial delay
	delay := initialDelay

	// Reset reconnect attempt counter on first try
	c.reconnectAttempt = 0

	for {
		// Check if we're shutting down
		if c.shuttingDown {
			return
		}

		// Wait for the delay
		time.Sleep(delay)

		// Attempt to reconnect
		c.reconnectAttempt++
		maxAttempts := c.options.RetryPolicy.MaxRetries

		c.logger.Info("Attempting to reconnect (%d/%d)", c.reconnectAttempt, maxAttempts)

		// Create context with timeout
		ctx, cancel := context.WithTimeout(context.Background(), c.options.Timeout)

		// Attempt connection
		err := c.Connect(ctx)
		cancel()

		if err == nil {
			// Connection successful
			c.logger.Info("Successfully reconnected after %d attempts", c.reconnectAttempt)

			// Reset circuit breaker
			c.circuitBreaker.Reset()

			// Register with primary if we have a replica ID
			if c.replicaID != "" {
				ctx, cancel := context.WithTimeout(context.Background(), c.options.Timeout)
				defer cancel()

				err := c.RegisterAsReplica(ctx, c.replicaID)
				if err != nil {
					c.logger.Error("Failed to re-register as replica: %v", err)
				} else {
					c.logger.Info("Successfully re-registered as replica %s", c.replicaID)
				}
			}

			return
		}

		// Log the reconnection failure
		c.logger.Error("Failed to reconnect (attempt %d/%d): %v",
			c.reconnectAttempt, maxAttempts, err)

		// Check if we've exceeded the maximum number of reconnection attempts
		if maxAttempts > 0 && c.reconnectAttempt >= maxAttempts {
			c.logger.Error("Maximum reconnection attempts (%d) exceeded", maxAttempts)
			// Trip the circuit breaker to prevent further attempts for a while
			c.circuitBreaker.Trip()
			return
		}

		// Increase delay for next attempt (with jitter)
		delay = calculateBackoff(c.reconnectAttempt, c.options.RetryPolicy)
	}
}

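// Note on the circuit breaker as used in this file: IsOpen gates new
// reconnection attempts (see maybeReconnect below), Trip forces the breaker
// open once the retry budget is exhausted, and Reset closes it again after a
// successful reconnect. The concrete breaker implementation is defined
// elsewhere and not shown here.
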
// calculateBackoff calculates the backoff duration for the next reconnection attempt
func calculateBackoff(attempt int, policy transport.RetryPolicy) time.Duration {
	// Calculate base backoff using exponential formula
	backoff := float64(policy.InitialBackoff) *
		math.Pow(2, float64(attempt-1)) // 2^(attempt-1)

	// Apply backoff factor if specified
	if policy.BackoffFactor > 0 {
		backoff *= policy.BackoffFactor
	}

	// Apply jitter if specified
	if policy.Jitter > 0 {
		jitter := 1.0 - policy.Jitter/2 + policy.Jitter*float64(time.Now().UnixNano()%1000)/1000.0
		backoff *= jitter
	}

	// Cap at max backoff
	if policy.MaxBackoff > 0 && time.Duration(backoff) > policy.MaxBackoff {
		return policy.MaxBackoff
	}

	return time.Duration(backoff)
}

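// Worked example (illustrative values, not taken from this change): with
// InitialBackoff = 100ms, BackoffFactor = 0 and Jitter = 0, successive attempts
// wait roughly 100ms, 200ms, 400ms, 800ms, ... (InitialBackoff * 2^(attempt-1)),
// until the computed delay exceeds MaxBackoff, after which every attempt waits
// MaxBackoff. A non-zero Jitter scales each delay by a pseudo-random factor in
// [1 - Jitter/2, 1 + Jitter/2) so that replicas do not reconnect in lockstep.
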
// maybeReconnect checks if the connection is alive, and starts a reconnection
// loop if it's not
func (c *ReplicationGRPCClient) maybeReconnect() {
	// Check if we're connected
	if c.IsConnected() {
		return
	}

	// Check if the circuit breaker is open
	if c.circuitBreaker.IsOpen() {
		c.logger.Warn("Circuit breaker is open, not attempting to reconnect")
		return
	}

	// Start reconnection loop in a new goroutine
	go c.reconnectLoop(c.options.RetryPolicy.InitialBackoff)
}

// handleConnectionError processes a connection error and triggers reconnection if needed
func (c *ReplicationGRPCClient) handleConnectionError(err error) error {
	if err == nil {
		return nil
	}

	// Update status
	c.mu.Lock()
	c.status.LastError = err
	wasConnected := c.status.Connected
	c.status.Connected = false
	c.mu.Unlock()

	// Log the error
	c.logger.Error("Connection error: %v", err)

	// Check if we should attempt to reconnect
	if wasConnected && !c.shuttingDown {
		c.logger.Info("Connection lost, attempting to reconnect")
		go c.reconnectLoop(c.options.RetryPolicy.InitialBackoff)
	}

	return err
}

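// withConnectionRecovery is an illustrative sketch, not part of the original
// change: it shows how a call site might route failures through the machinery
// above. The op parameter is hypothetical, standing in for an actual gRPC call.
func (c *ReplicationGRPCClient) withConnectionRecovery(ctx context.Context, op func(context.Context) error) error {
	// Kick off a reconnect loop if the connection has dropped (no-op while the
	// circuit breaker is open or we are already connected).
	c.maybeReconnect()

	// Bound the operation with the client's configured timeout.
	opCtx, cancel := context.WithTimeout(ctx, c.options.Timeout)
	defer cancel()

	if err := op(opCtx); err != nil {
		// Record the failure and trigger reconnection if the link was just lost.
		return c.handleConnectionError(err)
	}
	return nil
}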