kevo/pkg/transport/retry.go
Jeremy Tregunna 61858f595e
feat: implement reliability features for replication transport
This commit adds comprehensive reliability features to the replication transport layer:

- Add retry logic with exponential backoff for all network operations
- Implement circuit breaker pattern to prevent cascading failures
- Add reconnection handling with automatic recovery
- Implement proper timeout handling for all network operations
- Add comprehensive logging for connection issues
- Improve error handling with temporary error classification
- Enhance stream processing with automatic recovery
2025-04-26 13:32:23 -06:00

package transport

import (
	"context"
	"math"
	"math/rand"
	"time"
)

// RetryableFunc is a function that can be retried
type RetryableFunc func(ctx context.Context) error

// WithRetry executes a function with retry logic based on the provided policy
func WithRetry(ctx context.Context, policy RetryPolicy, fn RetryableFunc) error {
	var err error
	backoff := policy.InitialBackoff

	for attempt := 0; attempt <= policy.MaxRetries; attempt++ {
		// Execute the function
		err = fn(ctx)
		if err == nil {
			// Success
			return nil
		}

		// Check if we should continue retrying
		if attempt == policy.MaxRetries {
			break
		}

		// Check if context is done
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
			// Continue
		}

		// Add jitter to prevent thundering herd
		jitter := 1.0
		if policy.Jitter > 0 {
			jitter = 1.0 + rand.Float64()*policy.Jitter
		}

		// Calculate next backoff with jitter
		backoffWithJitter := time.Duration(float64(backoff) * jitter)
		if backoffWithJitter > policy.MaxBackoff {
			backoffWithJitter = policy.MaxBackoff
		}

		// Wait for backoff period
		timer := time.NewTimer(backoffWithJitter)
		select {
		case <-ctx.Done():
			timer.Stop()
			return ctx.Err()
		case <-timer.C:
			// Continue with next attempt
		}

		// Increase backoff for next attempt
		backoff = time.Duration(float64(backoff) * policy.BackoffFactor)
		if backoff > policy.MaxBackoff {
			backoff = policy.MaxBackoff
		}
	}

	return err
}

// DefaultRetryPolicy returns a sensible default retry policy
func DefaultRetryPolicy() RetryPolicy {
	return RetryPolicy{
		MaxRetries:     3,
		InitialBackoff: 100 * time.Millisecond,
		MaxBackoff:     5 * time.Second,
		BackoffFactor:  2.0,
		Jitter:         0.2,
	}
}
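
// Illustrative usage (a sketch, not part of the original file): retrying a
// network operation with the default policy. dialReplica and its address are
// hypothetical stand-ins for a real transport call.
//
//	err := WithRetry(ctx, DefaultRetryPolicy(), func(ctx context.Context) error {
//		return dialReplica(ctx, "replica-1:9090")
//	})
//	if err != nil {
//		// All attempts failed, or the context was cancelled
//	}
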
// CircuitBreakerState represents the state of a circuit breaker
type CircuitBreakerState int

const (
	// CircuitClosed means the circuit is closed and operations are permitted
	CircuitClosed CircuitBreakerState = iota
	// CircuitOpen means the circuit is open and operations will fail fast
	CircuitOpen
	// CircuitHalfOpen means the circuit is allowing a test operation
	CircuitHalfOpen
)
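
// State transitions, as implemented below: Closed -> Open once failureThreshold
// consecutive failures accumulate; Open -> HalfOpen when resetTimeout has
// elapsed and a caller attempts an operation; HalfOpen -> Closed after
// successThreshold successes, or HalfOpen -> Open again on any failure.
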
// CircuitBreaker implements the circuit breaker pattern. Its state is not
// protected by a lock, so it is not safe for concurrent use without external
// synchronization.
type CircuitBreaker struct {
	state             CircuitBreakerState
	failureThreshold  int
	resetTimeout      time.Duration
	failureCount      int
	lastFailure       time.Time
	lastStateChange   time.Time
	successThreshold  int
	halfOpenSuccesses int
}

// NewCircuitBreaker creates a new circuit breaker
func NewCircuitBreaker(failureThreshold int, resetTimeout time.Duration) *CircuitBreaker {
	return &CircuitBreaker{
		state:            CircuitClosed,
		failureThreshold: failureThreshold,
		resetTimeout:     resetTimeout,
		successThreshold: 1, // Default to 1 success required to close circuit
	}
}

// Execute attempts to execute a function with circuit breaker protection
func (cb *CircuitBreaker) Execute(ctx context.Context, fn RetryableFunc) error {
	// Fail fast while the circuit is open or half-open, unless enough time
	// has passed to attempt a reset
	if cb.IsOpen() && !cb.shouldAttemptReset() {
		return ErrCircuitOpen
	}

	// Mark as half-open if we're attempting a reset
	if cb.state == CircuitOpen {
		cb.state = CircuitHalfOpen
		cb.halfOpenSuccesses = 0
		cb.lastStateChange = time.Now()
	}

	// Execute the function
	err := fn(ctx)

	// Handle result
	if err != nil {
		// Record failure
		cb.recordFailure()
		return err
	}

	// Record success
	cb.recordSuccess()
	return nil
}

// IsOpen reports whether the circuit is open or half-open; in both states new
// operations are rejected until a reset attempt is due
func (cb *CircuitBreaker) IsOpen() bool {
	return cb.state == CircuitOpen || cb.state == CircuitHalfOpen
}

// Trip manually opens the circuit
func (cb *CircuitBreaker) Trip() {
	cb.state = CircuitOpen
	cb.lastStateChange = time.Now()
}

// Reset manually closes the circuit
func (cb *CircuitBreaker) Reset() {
	cb.state = CircuitClosed
	cb.failureCount = 0
	cb.lastStateChange = time.Now()
}

// recordFailure records a failure and potentially opens the circuit
func (cb *CircuitBreaker) recordFailure() {
	cb.lastFailure = time.Now()

	switch cb.state {
	case CircuitClosed:
		cb.failureCount++
		if cb.failureCount >= cb.failureThreshold {
			cb.state = CircuitOpen
			cb.lastStateChange = time.Now()
		}
	case CircuitHalfOpen:
		cb.state = CircuitOpen
		cb.lastStateChange = time.Now()
	}
}

// recordSuccess records a success and potentially closes the circuit
func (cb *CircuitBreaker) recordSuccess() {
	switch cb.state {
	case CircuitHalfOpen:
		cb.halfOpenSuccesses++
		if cb.halfOpenSuccesses >= cb.successThreshold {
			cb.state = CircuitClosed
			cb.failureCount = 0
			cb.lastStateChange = time.Now()
		}
	case CircuitClosed:
		// Reset failure count after a success
		cb.failureCount = 0
	}
}

// shouldAttemptReset determines if enough time has passed to attempt a reset
func (cb *CircuitBreaker) shouldAttemptReset() bool {
	return cb.state == CircuitOpen &&
		time.Since(cb.lastStateChange) >= cb.resetTimeout
}
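
// Illustrative usage (a sketch, not part of the original file): wrapping
// retried sends in a breaker so that a persistently failing peer trips the
// circuit and later calls fail fast with ErrCircuitOpen instead of hammering
// a dead peer. sendBatch is a hypothetical RetryableFunc.
//
//	cb := NewCircuitBreaker(5, 30*time.Second)
//	err := cb.Execute(ctx, func(ctx context.Context) error {
//		return WithRetry(ctx, DefaultRetryPolicy(), sendBatch)
//	})
//	if errors.Is(err, ErrCircuitOpen) {
//		// Skip this peer for now and let reconnection handling take over
//	}
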
// ExponentialBackoff calculates the next backoff duration
func ExponentialBackoff(attempt int, initialBackoff time.Duration, maxBackoff time.Duration, factor float64) time.Duration {
	backoff := float64(initialBackoff) * math.Pow(factor, float64(attempt))
	if backoff > float64(maxBackoff) {
		return maxBackoff
	}
	return time.Duration(backoff)
}
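
// For example, with initialBackoff = 100ms, factor = 2.0, and maxBackoff = 5s:
// attempt 0 -> 100ms, 1 -> 200ms, 2 -> 400ms, 3 -> 800ms, 4 -> 1.6s, 5 -> 3.2s,
// and from attempt 6 onward the result is capped at 5s (100ms * 2^6 = 6.4s).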