kevo/pkg/replication/heartbeat.go
2025-04-29 15:03:03 -06:00

231 lines
5.5 KiB
Go

package replication
import (
"context"
"sync"
"time"
"github.com/KevoDB/kevo/pkg/common/log"
proto "github.com/KevoDB/kevo/pkg/replication/proto"
)
// HeartbeatConfig holds the tunables for replica-session keepalive handling.
type HeartbeatConfig struct {
	// Interval is the period of the session check ticker. It is also the
	// idle time a session must exceed before a heartbeat is sent to it.
	Interval time.Duration

	// Timeout is the idle time after which a session is marked disconnected
	// and unregistered.
	Timeout time.Duration

	// SendEmptyResponses enables sending an empty WALStreamResponse as a
	// heartbeat to sessions that have been idle for longer than Interval.
	SendEmptyResponses bool
}
// DefaultHeartbeatConfig returns a freshly allocated configuration with the
// stock settings: a 10s check interval, a 30s inactivity timeout, and empty
// heartbeat responses enabled.
func DefaultHeartbeatConfig() *HeartbeatConfig {
	cfg := new(HeartbeatConfig)
	cfg.Interval = 10 * time.Second
	cfg.Timeout = 30 * time.Second
	cfg.SendEmptyResponses = true
	return cfg
}
// heartbeatManager runs the background loop that watches replica sessions on
// a primary, sends keepalive heartbeats, and drops sessions that time out.
type heartbeatManager struct {
	// config supplies the interval, timeout and heartbeat toggle.
	config *HeartbeatConfig
	// primary owns the sessions being monitored.
	primary *Primary

	// stopChan is closed by stop to make monitorLoop return.
	stopChan chan struct{}
	// waitGroup tracks the monitorLoop goroutine so stop can wait for it.
	waitGroup sync.WaitGroup

	// mu guards running and serializes start/stop.
	mu sync.Mutex
	// running is true between a successful start and the matching stop.
	running bool
}
// newHeartbeatManager creates a heartbeat manager bound to the given primary.
//
// A nil config selects DefaultHeartbeatConfig. A config whose Interval is not
// positive is replaced by a private copy that uses the default interval:
// monitorLoop and sessionContext pass Interval to time.NewTicker, which panics
// on a non-positive duration. The caller's struct is never modified, and a
// config with a valid Interval is stored as the same pointer that was passed.
func newHeartbeatManager(primary *Primary, config *HeartbeatConfig) *heartbeatManager {
	if config == nil {
		config = DefaultHeartbeatConfig()
	} else if config.Interval <= 0 {
		normalized := *config
		normalized.Interval = DefaultHeartbeatConfig().Interval
		config = &normalized
	}

	return &heartbeatManager{
		config:   config,
		primary:  primary,
		stopChan: make(chan struct{}),
	}
}
// start launches the monitoring goroutine. Calling it while the manager is
// already running is a no-op.
func (h *heartbeatManager) start() {
	h.mu.Lock()
	defer h.mu.Unlock()

	if !h.running {
		h.running = true
		h.waitGroup.Add(1)
		go h.monitorLoop()
	}
}
// stop halts the heartbeat monitoring and blocks until the monitoring
// goroutine has exited. Calling it while the manager is not running is a
// no-op.
//
// After the goroutine has exited, stopChan is replaced with a fresh channel so
// that the manager can be started again. Without this, a restarted loop would
// see the already-closed channel and return immediately, and the next stop
// would panic by closing a closed channel.
func (h *heartbeatManager) stop() {
	// The lock is held for the whole shutdown so that start cannot launch a
	// new loop, and stopChan cannot be observed mid-replacement, until the old
	// goroutine is gone.
	h.mu.Lock()
	defer h.mu.Unlock()

	if !h.running {
		return
	}
	h.running = false

	close(h.stopChan)
	h.waitGroup.Wait()

	// Safe to write: no monitorLoop goroutine is reading stopChan anymore.
	h.stopChan = make(chan struct{})
}
// monitorLoop is the body of the monitoring goroutine. Every config.Interval
// it runs checkSessions, and it returns once stopChan is closed. It signals
// waitGroup on exit.
func (h *heartbeatManager) monitorLoop() {
	defer h.waitGroup.Done()

	tick := time.NewTicker(h.config.Interval)
	defer tick.Stop()

	for {
		select {
		case <-tick.C:
			h.checkSessions()
		case <-h.stopChan:
			return
		}
	}
}
// checkSessions inspects every session registered on the primary. Sessions
// idle for longer than config.Timeout are marked disconnected; sessions idle
// for longer than config.Interval receive an empty heartbeat when
// config.SendEmptyResponses is set. Every session that timed out or whose
// heartbeat send failed is unregistered from the primary afterwards.
func (h *heartbeatManager) checkSessions() {
	now := time.Now()

	// Snapshot the session table so primary.mu is not held while individual
	// sessions are locked and heartbeats are sent.
	h.primary.mu.RLock()
	sessions := make(map[string]*ReplicaSession, len(h.primary.sessions))
	for id, session := range h.primary.sessions {
		sessions[id] = session
	}
	h.primary.mu.RUnlock()

	var deadSessions []string
	for id, session := range sessions {
		if h.heartbeatSession(id, session, now) {
			deadSessions = append(deadSessions, id)
		}
	}

	// Unregister only after all session locks have been released.
	for _, id := range deadSessions {
		h.primary.unregisterReplicaSession(id)
	}
}

// heartbeatSession applies the timeout and heartbeat rules to one session and
// reports whether the session was marked dead and must be unregistered.
//
// All reads and writes of Connected, Active and LastActivity happen under
// session.mu, matching checkSessionActive.
func (h *heartbeatManager) heartbeatSession(id string, session *ReplicaSession, now time.Time) bool {
	session.mu.Lock()
	defer session.mu.Unlock()

	// Skip sessions that are already disconnected.
	if !session.Connected || !session.Active {
		return false
	}

	idle := now.Sub(session.LastActivity)
	if idle > h.config.Timeout {
		log.Warn("Session %s timed out after %.1fs of inactivity",
			id, idle.Seconds())
		session.Connected = false
		session.Active = false
		return true
	}

	if !h.config.SendEmptyResponses || idle <= h.config.Interval {
		return false
	}

	// An empty WALStreamResponse serves as the heartbeat message.
	heartbeat := &proto.WALStreamResponse{
		Entries:    []*proto.WALEntry{},
		Compressed: false,
		Codec:      proto.CompressionCodec_NONE,
	}

	// The send runs while session.mu is held.
	// NOTE(review): presumably this serializes writes to the stream with
	// other senders - confirm against the other users of session.Stream.
	if err := session.Stream.Send(heartbeat); err != nil {
		log.Error("Failed to send heartbeat to session %s: %v", id, err)
		session.Connected = false
		session.Active = false
		return true
	}

	session.LastActivity = now
	log.Debug("Sent heartbeat to session %s", id)
	return false
}
// pingSession sends a single heartbeat to the session with the given ID.
//
// It returns false when the session does not exist, is already marked
// disconnected or inactive, or when the send fails; on a send failure the
// session is marked disconnected and inactive. It returns true after a
// successful send, having refreshed the session's LastActivity.
//
// The Connected/Active check is made under session.mu, as in
// checkSessionActive, so it cannot race with a concurrent state change.
func (h *heartbeatManager) pingSession(sessionID string) bool {
	session := h.primary.getSession(sessionID)
	if session == nil {
		return false
	}

	session.mu.Lock()
	defer session.mu.Unlock()

	if !session.Connected || !session.Active {
		return false
	}

	// An empty WALStreamResponse serves as the heartbeat message.
	heartbeat := &proto.WALStreamResponse{
		Entries:    []*proto.WALEntry{},
		Compressed: false,
		Codec:      proto.CompressionCodec_NONE,
	}

	if err := session.Stream.Send(heartbeat); err != nil {
		log.Error("Failed to ping session %s: %v", sessionID, err)
		session.Connected = false
		session.Active = false
		return false
	}

	session.LastActivity = time.Now()
	return true
}
// checkSessionActive reports whether the session with the given ID exists, is
// flagged both connected and active, and has shown activity within
// config.Timeout. The session state is read under session.mu.
func (h *heartbeatManager) checkSessionActive(sessionID string) bool {
	session := h.primary.getSession(sessionID)
	if session == nil {
		return false
	}

	session.mu.Lock()
	defer session.mu.Unlock()

	if !session.Connected || !session.Active {
		return false
	}
	return time.Since(session.LastActivity) <= h.config.Timeout
}
// sessionContext returns a context derived from context.Background together
// with its cancel function. A watcher goroutine polls the session every
// config.Interval and cancels the context as soon as checkSessionActive
// reports the session inactive. The watcher exits when the context is
// canceled, whether by itself or by the caller.
func (h *heartbeatManager) sessionContext(sessionID string) (context.Context, context.CancelFunc) {
	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		poll := time.NewTicker(h.config.Interval)
		defer poll.Stop()

		for {
			select {
			case <-poll.C:
				if h.checkSessionActive(sessionID) {
					continue
				}
				// Session is gone or idle past the timeout.
				cancel()
				return
			case <-ctx.Done():
				// Canceled from outside; nothing left to watch.
				return
			}
		}
	}()

	return ctx, cancel
}