Adds a complete LSM-based storage engine with these features:
- Single-writer architecture for the storage engine
- Configurable WAL for durability
- MemTable backed by a skip list for fast reads and writes
- Block-based SSTables for on-disk, level-based storage
- Background compaction with a tiered strategy
- ACID transactions
- Documentation throughout (I hope it's good)

A rough usage sketch of the MemTable skip list follows the listing below.
package memtable

import (
	"bytes"
	"math/rand"
	"sync"
	"sync/atomic"
	"time"
	"unsafe"
)

const (
	// MaxHeight is the maximum height of the skip list
	MaxHeight = 12

	// BranchingFactor determines the probability of increasing the height
	BranchingFactor = 4

	// DefaultCacheLineSize aligns nodes to cache lines for better performance
	DefaultCacheLineSize = 64
)

// ValueType represents the type of a key-value entry
type ValueType uint8

const (
	// TypeValue indicates the entry contains a value
	TypeValue ValueType = iota + 1

	// TypeDeletion indicates the entry is a tombstone (deletion marker)
	TypeDeletion
)

// entry represents a key-value pair with additional metadata
type entry struct {
	key       []byte
	value     []byte
	valueType ValueType
	seqNum    uint64
}

// newEntry creates a new entry
func newEntry(key, value []byte, valueType ValueType, seqNum uint64) *entry {
	return &entry{
		key:       key,
		value:     value,
		valueType: valueType,
		seqNum:    seqNum,
	}
}

// size returns the approximate size of the entry in memory
func (e *entry) size() int {
	return len(e.key) + len(e.value) + 16 // adding overhead for metadata
}

// compare compares this entry's key with another key
// Returns: negative if e.key < key, 0 if equal, positive if e.key > key
func (e *entry) compare(key []byte) int {
	return bytes.Compare(e.key, key)
}

// compareWithEntry compares this entry with another entry
// First by key, then by sequence number (in reverse order to prioritize newer entries)
func (e *entry) compareWithEntry(other *entry) int {
	cmp := bytes.Compare(e.key, other.key)
	if cmp == 0 {
		// If keys are equal, compare sequence numbers in reverse order (newer first)
		if e.seqNum > other.seqNum {
			return -1
		} else if e.seqNum < other.seqNum {
			return 1
		}
		return 0
	}
	return cmp
}

// node represents a node in the skip list
type node struct {
	entry  *entry
	height int32
	// next contains pointers to the next nodes at each level
	// This is allocated as a single block for cache efficiency
	next [MaxHeight]unsafe.Pointer
}

// newNode creates a new node with the given height
func newNode(e *entry, height int) *node {
	return &node{
		entry:  e,
		height: int32(height),
	}
}

// getNext atomically loads the next node at the given level
func (n *node) getNext(level int) *node {
	return (*node)(atomic.LoadPointer(&n.next[level]))
}

// setNext atomically sets the next node at the given level
func (n *node) setNext(level int, next *node) {
	atomic.StorePointer(&n.next[level], unsafe.Pointer(next))
}

// SkipList is a concurrent skip list implementation for the MemTable
type SkipList struct {
	head      *node
	maxHeight int32
	rnd       *rand.Rand
	rndMtx    sync.Mutex
	size      int64
}

// NewSkipList creates a new skip list
func NewSkipList() *SkipList {
	seed := time.Now().UnixNano()
	list := &SkipList{
		head:      newNode(nil, MaxHeight),
		maxHeight: 1,
		rnd:       rand.New(rand.NewSource(seed)),
	}
	return list
}

// randomHeight generates a random height for a new node
func (s *SkipList) randomHeight() int {
	s.rndMtx.Lock()
	defer s.rndMtx.Unlock()

	height := 1
	for height < MaxHeight && s.rnd.Intn(BranchingFactor) == 0 {
		height++
	}
	return height
}

// getCurrentHeight returns the current maximum height of the skip list
func (s *SkipList) getCurrentHeight() int {
	return int(atomic.LoadInt32(&s.maxHeight))
}

// Insert adds a new entry to the skip list
func (s *SkipList) Insert(e *entry) {
	height := s.randomHeight()
	prev := [MaxHeight]*node{}

	// Try to increase the height of the list
	currHeight := s.getCurrentHeight()
	if height > currHeight {
		// Attempt to increase the height
		if atomic.CompareAndSwapInt32(&s.maxHeight, int32(currHeight), int32(height)) {
			currHeight = height
		} else {
			// Another writer raised the height first; cap this node at the
			// height we are about to search so prev[level] is always set below
			height = currHeight
		}
	}

	node := newNode(e, height)

	// Find where to insert at each level
	current := s.head
	for level := currHeight - 1; level >= 0; level-- {
		// Find the insertion point at this level
		for next := current.getNext(level); next != nil; next = current.getNext(level) {
			if next.entry.compareWithEntry(e) >= 0 {
				break
			}
			current = next
		}
		prev[level] = current
	}

	// Insert the node at each level
	for level := 0; level < height; level++ {
		node.setNext(level, prev[level].getNext(level))
		prev[level].setNext(level, node)
	}

	// Update approximate size
	atomic.AddInt64(&s.size, int64(e.size()))
}

// Find looks for an entry with the specified key
// If multiple entries have the same key, the most recent one is returned
func (s *SkipList) Find(key []byte) *entry {
	var result *entry
	current := s.head
	height := s.getCurrentHeight()

	// Start from the highest level for efficient search
	for level := height - 1; level >= 0; level-- {
		// Scan forward until we find a key greater than or equal to the target
		for next := current.getNext(level); next != nil; next = current.getNext(level) {
			cmp := next.entry.compare(key)
			if cmp > 0 {
				// Key at next is greater than target, go down a level
				break
			} else if cmp == 0 {
				// Found a match, check if it's newer than our current result
				if result == nil || next.entry.seqNum > result.seqNum {
					result = next.entry
				}
				// Continue at this level to see if there are more entries with same key
				current = next
			} else {
				// Key at next is less than target, move forward
				current = next
			}
		}
	}

	// Do one more sweep at level 0 from the head: the multi-level scan above
	// can step past a newer version that only exists in the lower levels, so
	// this pass guarantees the newest entry for the key is returned
	current = s.head
	for next := current.getNext(0); next != nil; next = next.getNext(0) {
		cmp := next.entry.compare(key)
		if cmp > 0 {
			// Past the key
			break
		} else if cmp == 0 {
			// Found a match, update result if it's newer
			if result == nil || next.entry.seqNum > result.seqNum {
				result = next.entry
			}
		}
		current = next
	}

	return result
}

// ApproximateSize returns the approximate size of the skip list in bytes
func (s *SkipList) ApproximateSize() int64 {
	return atomic.LoadInt64(&s.size)
}

// Iterator provides sequential access to the skip list entries
type Iterator struct {
	list    *SkipList
	current *node
}

// NewIterator creates a new Iterator for the skip list
func (s *SkipList) NewIterator() *Iterator {
	return &Iterator{
		list:    s,
		current: s.head,
	}
}

// Valid returns true if the iterator is positioned at a valid entry
func (it *Iterator) Valid() bool {
	return it.current != nil && it.current != it.list.head
}

// Next advances the iterator to the next entry
func (it *Iterator) Next() {
	if it.current == nil {
		return
	}
	it.current = it.current.getNext(0)
}

// SeekToFirst positions the iterator at the first entry
func (it *Iterator) SeekToFirst() {
	it.current = it.list.head.getNext(0)
}

// Seek positions the iterator at the first entry with a key >= target
func (it *Iterator) Seek(key []byte) {
	// Start from head
	current := it.list.head
	height := it.list.getCurrentHeight()

	// Search algorithm similar to Find
	for level := height - 1; level >= 0; level-- {
		for next := current.getNext(level); next != nil; next = current.getNext(level) {
			if next.entry.compare(key) >= 0 {
				break
			}
			current = next
		}
	}

	// Move to the next node, which should be >= target
	it.current = current.getNext(0)
}

// Key returns the key of the current entry
func (it *Iterator) Key() []byte {
	if !it.Valid() {
		return nil
	}
	return it.current.entry.key
}

// Value returns the value of the current entry
func (it *Iterator) Value() []byte {
	if !it.Valid() {
		return nil
	}

	// For tombstones (deletion markers), we still return nil
	// but we preserve them during iteration so compaction can see them
	return it.current.entry.value
}

// ValueType returns the type of the current entry (TypeValue or TypeDeletion)
func (it *Iterator) ValueType() ValueType {
	if !it.Valid() {
		return 0 // Invalid type
	}
	return it.current.entry.valueType
}

// IsTombstone returns true if the current entry is a deletion marker
func (it *Iterator) IsTombstone() bool {
	return it.Valid() && it.current.entry.valueType == TypeDeletion
}

// Entry returns the current entry
func (it *Iterator) Entry() *entry {
	if !it.Valid() {
		return nil
	}
	return it.current.entry
}
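
Since entry, newEntry, and Insert are unexported, the skip list is driven from inside the memtable package (its tests, for example). As a rough sketch of the API above — exampleSkipListUsage is a hypothetical, illustrative helper, not part of this change — writing two versions of a key plus a tombstone and reading back the newest version might look like this:

package memtable

import "fmt"

// exampleSkipListUsage is a hypothetical helper sketching how the skip list
// in this file can be exercised from inside the package.
func exampleSkipListUsage() {
	list := NewSkipList()

	// Two versions of "alpha": the higher sequence number is the newer write.
	list.Insert(newEntry([]byte("alpha"), []byte("v1"), TypeValue, 1))
	list.Insert(newEntry([]byte("alpha"), []byte("v2"), TypeValue, 2))
	// A tombstone for "beta" recorded at sequence 3.
	list.Insert(newEntry([]byte("beta"), nil, TypeDeletion, 3))

	// Find returns the newest version of a key.
	if e := list.Find([]byte("alpha")); e != nil {
		fmt.Printf("alpha -> %s (seq %d)\n", e.value, e.seqNum) // alpha -> v2 (seq 2)
	}

	// The iterator walks entries in key order (newest version of a key first)
	// and surfaces tombstones so flush and compaction can carry them forward.
	it := list.NewIterator()
	for it.SeekToFirst(); it.Valid(); it.Next() {
		fmt.Printf("%s tombstone=%v\n", it.Key(), it.IsTombstone())
	}
}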