kevo/pkg/memtable/skiplist.go
Jeremy Tregunna 6fc3be617d
Some checks failed
Go Tests / Run Tests (1.24.2) (push) Has been cancelled
feat: Initial release of kevo storage engine.
Adds a complete LSM-based storage engine with these features:
- Single-writer based architecture for the storage engine
- WAL for durability, and hey it's configurable
- MemTable with skip list implementation for fast read/writes
- SSTable with block-based structure for on-disk level-based storage
- Background compaction with tiered strategy
- ACID transactions
- Good documentation (I hope)
2025-04-20 14:06:50 -06:00

325 lines
8.0 KiB
Go

package memtable
import (
"bytes"
"math/rand"
"sync"
"sync/atomic"
"time"
"unsafe"
)
// Tuning constants for the skip list.
const (
// MaxHeight is the maximum number of levels a node (and therefore the list)
// can have. It also fixes the length of every node's next-pointer array.
MaxHeight = 12
// BranchingFactor controls tower growth: randomHeight adds each extra level
// with probability 1/BranchingFactor.
BranchingFactor = 4
// DefaultCacheLineSize is a cache-line size in bytes.
// NOTE(review): nothing visible in this file references it — confirm whether
// node alignment actually uses it elsewhere.
DefaultCacheLineSize = 64
)
// ValueType distinguishes live values from deletion markers.
// The zero value is not a valid type; Iterator.ValueType returns it when the
// iterator is not positioned on an entry.
type ValueType uint8
const (
// TypeValue indicates the entry contains a value (numeric value 1).
TypeValue ValueType = iota + 1
// TypeDeletion indicates the entry is a tombstone, i.e. a deletion marker
// (numeric value 2).
TypeDeletion
)
// entry is one versioned record stored in the skip list: a key, its value,
// the kind of record, and the sequence number that orders versions of a key.
type entry struct {
// key is the user key; entries are ordered by it using bytes.Compare.
key []byte
// value is the payload associated with key.
value []byte
// valueType marks the entry as a live value or a tombstone.
valueType ValueType
// seqNum orders entries that share a key: larger numbers sort first
// (see compareWithEntry).
seqNum uint64
}
// newEntry builds an entry from its four components.
//
// The key and value slices are stored as given, not copied, so the caller
// must not modify them afterwards.
func newEntry(key, value []byte, valueType ValueType, seqNum uint64) *entry {
	e := new(entry)
	e.key, e.value = key, value
	e.valueType = valueType
	e.seqNum = seqNum
	return e
}
// size estimates the memory footprint of the entry in bytes: the key and
// value lengths plus a fixed 16-byte allowance for the metadata fields.
func (e *entry) size() int {
	const metadataOverhead = 16
	payload := len(e.key) + len(e.value)
	return payload + metadataOverhead
}
// compare orders this entry's key against key using bytes.Compare.
// The result is negative when e.key < key, zero when they are equal, and
// positive when e.key > key. Sequence numbers are not considered.
func (e *entry) compare(key []byte) int {
	order := bytes.Compare(e.key, key)
	return order
}
// compareWithEntry defines the skip list's total order between two entries.
//
// Keys are compared first with bytes.Compare. For equal keys the sequence
// numbers decide in reverse: the entry with the larger seqNum sorts first
// (returns -1), so the newest version of a key precedes older ones.
// Returns 0 only when both key and seqNum match.
func (e *entry) compareWithEntry(other *entry) int {
	if byKey := bytes.Compare(e.key, other.key); byKey != 0 {
		return byKey
	}
	// Same key: newer (higher seqNum) entries come first.
	switch {
	case e.seqNum > other.seqNum:
		return -1
	case e.seqNum < other.seqNum:
		return 1
	default:
		return 0
	}
}
// node is a single element of the skip list. The head sentinel is a node
// whose entry is nil.
type node struct {
// entry is the record held by this node (nil for the head sentinel).
entry *entry
// height is the number of levels this node was created with.
height int32
// next holds the successor pointer for each level. The array is embedded in
// the node with a fixed MaxHeight slots regardless of height, and its slots
// are read and written atomically through getNext and setNext.
next [MaxHeight]unsafe.Pointer
}
// newNode allocates a node holding e with the given height.
//
// The height is chosen by the caller (Insert passes the result of
// randomHeight; the head sentinel uses MaxHeight). All next pointers start
// out nil.
func newNode(e *entry, height int) *node {
	n := new(node)
	n.entry = e
	n.height = int32(height)
	return n
}
// getNext atomically loads this node's successor at the given level.
// It returns nil when there is no successor. level must be in
// [0, MaxHeight); an out-of-range level panics on the array index.
func (n *node) getNext(level int) *node {
	slot := &n.next[level]
	return (*node)(atomic.LoadPointer(slot))
}
// setNext atomically stores next as this node's successor at the given
// level. Passing nil clears the link. level must be in [0, MaxHeight).
func (n *node) setNext(level int, next *node) {
	slot := &n.next[level]
	atomic.StorePointer(slot, unsafe.Pointer(next))
}
// SkipList is the ordered in-memory index backing the MemTable. Entries are
// kept sorted by key and, within a key, by descending sequence number.
// Traversals read links with atomic loads.
// NOTE(review): Insert publishes links with atomic stores rather than
// compare-and-swap, so it appears to rely on writers being serialized by the
// caller — confirm before using it from multiple writer goroutines.
type SkipList struct {
// head is the sentinel node; it carries no entry and has MaxHeight levels.
head *node
// maxHeight is the tallest level currently in use; accessed atomically.
maxHeight int32
// rnd generates node heights; guarded by rndMtx.
rnd *rand.Rand
// rndMtx serializes access to rnd.
rndMtx sync.Mutex
// size is the running total of entry sizes in bytes; accessed atomically.
size int64
}
// NewSkipList returns an empty skip list.
//
// The list starts with a MaxHeight-tall head sentinel, a current height of
// 1, and a private random source seeded from the wall clock.
func NewSkipList() *SkipList {
	src := rand.NewSource(time.Now().UnixNano())
	return &SkipList{
		head:      newNode(nil, MaxHeight),
		maxHeight: 1,
		rnd:       rand.New(src),
	}
}
// randomHeight draws the height for a new node.
//
// Starting from 1, each additional level is taken with probability
// 1/BranchingFactor, up to MaxHeight. The random source is used under
// rndMtx.
func (s *SkipList) randomHeight() int {
	s.rndMtx.Lock()
	defer s.rndMtx.Unlock()

	h := 1
	for ; h < MaxHeight; h++ {
		if s.rnd.Intn(BranchingFactor) != 0 {
			break
		}
	}
	return h
}
// getCurrentHeight atomically reads the number of levels currently in use
// by the list.
func (s *SkipList) getCurrentHeight() int {
	h := atomic.LoadInt32(&s.maxHeight)
	return int(h)
}
// Insert adds e to the skip list, keeping entries ordered by key and, for
// equal keys, by descending sequence number (see compareWithEntry).
//
// e must be non-nil. Existing entries are never replaced: inserting a key
// that is already present adds another version next to it.
//
// The list height is raised first, retrying until it covers the new node's
// height. Without the retry, a failed compare-and-swap would leave the
// upper predecessor slots nil and the link step would dereference them.
//
// NOTE(review): links are published with atomic stores, not
// compare-and-swap, so two overlapping Inserts can overwrite each other's
// links. Callers presumably serialize writers — confirm.
func (s *SkipList) Insert(e *entry) {
	height := s.randomHeight()
	n := newNode(e, height)

	// Make sure the list is at least as tall as the new node. If another
	// goroutine changed maxHeight in between, re-read and try again.
	listHeight := s.getCurrentHeight()
	for height > listHeight {
		if atomic.CompareAndSwapInt32(&s.maxHeight, int32(listHeight), int32(height)) {
			listHeight = height
			break
		}
		listHeight = s.getCurrentHeight()
	}

	// Walk down from the top level, recording at each level the last node
	// that sorts strictly before e. listHeight >= height here, so every
	// slot the link step needs is filled.
	var prev [MaxHeight]*node
	cur := s.head
	for level := listHeight - 1; level >= 0; level-- {
		for next := cur.getNext(level); next != nil && next.entry.compareWithEntry(e) < 0; next = cur.getNext(level) {
			cur = next
		}
		prev[level] = cur
	}

	// Link bottom-up. The new node's forward pointer is set before the
	// predecessor is redirected, so a reader never sees a half-built node.
	for level := 0; level < height; level++ {
		n.setNext(level, prev[level].getNext(level))
		prev[level].setNext(level, n)
	}

	// Update approximate size.
	atomic.AddInt64(&s.size, int64(e.size()))
}
// Find returns the newest entry stored under key, or nil if the key is not
// present. Tombstones are returned like any other entry; the caller checks
// valueType.
//
// Insert keeps entries sorted by key and, within a key, by descending
// sequence number, so the first node whose key equals the target is the
// newest version. The search descends the levels to the last node with a
// smaller key and then inspects that node's level-0 successor. This avoids
// rescanning level 0 from the head, which made every lookup linear in the
// number of entries before the key.
func (s *SkipList) Find(key []byte) *entry {
	cur := s.head
	for level := s.getCurrentHeight() - 1; level >= 0; level-- {
		// Advance while the next node's key is strictly smaller than key.
		for next := cur.getNext(level); next != nil && next.entry.compare(key) < 0; next = cur.getNext(level) {
			cur = next
		}
	}

	// cur is the last node with key < target (or the head), so its level-0
	// successor is the first node with key >= target.
	if cand := cur.getNext(0); cand != nil && cand.entry.compare(key) == 0 {
		return cand.entry
	}
	return nil
}
// ApproximateSize reports the running total, in bytes, of entry.size() over
// every entry inserted so far. The counter is read atomically.
func (s *SkipList) ApproximateSize() int64 {
	total := atomic.LoadInt64(&s.size)
	return total
}
// Iterator walks the skip list's entries in order along level 0.
// A newly created Iterator sits on the head sentinel and is not Valid until
// it is positioned with SeekToFirst, Seek, or Next.
type Iterator struct {
// list is the skip list being iterated.
list *SkipList
// current is the node the iterator is positioned on; nil once the end of
// the list has been passed.
current *node
}
// NewIterator returns an Iterator over s positioned on the head sentinel.
// The iterator is not Valid yet; call SeekToFirst, Seek, or Next to move it
// onto an entry.
func (s *SkipList) NewIterator() *Iterator {
	it := new(Iterator)
	it.list = s
	it.current = s.head
	return it
}
// Valid reports whether the iterator is positioned on a real entry, i.e.
// neither past the end (nil) nor on the head sentinel.
func (it *Iterator) Valid() bool {
	if it.current == nil {
		return false
	}
	return it.current != it.list.head
}
// Next moves the iterator to the following node on level 0. Once the
// iterator has run off the end (current is nil) further calls do nothing.
func (it *Iterator) Next() {
	if cur := it.current; cur != nil {
		it.current = cur.getNext(0)
	}
}
// SeekToFirst positions the iterator on the first entry of the list, i.e.
// the head sentinel's level-0 successor. On an empty list the iterator
// becomes invalid.
func (it *Iterator) SeekToFirst() {
	head := it.list.head
	it.current = head.getNext(0)
}
// Seek positions the iterator on the first entry whose key is >= key.
// If every key in the list is smaller, the iterator becomes invalid.
//
// The search descends from the list's current top level, at each level
// advancing while the next node's key is strictly smaller than key. The
// level-0 successor of the node reached is the target position.
func (it *Iterator) Seek(key []byte) {
	pos := it.list.head
	for level := it.list.getCurrentHeight() - 1; level >= 0; level-- {
		for {
			next := pos.getNext(level)
			if next == nil || next.entry.compare(key) >= 0 {
				break
			}
			pos = next
		}
	}
	it.current = pos.getNext(0)
}
// Key returns the key of the entry under the iterator, or nil when the
// iterator is not Valid. The returned slice is the entry's own storage,
// not a copy.
func (it *Iterator) Key() []byte {
	if it.Valid() {
		return it.current.entry.key
	}
	return nil
}
// Value returns the value of the entry under the iterator, or nil when the
// iterator is not Valid.
//
// Tombstones are not filtered out: the entry's stored value is returned
// whatever its type, so callers such as compaction can still see deletion
// markers during iteration.
// NOTE(review): an earlier comment said tombstones yield nil; that holds
// only if tombstones are stored with a nil value — confirm at the call site.
func (it *Iterator) Value() []byte {
	if it.Valid() {
		return it.current.entry.value
	}
	return nil
}
// ValueType returns the type of the entry under the iterator (TypeValue or
// TypeDeletion). When the iterator is not Valid it returns 0, which is not
// a valid type.
func (it *Iterator) ValueType() ValueType {
	if it.Valid() {
		return it.current.entry.valueType
	}
	return 0 // Invalid type
}
// IsTombstone reports whether the iterator is on an entry whose type is
// TypeDeletion. It returns false when the iterator is not Valid.
func (it *Iterator) IsTombstone() bool {
	if !it.Valid() {
		return false
	}
	return it.current.entry.valueType == TypeDeletion
}
// Entry returns the entry under the iterator, or nil when the iterator is
// not Valid. The pointer refers to the entry stored in the list, not a
// copy.
func (it *Iterator) Entry() *entry {
	if it.Valid() {
		return it.current.entry
	}
	return nil
}