Adds a complete LSM-based storage engine with these features:
- Single-writer architecture for the storage engine
- Configurable WAL for durability
- MemTable backed by a skip list for fast reads and writes
- Block-based SSTables for on-disk, level-based storage
- Background compaction with a tiered strategy
- ACID transactions
- Documentation throughout (I hope it's good)

A rough usage sketch of the MemTable skip list follows the listing below.
package memtable

import (
	"bytes"
	"math/rand"
	"sync"
	"sync/atomic"
	"time"
	"unsafe"
)

const (
	// MaxHeight is the maximum height of the skip list
	MaxHeight = 12

	// BranchingFactor determines the probability of increasing the height
	BranchingFactor = 4

	// DefaultCacheLineSize aligns nodes to cache lines for better performance
	DefaultCacheLineSize = 64
)

// ValueType represents the type of a key-value entry
type ValueType uint8

const (
	// TypeValue indicates the entry contains a value
	TypeValue ValueType = iota + 1

	// TypeDeletion indicates the entry is a tombstone (deletion marker)
	TypeDeletion
)

// entry represents a key-value pair with additional metadata
type entry struct {
	key       []byte
	value     []byte
	valueType ValueType
	seqNum    uint64
}

// newEntry creates a new entry
func newEntry(key, value []byte, valueType ValueType, seqNum uint64) *entry {
	return &entry{
		key:       key,
		value:     value,
		valueType: valueType,
		seqNum:    seqNum,
	}
}

// size returns the approximate size of the entry in memory
func (e *entry) size() int {
	return len(e.key) + len(e.value) + 16 // adding overhead for metadata
}

// compare compares this entry's key with another key
// Returns: negative if e.key < key, 0 if equal, positive if e.key > key
func (e *entry) compare(key []byte) int {
	return bytes.Compare(e.key, key)
}

// compareWithEntry compares this entry with another entry
// First by key, then by sequence number (in reverse order to prioritize newer entries)
func (e *entry) compareWithEntry(other *entry) int {
	cmp := bytes.Compare(e.key, other.key)
	if cmp == 0 {
		// If keys are equal, compare sequence numbers in reverse order (newer first)
		if e.seqNum > other.seqNum {
			return -1
		} else if e.seqNum < other.seqNum {
			return 1
		}
		return 0
	}
	return cmp
}

// node represents a node in the skip list
type node struct {
	entry  *entry
	height int32
	// next contains pointers to the next nodes at each level
	// This is allocated as a single block for cache efficiency
	next [MaxHeight]unsafe.Pointer
}

// newNode creates a new node with the given height
func newNode(e *entry, height int) *node {
	return &node{
		entry:  e,
		height: int32(height),
	}
}

// getNext atomically loads the next node at the given level
func (n *node) getNext(level int) *node {
	return (*node)(atomic.LoadPointer(&n.next[level]))
}

// setNext atomically sets the next node at the given level
func (n *node) setNext(level int, next *node) {
	atomic.StorePointer(&n.next[level], unsafe.Pointer(next))
}

// SkipList is a concurrent skip list implementation for the MemTable
type SkipList struct {
	head      *node
	maxHeight int32
	rnd       *rand.Rand
	rndMtx    sync.Mutex
	size      int64
}

// NewSkipList creates a new skip list
func NewSkipList() *SkipList {
	seed := time.Now().UnixNano()
	list := &SkipList{
		head:      newNode(nil, MaxHeight),
		maxHeight: 1,
		rnd:       rand.New(rand.NewSource(seed)),
	}
	return list
}

// randomHeight generates a random height for a new node
func (s *SkipList) randomHeight() int {
	s.rndMtx.Lock()
	defer s.rndMtx.Unlock()

	height := 1
	for height < MaxHeight && s.rnd.Intn(BranchingFactor) == 0 {
		height++
	}
	return height
}

// getCurrentHeight returns the current maximum height of the skip list
func (s *SkipList) getCurrentHeight() int {
	return int(atomic.LoadInt32(&s.maxHeight))
}

// Insert adds a new entry to the skip list
func (s *SkipList) Insert(e *entry) {
	height := s.randomHeight()
	prev := [MaxHeight]*node{}

	// Try to increase the height of the list
	currHeight := s.getCurrentHeight()
	if height > currHeight {
		// Attempt to increase the height
		if atomic.CompareAndSwapInt32(&s.maxHeight, int32(currHeight), int32(height)) {
			currHeight = height
		} else {
			// Another writer raised the height first; cap this node at the
			// height we are about to search so prev[level] is always set below
			height = currHeight
		}
	}

	node := newNode(e, height)

	// Find where to insert at each level
	current := s.head
	for level := currHeight - 1; level >= 0; level-- {
		// Find the insertion point at this level
		for next := current.getNext(level); next != nil; next = current.getNext(level) {
			if next.entry.compareWithEntry(e) >= 0 {
				break
			}
			current = next
		}
		prev[level] = current
	}

	// Insert the node at each level
	for level := 0; level < height; level++ {
		node.setNext(level, prev[level].getNext(level))
		prev[level].setNext(level, node)
	}

	// Update approximate size
	atomic.AddInt64(&s.size, int64(e.size()))
}

// Find looks for an entry with the specified key
// If multiple entries have the same key, the most recent one is returned
func (s *SkipList) Find(key []byte) *entry {
	var result *entry
	current := s.head
	height := s.getCurrentHeight()

	// Start from the highest level for efficient search
	for level := height - 1; level >= 0; level-- {
		// Scan forward until we find a key greater than or equal to the target
		for next := current.getNext(level); next != nil; next = current.getNext(level) {
			cmp := next.entry.compare(key)
			if cmp > 0 {
				// Key at next is greater than target, go down a level
				break
			} else if cmp == 0 {
				// Found a match, check if it's newer than our current result
				if result == nil || next.entry.seqNum > result.seqNum {
					result = next.entry
				}
				// Continue at this level to see if there are more entries with same key
				current = next
			} else {
				// Key at next is less than target, move forward
				current = next
			}
		}
	}

	// Do one more sweep at level 0 from the head: the multi-level scan above
	// can step past a newer version that only exists in the lower levels, so
	// this pass guarantees the newest entry for the key is returned
	current = s.head
	for next := current.getNext(0); next != nil; next = next.getNext(0) {
		cmp := next.entry.compare(key)
		if cmp > 0 {
			// Past the key
			break
		} else if cmp == 0 {
			// Found a match, update result if it's newer
			if result == nil || next.entry.seqNum > result.seqNum {
				result = next.entry
			}
		}
		current = next
	}

	return result
}

// ApproximateSize returns the approximate size of the skip list in bytes
func (s *SkipList) ApproximateSize() int64 {
	return atomic.LoadInt64(&s.size)
}

// Iterator provides sequential access to the skip list entries
type Iterator struct {
	list    *SkipList
	current *node
}

// NewIterator creates a new Iterator for the skip list
func (s *SkipList) NewIterator() *Iterator {
	return &Iterator{
		list:    s,
		current: s.head,
	}
}

// Valid returns true if the iterator is positioned at a valid entry
func (it *Iterator) Valid() bool {
	return it.current != nil && it.current != it.list.head
}

// Next advances the iterator to the next entry
func (it *Iterator) Next() {
	if it.current == nil {
		return
	}
	it.current = it.current.getNext(0)
}

// SeekToFirst positions the iterator at the first entry
func (it *Iterator) SeekToFirst() {
	it.current = it.list.head.getNext(0)
}

// Seek positions the iterator at the first entry with a key >= target
func (it *Iterator) Seek(key []byte) {
	// Start from head
	current := it.list.head
	height := it.list.getCurrentHeight()

	// Search algorithm similar to Find
	for level := height - 1; level >= 0; level-- {
		for next := current.getNext(level); next != nil; next = current.getNext(level) {
			if next.entry.compare(key) >= 0 {
				break
			}
			current = next
		}
	}

	// Move to the next node, which should be >= target
	it.current = current.getNext(0)
}

// Key returns the key of the current entry
func (it *Iterator) Key() []byte {
	if !it.Valid() {
		return nil
	}
	return it.current.entry.key
}

// Value returns the value of the current entry
func (it *Iterator) Value() []byte {
	if !it.Valid() {
		return nil
	}

	// For tombstones (deletion markers), we still return nil
	// but we preserve them during iteration so compaction can see them
	return it.current.entry.value
}

// ValueType returns the type of the current entry (TypeValue or TypeDeletion)
func (it *Iterator) ValueType() ValueType {
	if !it.Valid() {
		return 0 // Invalid type
	}
	return it.current.entry.valueType
}

// IsTombstone returns true if the current entry is a deletion marker
func (it *Iterator) IsTombstone() bool {
	return it.Valid() && it.current.entry.valueType == TypeDeletion
}

// Entry returns the current entry
func (it *Iterator) Entry() *entry {
	if !it.Valid() {
		return nil
	}
	return it.current.entry
}
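
Since entry, newEntry, and Insert are unexported, the skip list is driven from inside the memtable package (its tests, for example). As a rough sketch of the API above — exampleSkipListUsage is a hypothetical, illustrative helper, not part of this change — writing two versions of a key plus a tombstone and reading back the newest version might look like this:

package memtable

import "fmt"

// exampleSkipListUsage is a hypothetical helper sketching how the skip list
// in this file can be exercised from inside the package.
func exampleSkipListUsage() {
	list := NewSkipList()

	// Two versions of "alpha": the higher sequence number is the newer write.
	list.Insert(newEntry([]byte("alpha"), []byte("v1"), TypeValue, 1))
	list.Insert(newEntry([]byte("alpha"), []byte("v2"), TypeValue, 2))
	// A tombstone for "beta" recorded at sequence 3.
	list.Insert(newEntry([]byte("beta"), nil, TypeDeletion, 3))

	// Find returns the newest version of a key.
	if e := list.Find([]byte("alpha")); e != nil {
		fmt.Printf("alpha -> %s (seq %d)\n", e.value, e.seqNum) // alpha -> v2 (seq 2)
	}

	// The iterator walks entries in key order (newest version of a key first)
	// and surfaces tombstones so flush and compaction can carry them forward.
	it := list.NewIterator()
	for it.SeekToFirst(); it.Valid(); it.Next() {
		fmt.Printf("%s tombstone=%v\n", it.Key(), it.IsTombstone())
	}
}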