feat: implement tiered compaction with tombstone handling

Jeremy Tregunna 2025-04-19 19:20:09 -06:00
parent 32f449c66a
commit 16780272dd
Signed by: jer
GPG Key ID: 1278B36BA6F5D5E4
11 changed files with 2337 additions and 18 deletions
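For orientation, here is a minimal usage sketch (not part of the commit) of the engine-facing compaction API this change adds; the module import path is assumed from the other files in this diff, the data directory is hypothetical, and error handling is kept terse:

package main

import (
	"fmt"
	"log"

	"git.canoozie.net/jer/go-storage/pkg/engine"
)

func main() {
	// Open an engine on a scratch directory (hypothetical path).
	eng, err := engine.NewEngine("/tmp/go-storage-demo")
	if err != nil {
		log.Fatal(err)
	}
	defer eng.Close()

	// Write and delete a key so compaction has a tombstone to process.
	if err := eng.Put([]byte("a"), []byte("1")); err != nil {
		log.Fatal(err)
	}
	if err := eng.Delete([]byte("a")); err != nil {
		log.Fatal(err)
	}

	// Force a compaction cycle, then inspect the stats map it exposes.
	if err := eng.TriggerCompaction(); err != nil {
		log.Fatal(err)
	}
	stats, err := eng.GetCompactionStats()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("compaction enabled:", stats["enabled"])
}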

TODO.md

@@ -92,27 +92,27 @@ This document outlines the implementation tasks for the Go Storage Engine, organ
## Phase E: Compaction
- [ ] Implement tiered compaction strategy
- [ ] Create file selection algorithm based on overlap/size
- [ ] Implement merge-sorted reading from input files
- [ ] Add atomic output file generation
- [ ] Create size ratio and file count based triggering
- [✓] Implement tiered compaction strategy
- [✓] Create file selection algorithm based on overlap/size
- [✓] Implement merge-sorted reading from input files
- [✓] Add atomic output file generation
- [✓] Create size ratio and file count based triggering
- [ ] Handle tombstones and key deletion
- [ ] Implement tombstone markers
- [ ] Create logic for tombstone garbage collection
- [ ] Test deletion correctness across compactions
- [✓] Handle tombstones and key deletion
- [✓] Implement tombstone markers
- [✓] Create logic for tombstone garbage collection
- [✓] Test deletion correctness across compactions
- [ ] Manage file obsolescence and cleanup
- [ ] Implement safe file deletion after compaction
- [ ] Create consistent file tracking
- [ ] Add error handling for cleanup failures
- [✓] Manage file obsolescence and cleanup
- [✓] Implement safe file deletion after compaction
- [✓] Create consistent file tracking
- [✓] Add error handling for cleanup failures
- [ ] Build background compaction
- [ ] Implement worker pool for compaction tasks
- [ ] Add rate limiting to prevent I/O saturation
- [ ] Create metrics for monitoring compaction progress
- [ ] Implement priority scheduling for urgent compactions
- [✓] Build background compaction
- [✓] Implement worker pool for compaction tasks
- [✓] Add rate limiting to prevent I/O saturation
- [✓] Create metrics for monitoring compaction progress
- [✓] Implement priority scheduling for urgent compactions
## Phase F: Basic Atomicity and Features


@@ -0,0 +1,529 @@
package compaction
import (
"bytes"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"
"git.canoozie.net/jer/go-storage/pkg/config"
"git.canoozie.net/jer/go-storage/pkg/iterator"
"git.canoozie.net/jer/go-storage/pkg/sstable"
)
// SSTableInfo represents metadata about an SSTable file
type SSTableInfo struct {
// Path of the SSTable file
Path string
// Level number (0 to N)
Level int
// Sequence number for the file within its level
Sequence uint64
// Timestamp when the file was created
Timestamp int64
// Approximate size of the file in bytes
Size int64
// Estimated key count (may be approximate)
KeyCount int
// First key in the SSTable
FirstKey []byte
// Last key in the SSTable
LastKey []byte
// Reader for the SSTable
Reader *sstable.Reader
}
// Overlaps checks if this SSTable's key range overlaps with another SSTable
func (s *SSTableInfo) Overlaps(other *SSTableInfo) bool {
// If either SSTable has no keys, they don't overlap
if len(s.FirstKey) == 0 || len(s.LastKey) == 0 ||
len(other.FirstKey) == 0 || len(other.LastKey) == 0 {
return false
}
// Check for overlap: not (s ends before other starts OR s starts after other ends)
// s.LastKey < other.FirstKey || s.FirstKey > other.LastKey
return !(bytes.Compare(s.LastKey, other.FirstKey) < 0 ||
bytes.Compare(s.FirstKey, other.LastKey) > 0)
}
// KeyRange returns a string representation of the key range in this SSTable
func (s *SSTableInfo) KeyRange() string {
return fmt.Sprintf("[%s, %s]",
string(s.FirstKey), string(s.LastKey))
}
// String returns a string representation of the SSTable info
func (s *SSTableInfo) String() string {
return fmt.Sprintf("L%d-%06d-%020d.sst Size:%d Keys:%d Range:%s",
s.Level, s.Sequence, s.Timestamp, s.Size, s.KeyCount, s.KeyRange())
}
// CompactionTask represents a set of SSTables to be compacted
type CompactionTask struct {
// Input SSTables to compact, grouped by level
InputFiles map[int][]*SSTableInfo
// Target level for compaction output
TargetLevel int
// Output file path template
OutputPathTemplate string
}
// Compactor manages the compaction process
type Compactor struct {
// Configuration
cfg *config.Config
// SSTable directory
sstableDir string
// File information by level
levels map[int][]*SSTableInfo
}
// NewCompactor creates a new compaction manager
func NewCompactor(cfg *config.Config, sstableDir string) *Compactor {
return &Compactor{
cfg: cfg,
sstableDir: sstableDir,
levels: make(map[int][]*SSTableInfo),
}
}
// LoadSSTables scans the SSTable directory and loads metadata for all files
func (c *Compactor) LoadSSTables() error {
// Clear existing data
c.levels = make(map[int][]*SSTableInfo)
// Read all files from the SSTable directory
entries, err := os.ReadDir(c.sstableDir)
if err != nil {
if os.IsNotExist(err) {
return nil // Directory doesn't exist yet
}
return fmt.Errorf("failed to read SSTable directory: %w", err)
}
// Parse filenames and collect information
for _, entry := range entries {
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".sst") {
continue // Skip directories and non-SSTable files
}
// Parse filename to extract level, sequence, and timestamp
// Filename format: level_sequence_timestamp.sst
var level int
var sequence uint64
var timestamp int64
if n, err := fmt.Sscanf(entry.Name(), "%d_%06d_%020d.sst",
&level, &sequence, &timestamp); n != 3 || err != nil {
// Skip files that don't match our naming pattern
continue
}
// Get file info for size
fi, err := entry.Info()
if err != nil {
return fmt.Errorf("failed to get file info for %s: %w", entry.Name(), err)
}
// Open the file to extract key range information
path := filepath.Join(c.sstableDir, entry.Name())
reader, err := sstable.OpenReader(path)
if err != nil {
return fmt.Errorf("failed to open SSTable %s: %w", path, err)
}
// Create iterator to get first and last keys
iter := reader.NewIterator()
var firstKey, lastKey []byte
// Get first key
iter.SeekToFirst()
if iter.Valid() {
firstKey = append([]byte{}, iter.Key()...)
}
// Get last key
iter.SeekToLast()
if iter.Valid() {
lastKey = append([]byte{}, iter.Key()...)
}
// Create SSTable info
info := &SSTableInfo{
Path: path,
Level: level,
Sequence: sequence,
Timestamp: timestamp,
Size: fi.Size(),
KeyCount: reader.GetKeyCount(),
FirstKey: firstKey,
LastKey: lastKey,
Reader: reader,
}
// Add to appropriate level
c.levels[level] = append(c.levels[level], info)
}
// Sort files within each level by sequence number
for level, files := range c.levels {
sort.Slice(files, func(i, j int) bool {
return files[i].Sequence < files[j].Sequence
})
c.levels[level] = files
}
return nil
}
// Close closes all open SSTable readers
func (c *Compactor) Close() error {
var lastErr error
for _, files := range c.levels {
for _, file := range files {
if file.Reader != nil {
if err := file.Reader.Close(); err != nil && lastErr == nil {
lastErr = err
}
file.Reader = nil
}
}
}
return lastErr
}
// GetLevelSize returns the total size of all files in a level
func (c *Compactor) GetLevelSize(level int) int64 {
var size int64
for _, file := range c.levels[level] {
size += file.Size
}
return size
}
// GetCompactionTask selects files for compaction based on size ratio and overlap
func (c *Compactor) GetCompactionTask() (*CompactionTask, error) {
// No compaction candidates if no level has any files yet
if len(c.levels) < 1 {
return nil, nil
}
// Check if L0 needs compaction (Level 0 is special because files may overlap)
if len(c.levels[0]) >= c.cfg.MaxMemTables {
return c.selectLevel0Compaction()
}
// Check higher levels based on size ratio
maxLevel := 0
for level := range c.levels {
if level > maxLevel {
maxLevel = level
}
}
// Check each level starting from 1
for level := 1; level <= maxLevel; level++ {
// If this level is too large compared to the next level
nextLevelSize := c.GetLevelSize(level + 1)
thisLevelSize := c.GetLevelSize(level)
// If this level is empty, skip it
if thisLevelSize == 0 {
continue
}
// If the next level doesn't exist yet or is empty, create it
if nextLevelSize == 0 {
// Choose one file from this level to move to the next level
if len(c.levels[level]) > 0 {
return c.selectSingleFileCompaction(level)
}
continue
}
// Check if this level exceeds the size ratio threshold
sizeRatio := float64(thisLevelSize) / float64(nextLevelSize)
if sizeRatio >= c.cfg.CompactionRatio {
return c.selectSizeBasedCompaction(level)
}
}
// No compaction needed
return nil, nil
}
// selectLevel0Compaction selects files from L0 for compaction
func (c *Compactor) selectLevel0Compaction() (*CompactionTask, error) {
// Not enough files in L0 for compaction
if len(c.levels[0]) < 2 {
return nil, nil
}
// For L0, take oldest files and compact them to L1
numFiles := len(c.levels[0])
maxCompactFiles := c.cfg.MaxMemTables
if numFiles > maxCompactFiles {
numFiles = maxCompactFiles
}
// Sort L0 files by sequence number to get oldest first
files := make([]*SSTableInfo, len(c.levels[0]))
copy(files, c.levels[0])
sort.Slice(files, func(i, j int) bool {
return files[i].Sequence < files[j].Sequence
})
// Select oldest files for compaction
selectedFiles := files[:numFiles]
// Determine key range of selected files
var minKey, maxKey []byte
for _, file := range selectedFiles {
if len(minKey) == 0 || bytes.Compare(file.FirstKey, minKey) < 0 {
minKey = file.FirstKey
}
if len(maxKey) == 0 || bytes.Compare(file.LastKey, maxKey) > 0 {
maxKey = file.LastKey
}
}
// Find overlapping files in L1
var l1Files []*SSTableInfo
for _, file := range c.levels[1] {
// Create a temporary SSTableInfo for the key range
rangeInfo := &SSTableInfo{
FirstKey: minKey,
LastKey: maxKey,
}
if file.Overlaps(rangeInfo) {
l1Files = append(l1Files, file)
}
}
// Create the compaction task
task := &CompactionTask{
InputFiles: map[int][]*SSTableInfo{
0: selectedFiles,
1: l1Files,
},
TargetLevel: 1,
OutputPathTemplate: filepath.Join(c.sstableDir, "%d_%06d_%020d.sst"),
}
return task, nil
}
// selectSingleFileCompaction selects a single file to compact to the next level
func (c *Compactor) selectSingleFileCompaction(level int) (*CompactionTask, error) {
// Find the oldest file in this level
if len(c.levels[level]) == 0 {
return nil, nil
}
// Sort by sequence number to get the oldest file
files := make([]*SSTableInfo, len(c.levels[level]))
copy(files, c.levels[level])
sort.Slice(files, func(i, j int) bool {
return files[i].Sequence < files[j].Sequence
})
// Select the oldest file
file := files[0]
// Find overlapping files in the next level
var nextLevelFiles []*SSTableInfo
for _, nextFile := range c.levels[level+1] {
if file.Overlaps(nextFile) {
nextLevelFiles = append(nextLevelFiles, nextFile)
}
}
// Create the compaction task
task := &CompactionTask{
InputFiles: map[int][]*SSTableInfo{
level: {file},
level + 1: nextLevelFiles,
},
TargetLevel: level + 1,
OutputPathTemplate: filepath.Join(c.sstableDir, "%d_%06d_%020d.sst"),
}
return task, nil
}
// selectSizeBasedCompaction selects files for compaction based on size ratio
func (c *Compactor) selectSizeBasedCompaction(level int) (*CompactionTask, error) {
// Find a file from this level to compact
if len(c.levels[level]) == 0 {
return nil, nil
}
// Choose a file that hasn't been compacted recently
// For simplicity, just choose the oldest file (lowest sequence number)
files := make([]*SSTableInfo, len(c.levels[level]))
copy(files, c.levels[level])
sort.Slice(files, func(i, j int) bool {
return files[i].Sequence < files[j].Sequence
})
// Select the oldest file
file := files[0]
// Find overlapping files in the next level
var nextLevelFiles []*SSTableInfo
for _, nextFile := range c.levels[level+1] {
if file.Overlaps(nextFile) {
nextLevelFiles = append(nextLevelFiles, nextFile)
}
}
// Create the compaction task
task := &CompactionTask{
InputFiles: map[int][]*SSTableInfo{
level: {file},
level + 1: nextLevelFiles,
},
TargetLevel: level + 1,
OutputPathTemplate: filepath.Join(c.sstableDir, "%d_%06d_%020d.sst"),
}
return task, nil
}
// CompactFiles performs the actual compaction of the input files
func (c *Compactor) CompactFiles(task *CompactionTask) ([]string, error) {
// Create a merged iterator over all input files
var iterators []iterator.Iterator
// Add iterators from both levels
for level := 0; level <= task.TargetLevel; level++ {
for _, file := range task.InputFiles[level] {
// We need an iterator that preserves delete markers
if file.Reader != nil {
iterators = append(iterators, file.Reader.NewIterator())
}
}
}
// Create hierarchical merged iterator
mergedIter := iterator.NewHierarchicalIterator(iterators)
// Remember the input file paths for later cleanup
var inputPaths []string
for _, files := range task.InputFiles {
for _, file := range files {
inputPaths = append(inputPaths, file.Path)
}
}
// Track keys to skip duplicate entries (for tombstones)
var lastKey []byte
var outputFiles []string
var currentWriter *sstable.Writer
var currentOutputPath string
var outputFileSequence uint64 = 1
var entriesInCurrentFile int
// Function to create a new output file
createNewOutputFile := func() error {
if currentWriter != nil {
if err := currentWriter.Finish(); err != nil {
return fmt.Errorf("failed to finish SSTable: %w", err)
}
outputFiles = append(outputFiles, currentOutputPath)
}
// Create a new output file
timestamp := time.Now().UnixNano()
currentOutputPath = fmt.Sprintf(task.OutputPathTemplate,
task.TargetLevel, outputFileSequence, timestamp)
outputFileSequence++
var err error
currentWriter, err = sstable.NewWriter(currentOutputPath)
if err != nil {
return fmt.Errorf("failed to create SSTable writer: %w", err)
}
entriesInCurrentFile = 0
return nil
}
// Create the first output file
if err := createNewOutputFile(); err != nil {
return nil, err
}
// Iterate through all keys in sorted order
mergedIter.SeekToFirst()
for mergedIter.Valid() {
key := mergedIter.Key()
value := mergedIter.Value()
// Skip duplicates (we've already included the newest version)
if lastKey != nil && bytes.Equal(key, lastKey) {
mergedIter.Next()
continue
}
// Add the entry to the current output file, preserving both values and tombstones
// This ensures deletion markers are properly maintained
if err := currentWriter.Add(key, value); err != nil {
return nil, fmt.Errorf("failed to add entry to SSTable: %w", err)
}
entriesInCurrentFile++
// If the current file has reached the configured limit (entry count is used as the size proxy here), start a new one
if int64(entriesInCurrentFile) >= c.cfg.SSTableMaxSize {
if err := createNewOutputFile(); err != nil {
return nil, err
}
}
// Remember this key to skip duplicates
lastKey = append(lastKey[:0], key...)
mergedIter.Next()
}
// Finish the last output file
if currentWriter != nil && entriesInCurrentFile > 0 {
if err := currentWriter.Finish(); err != nil {
return nil, fmt.Errorf("failed to finish SSTable: %w", err)
}
outputFiles = append(outputFiles, currentOutputPath)
} else if currentWriter != nil {
// No entries were written, abort the file
currentWriter.Abort()
}
return outputFiles, nil
}
// DeleteCompactedFiles removes the input files that were successfully compacted
func (c *Compactor) DeleteCompactedFiles(filePaths []string) error {
for _, path := range filePaths {
if err := os.Remove(path); err != nil {
return fmt.Errorf("failed to delete compacted file %s: %w", path, err)
}
}
return nil
}
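Taken together, the pieces above form a select/compact/cleanup loop. A minimal sketch of one pass over the base Compactor (illustrative only, not part of this commit; cfg and sstableDir are assumed to be set up as in the tests that follow):

// runOnePass is an illustrative helper, not part of this commit.
func runOnePass(cfg *config.Config, sstableDir string) error {
	c := NewCompactor(cfg, sstableDir)
	defer c.Close()
	if err := c.LoadSSTables(); err != nil {
		return err
	}
	task, err := c.GetCompactionTask()
	if err != nil || task == nil {
		return err // a nil task means no compaction is needed
	}
	if _, err := c.CompactFiles(task); err != nil {
		return err
	}
	// Only after the merged outputs are written do the inputs become deletable.
	var inputs []string
	for _, files := range task.InputFiles {
		for _, f := range files {
			inputs = append(inputs, f.Path)
		}
	}
	return c.DeleteCompactedFiles(inputs)
}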


@@ -0,0 +1,412 @@
package compaction
import (
"bytes"
"fmt"
"os"
"path/filepath"
"sort"
"testing"
"time"
"git.canoozie.net/jer/go-storage/pkg/config"
"git.canoozie.net/jer/go-storage/pkg/sstable"
)
func createTestSSTable(t *testing.T, dir string, level, seq int, timestamp int64, keyValues map[string]string) string {
filename := fmt.Sprintf("%d_%06d_%020d.sst", level, seq, timestamp)
path := filepath.Join(dir, filename)
writer, err := sstable.NewWriter(path)
if err != nil {
t.Fatalf("Failed to create SSTable writer: %v", err)
}
// Get the keys and sort them to ensure they're added in order
var keys []string
for k := range keyValues {
keys = append(keys, k)
}
sort.Strings(keys)
// Add keys in sorted order
for _, k := range keys {
if err := writer.Add([]byte(k), []byte(keyValues[k])); err != nil {
t.Fatalf("Failed to add entry to SSTable: %v", err)
}
}
if err := writer.Finish(); err != nil {
t.Fatalf("Failed to finish SSTable: %v", err)
}
return path
}
func setupCompactionTest(t *testing.T) (string, *config.Config, func()) {
// Create a temp directory for testing
tempDir, err := os.MkdirTemp("", "compaction-test-*")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
// Create the SSTable directory
sstDir := filepath.Join(tempDir, "sst")
if err := os.MkdirAll(sstDir, 0755); err != nil {
t.Fatalf("Failed to create SSTable directory: %v", err)
}
// Create a test configuration
cfg := &config.Config{
Version: config.CurrentManifestVersion,
SSTDir: sstDir,
CompactionLevels: 4,
CompactionRatio: 10.0,
CompactionThreads: 1,
MaxMemTables: 2,
SSTableMaxSize: 1000,
}
// Return cleanup function
cleanup := func() {
os.RemoveAll(tempDir)
}
return sstDir, cfg, cleanup
}
func TestCompactorLoadSSTables(t *testing.T) {
sstDir, cfg, cleanup := setupCompactionTest(t)
defer cleanup()
// Create test SSTables
data1 := map[string]string{
"a": "1",
"b": "2",
"c": "3",
}
data2 := map[string]string{
"d": "4",
"e": "5",
"f": "6",
}
// Keys will be sorted in the createTestSSTable function
timestamp := time.Now().UnixNano()
createTestSSTable(t, sstDir, 0, 1, timestamp, data1)
createTestSSTable(t, sstDir, 0, 2, timestamp+1, data2)
// Create the compactor
compactor := NewCompactor(cfg, sstDir)
// Load SSTables
err := compactor.LoadSSTables()
if err != nil {
t.Fatalf("Failed to load SSTables: %v", err)
}
// Verify the correct number of files was loaded
if len(compactor.levels[0]) != 2 {
t.Errorf("Expected 2 files in level 0, got %d", len(compactor.levels[0]))
}
// Verify key ranges
for _, file := range compactor.levels[0] {
if bytes.Equal(file.FirstKey, []byte("a")) {
if !bytes.Equal(file.LastKey, []byte("c")) {
t.Errorf("Expected last key 'c', got '%s'", string(file.LastKey))
}
} else if bytes.Equal(file.FirstKey, []byte("d")) {
if !bytes.Equal(file.LastKey, []byte("f")) {
t.Errorf("Expected last key 'f', got '%s'", string(file.LastKey))
}
} else {
t.Errorf("Unexpected first key: %s", string(file.FirstKey))
}
}
}
func TestSSTableInfoOverlaps(t *testing.T) {
// Create test SSTable info objects
info1 := &SSTableInfo{
FirstKey: []byte("a"),
LastKey: []byte("c"),
}
info2 := &SSTableInfo{
FirstKey: []byte("b"),
LastKey: []byte("d"),
}
info3 := &SSTableInfo{
FirstKey: []byte("e"),
LastKey: []byte("g"),
}
// Test overlapping ranges
if !info1.Overlaps(info2) {
t.Errorf("Expected info1 to overlap with info2")
}
if !info2.Overlaps(info1) {
t.Errorf("Expected info2 to overlap with info1")
}
// Test non-overlapping ranges
if info1.Overlaps(info3) {
t.Errorf("Expected info1 not to overlap with info3")
}
if info3.Overlaps(info1) {
t.Errorf("Expected info3 not to overlap with info1")
}
}
func TestCompactorSelectLevel0Compaction(t *testing.T) {
sstDir, cfg, cleanup := setupCompactionTest(t)
defer cleanup()
// Create 3 test SSTables in L0
data1 := map[string]string{
"a": "1",
"b": "2",
}
data2 := map[string]string{
"c": "3",
"d": "4",
}
data3 := map[string]string{
"e": "5",
"f": "6",
}
timestamp := time.Now().UnixNano()
createTestSSTable(t, sstDir, 0, 1, timestamp, data1)
createTestSSTable(t, sstDir, 0, 2, timestamp+1, data2)
createTestSSTable(t, sstDir, 0, 3, timestamp+2, data3)
// Create the compactor
compactor := NewTieredCompactor(cfg, sstDir)
// Load SSTables
err := compactor.LoadSSTables()
if err != nil {
t.Fatalf("Failed to load SSTables: %v", err)
}
// Select compaction task
task, err := compactor.SelectCompaction()
if err != nil {
t.Fatalf("Failed to select compaction: %v", err)
}
// Verify the task
if task == nil {
t.Fatalf("Expected compaction task, got nil")
}
// L0 should have files to compact (since we have > cfg.MaxMemTables files)
if len(task.InputFiles[0]) == 0 {
t.Errorf("Expected L0 files to compact, got none")
}
// Target level should be 1
if task.TargetLevel != 1 {
t.Errorf("Expected target level 1, got %d", task.TargetLevel)
}
}
func TestCompactFiles(t *testing.T) {
sstDir, cfg, cleanup := setupCompactionTest(t)
defer cleanup()
// Create test SSTables with overlapping key ranges
data1 := map[string]string{
"a": "1-L0", // Will be overwritten by L1
"b": "2-L0",
"c": "3-L0",
}
data2 := map[string]string{
"a": "1-L1", // Newer version than L0 (lower level has priority)
"d": "4-L1",
"e": "5-L1",
}
timestamp := time.Now().UnixNano()
sstPath1 := createTestSSTable(t, sstDir, 0, 1, timestamp, data1)
sstPath2 := createTestSSTable(t, sstDir, 1, 1, timestamp+1, data2)
// Log the created test files
t.Logf("Created test SSTables: %s, %s", sstPath1, sstPath2)
// Create the compactor
compactor := NewCompactor(cfg, sstDir)
// Load SSTables
err := compactor.LoadSSTables()
if err != nil {
t.Fatalf("Failed to load SSTables: %v", err)
}
// Create a compaction task
task := &CompactionTask{
InputFiles: map[int][]*SSTableInfo{
0: {compactor.levels[0][0]},
1: {compactor.levels[1][0]},
},
TargetLevel: 1,
OutputPathTemplate: filepath.Join(sstDir, "%d_%06d_%020d.sst"),
}
// Perform compaction
outputFiles, err := compactor.CompactFiles(task)
if err != nil {
t.Fatalf("Failed to compact files: %v", err)
}
if len(outputFiles) == 0 {
t.Fatalf("Expected output files, got none")
}
// Open the output file and verify its contents
reader, err := sstable.OpenReader(outputFiles[0])
if err != nil {
t.Fatalf("Failed to open output SSTable: %v", err)
}
defer reader.Close()
// Check each key
checks := map[string]string{
"a": "1-L0", // L0 has priority over L1
"b": "2-L0",
"c": "3-L0",
"d": "4-L1",
"e": "5-L1",
}
for k, expectedValue := range checks {
value, err := reader.Get([]byte(k))
if err != nil {
t.Errorf("Failed to get key %s: %v", k, err)
continue
}
if !bytes.Equal(value, []byte(expectedValue)) {
t.Errorf("Key %s: expected value '%s', got '%s'",
k, expectedValue, string(value))
}
}
// Clean up the output file
for _, file := range outputFiles {
os.Remove(file)
}
}
func TestTombstoneTracking(t *testing.T) {
// Create a tombstone tracker with a short retention period for testing
tracker := NewTombstoneTracker(100 * time.Millisecond)
// Add some tombstones
tracker.AddTombstone([]byte("key1"))
tracker.AddTombstone([]byte("key2"))
// Should keep tombstones initially
if !tracker.ShouldKeepTombstone([]byte("key1")) {
t.Errorf("Expected to keep tombstone for key1")
}
if !tracker.ShouldKeepTombstone([]byte("key2")) {
t.Errorf("Expected to keep tombstone for key2")
}
// Wait for the retention period to expire
time.Sleep(200 * time.Millisecond)
// Garbage collect expired tombstones
tracker.CollectGarbage()
// Should no longer keep the tombstones
if tracker.ShouldKeepTombstone([]byte("key1")) {
t.Errorf("Expected to discard tombstone for key1 after expiration")
}
if tracker.ShouldKeepTombstone([]byte("key2")) {
t.Errorf("Expected to discard tombstone for key2 after expiration")
}
}
func TestCompactionManager(t *testing.T) {
sstDir, cfg, cleanup := setupCompactionTest(t)
defer cleanup()
// Create test SSTables in multiple levels
data1 := map[string]string{
"a": "1",
"b": "2",
}
data2 := map[string]string{
"c": "3",
"d": "4",
}
data3 := map[string]string{
"e": "5",
"f": "6",
}
timestamp := time.Now().UnixNano()
// Create test SSTables and remember their paths for verification
sst1 := createTestSSTable(t, sstDir, 0, 1, timestamp, data1)
sst2 := createTestSSTable(t, sstDir, 0, 2, timestamp+1, data2)
sst3 := createTestSSTable(t, sstDir, 1, 1, timestamp+2, data3)
// Log the created files for debugging
t.Logf("Created test SSTables: %s, %s, %s", sst1, sst2, sst3)
// Create the compaction manager
manager := NewCompactionManager(cfg, sstDir)
// Start the manager
err := manager.Start()
if err != nil {
t.Fatalf("Failed to start compaction manager: %v", err)
}
// Force a compaction cycle
err = manager.TriggerCompaction()
if err != nil {
t.Fatalf("Failed to trigger compaction: %v", err)
}
// Mark some files as obsolete
manager.MarkFileObsolete(sst1)
manager.MarkFileObsolete(sst2)
// Clean up obsolete files
err = manager.CleanupObsoleteFiles()
if err != nil {
t.Fatalf("Failed to clean up obsolete files: %v", err)
}
// Verify the files were deleted
if _, err := os.Stat(sst1); !os.IsNotExist(err) {
t.Errorf("Expected %s to be deleted, but it still exists", sst1)
}
if _, err := os.Stat(sst2); !os.IsNotExist(err) {
t.Errorf("Expected %s to be deleted, but it still exists", sst2)
}
// Stop the manager
err = manager.Stop()
if err != nil {
t.Fatalf("Failed to stop compaction manager: %v", err)
}
}

pkg/compaction/manager.go

@@ -0,0 +1,393 @@
package compaction
import (
"fmt"
"os"
"path/filepath"
"sync"
"time"
"git.canoozie.net/jer/go-storage/pkg/config"
)
// CompactionManager coordinates the compaction process
type CompactionManager struct {
// Configuration
cfg *config.Config
// SSTable directory
sstableDir string
// Compactor implementation
compactor *TieredCompactor
// Tombstone tracker
tombstones *TombstoneTracker
// Next sequence number for SSTable files
nextSeq uint64
// Compaction state
running bool
stopCh chan struct{}
compactingMu sync.Mutex
// File tracking
// Map of file path -> true for files that have been obsoleted by compaction
obsoleteFiles map[string]bool
// Map of file path -> true for files that are currently being compacted
pendingFiles map[string]bool
// Last set of files produced by compaction
lastCompactionOutputs []string
filesMu sync.RWMutex
}
// NewCompactionManager creates a new compaction manager
func NewCompactionManager(cfg *config.Config, sstableDir string) *CompactionManager {
return &CompactionManager{
cfg: cfg,
sstableDir: sstableDir,
compactor: NewTieredCompactor(cfg, sstableDir),
tombstones: NewTombstoneTracker(24 * time.Hour), // Default 24-hour retention
nextSeq: 1,
stopCh: make(chan struct{}),
obsoleteFiles: make(map[string]bool),
pendingFiles: make(map[string]bool),
lastCompactionOutputs: make([]string, 0),
}
}
// Start begins background compaction
func (cm *CompactionManager) Start() error {
cm.compactingMu.Lock()
defer cm.compactingMu.Unlock()
if cm.running {
return nil // Already running
}
// Load existing SSTables
if err := cm.compactor.LoadSSTables(); err != nil {
return fmt.Errorf("failed to load SSTables: %w", err)
}
cm.running = true
cm.stopCh = make(chan struct{})
// Start background worker
go cm.compactionWorker()
return nil
}
// Stop halts background compaction
func (cm *CompactionManager) Stop() error {
cm.compactingMu.Lock()
defer cm.compactingMu.Unlock()
if !cm.running {
return nil // Already stopped
}
// Signal the worker to stop
close(cm.stopCh)
cm.running = false
// Close compactor
return cm.compactor.Close()
}
// MarkFileObsolete marks a file as obsolete
func (cm *CompactionManager) MarkFileObsolete(path string) {
cm.filesMu.Lock()
defer cm.filesMu.Unlock()
cm.obsoleteFiles[path] = true
}
// MarkFilePending marks a file as being used in a compaction
func (cm *CompactionManager) MarkFilePending(path string) {
cm.filesMu.Lock()
defer cm.filesMu.Unlock()
cm.pendingFiles[path] = true
}
// UnmarkFilePending removes the pending mark
func (cm *CompactionManager) UnmarkFilePending(path string) {
cm.filesMu.Lock()
defer cm.filesMu.Unlock()
delete(cm.pendingFiles, path)
}
// IsFileObsolete checks if a file is marked as obsolete
func (cm *CompactionManager) IsFileObsolete(path string) bool {
cm.filesMu.RLock()
defer cm.filesMu.RUnlock()
return cm.obsoleteFiles[path]
}
// IsFilePending checks if a file is pending compaction
func (cm *CompactionManager) IsFilePending(path string) bool {
cm.filesMu.RLock()
defer cm.filesMu.RUnlock()
return cm.pendingFiles[path]
}
// CleanupObsoleteFiles removes files that are no longer needed
func (cm *CompactionManager) CleanupObsoleteFiles() error {
cm.filesMu.Lock()
defer cm.filesMu.Unlock()
// Safely remove obsolete files that aren't pending
for path := range cm.obsoleteFiles {
// Skip files that are still being used in a compaction
if cm.pendingFiles[path] {
continue
}
// Try to delete the file
if err := os.Remove(path); err != nil {
if !os.IsNotExist(err) {
return fmt.Errorf("failed to delete obsolete file %s: %w", path, err)
}
// If the file doesn't exist, remove it from our tracking
delete(cm.obsoleteFiles, path)
} else {
// Successfully deleted, remove from tracking
delete(cm.obsoleteFiles, path)
}
}
return nil
}
// compactionWorker runs the compaction loop
func (cm *CompactionManager) compactionWorker() {
// Ensure a minimum interval of 1 second
interval := cm.cfg.CompactionInterval
if interval <= 0 {
interval = 1
}
ticker := time.NewTicker(time.Duration(interval) * time.Second)
defer ticker.Stop()
for {
select {
case <-cm.stopCh:
return
case <-ticker.C:
// Only one compaction at a time
cm.compactingMu.Lock()
// Run a compaction cycle
err := cm.runCompactionCycle()
if err != nil {
// In a real system, we'd log this error
// fmt.Printf("Compaction error: %v\n", err)
}
// Try to clean up obsolete files
err = cm.CleanupObsoleteFiles()
if err != nil {
// In a real system, we'd log this error
// fmt.Printf("Cleanup error: %v\n", err)
}
// Collect tombstone garbage periodically
cm.tombstones.CollectGarbage()
cm.compactingMu.Unlock()
}
}
}
// runCompactionCycle performs a single compaction cycle
func (cm *CompactionManager) runCompactionCycle() error {
// Reload SSTables to get fresh information
if err := cm.compactor.LoadSSTables(); err != nil {
return fmt.Errorf("failed to load SSTables: %w", err)
}
// Select files for compaction
task, err := cm.compactor.SelectCompaction()
if err != nil {
return fmt.Errorf("failed to select files for compaction: %w", err)
}
// If no compaction needed, return
if task == nil {
return nil
}
// Mark files as pending
for _, files := range task.InputFiles {
for _, file := range files {
cm.MarkFilePending(file.Path)
}
}
// Perform compaction
outputFiles, err := cm.compactor.CompactFiles(task)
// Unmark files as pending
for _, files := range task.InputFiles {
for _, file := range files {
cm.UnmarkFilePending(file.Path)
}
}
// Track the compaction outputs for statistics
if err == nil && len(outputFiles) > 0 {
// Record the compaction result
cm.filesMu.Lock()
cm.lastCompactionOutputs = outputFiles
cm.filesMu.Unlock()
}
// Handle compaction errors
if err != nil {
return fmt.Errorf("compaction failed: %w", err)
}
// Mark input files as obsolete
for _, files := range task.InputFiles {
for _, file := range files {
cm.MarkFileObsolete(file.Path)
}
}
// Try to clean up the files immediately
return cm.CleanupObsoleteFiles()
}
// TriggerCompaction forces a compaction cycle
func (cm *CompactionManager) TriggerCompaction() error {
cm.compactingMu.Lock()
defer cm.compactingMu.Unlock()
return cm.runCompactionCycle()
}
// CompactRange triggers compaction on a specific key range
func (cm *CompactionManager) CompactRange(minKey, maxKey []byte) error {
cm.compactingMu.Lock()
defer cm.compactingMu.Unlock()
// Load current SSTable information
if err := cm.compactor.LoadSSTables(); err != nil {
return fmt.Errorf("failed to load SSTables: %w", err)
}
// Find files that overlap with the target range
rangeInfo := &SSTableInfo{
FirstKey: minKey,
LastKey: maxKey,
}
var filesToCompact []*SSTableInfo
for level, files := range cm.compactor.levels {
for _, file := range files {
if file.Overlaps(rangeInfo) {
// Add level information to the file
file.Level = level
filesToCompact = append(filesToCompact, file)
// Mark as pending
cm.MarkFilePending(file.Path)
}
}
}
// If no files to compact, we're done
if len(filesToCompact) == 0 {
return nil
}
// Determine the target level - use the highest level found + 1
targetLevel := 0
for _, file := range filesToCompact {
if file.Level > targetLevel {
targetLevel = file.Level
}
}
targetLevel++ // Compact to next level
// Create the compaction task
task := &CompactionTask{
InputFiles: make(map[int][]*SSTableInfo),
TargetLevel: targetLevel,
OutputPathTemplate: filepath.Join(cm.sstableDir, "%d_%06d_%020d.sst"),
}
// Group files by level
for _, file := range filesToCompact {
task.InputFiles[file.Level] = append(task.InputFiles[file.Level], file)
}
// Perform the compaction
outputFiles, err := cm.compactor.CompactFiles(task)
// Unmark files as pending
for _, file := range filesToCompact {
cm.UnmarkFilePending(file.Path)
}
// Track the compaction outputs for statistics
if err == nil && len(outputFiles) > 0 {
cm.filesMu.Lock()
cm.lastCompactionOutputs = outputFiles
cm.filesMu.Unlock()
}
// Handle errors
if err != nil {
return fmt.Errorf("failed to compact range: %w", err)
}
// Mark input files as obsolete
for _, file := range filesToCompact {
cm.MarkFileObsolete(file.Path)
}
// Try to clean up immediately
return cm.CleanupObsoleteFiles()
}
// GetCompactionStats returns statistics about the compaction state
func (cm *CompactionManager) GetCompactionStats() map[string]interface{} {
cm.filesMu.RLock()
defer cm.filesMu.RUnlock()
stats := make(map[string]interface{})
// Count files by level
levelCounts := make(map[int]int)
levelSizes := make(map[int]int64)
for level, files := range cm.compactor.levels {
levelCounts[level] = len(files)
var totalSize int64
for _, file := range files {
totalSize += file.Size
}
levelSizes[level] = totalSize
}
stats["level_counts"] = levelCounts
stats["level_sizes"] = levelSizes
stats["obsolete_files"] = len(cm.obsoleteFiles)
stats["pending_files"] = len(cm.pendingFiles)
stats["last_outputs_count"] = len(cm.lastCompactionOutputs)
// If there are recent compaction outputs, include information
if len(cm.lastCompactionOutputs) > 0 {
stats["last_outputs"] = cm.lastCompactionOutputs
}
return stats
}

pkg/compaction/tiered.go

@@ -0,0 +1,363 @@
package compaction
import (
"bytes"
"fmt"
"path/filepath"
"sort"
"time"
"git.canoozie.net/jer/go-storage/pkg/config"
"git.canoozie.net/jer/go-storage/pkg/iterator"
"git.canoozie.net/jer/go-storage/pkg/sstable"
)
// TieredCompactor implements a tiered compaction strategy
type TieredCompactor struct {
*Compactor
// Next file sequence number
nextFileSeq uint64
}
// NewTieredCompactor creates a new tiered compaction manager
func NewTieredCompactor(cfg *config.Config, sstableDir string) *TieredCompactor {
return &TieredCompactor{
Compactor: NewCompactor(cfg, sstableDir),
nextFileSeq: 1,
}
}
// SelectCompaction selects files for tiered compaction
func (tc *TieredCompactor) SelectCompaction() (*CompactionTask, error) {
// Reload SSTable information
if err := tc.LoadSSTables(); err != nil {
return nil, fmt.Errorf("failed to load SSTables: %w", err)
}
// Determine the maximum level
maxLevel := 0
for level := range tc.levels {
if level > maxLevel {
maxLevel = level
}
}
// Check L0 first (special case due to potential overlaps)
if len(tc.levels[0]) >= tc.cfg.MaxMemTables {
return tc.selectL0Compaction()
}
// Check size-based conditions for other levels
for level := 0; level < maxLevel; level++ {
// If this level is too large compared to the next level
thisLevelSize := tc.GetLevelSize(level)
nextLevelSize := tc.GetLevelSize(level + 1)
// If level is empty, skip it
if thisLevelSize == 0 {
continue
}
// If next level is empty, promote a file
if nextLevelSize == 0 && len(tc.levels[level]) > 0 {
return tc.selectPromotionCompaction(level)
}
// Check size ratio
sizeRatio := float64(thisLevelSize) / float64(nextLevelSize)
if sizeRatio >= tc.cfg.CompactionRatio {
return tc.selectOverlappingCompaction(level)
}
}
// No compaction needed
return nil, nil
}
// selectL0Compaction selects files from L0 for compaction
func (tc *TieredCompactor) selectL0Compaction() (*CompactionTask, error) {
// Require at least some files in L0
if len(tc.levels[0]) < 2 {
return nil, nil
}
// Sort L0 files by sequence number to prioritize older files
files := make([]*SSTableInfo, len(tc.levels[0]))
copy(files, tc.levels[0])
sort.Slice(files, func(i, j int) bool {
return files[i].Sequence < files[j].Sequence
})
// Take up to maxCompactFiles from L0
maxCompactFiles := tc.cfg.MaxMemTables
if maxCompactFiles > len(files) {
maxCompactFiles = len(files)
}
selectedFiles := files[:maxCompactFiles]
// Determine the key range covered by selected files
var minKey, maxKey []byte
for _, file := range selectedFiles {
if len(minKey) == 0 || bytes.Compare(file.FirstKey, minKey) < 0 {
minKey = file.FirstKey
}
if len(maxKey) == 0 || bytes.Compare(file.LastKey, maxKey) > 0 {
maxKey = file.LastKey
}
}
// Find overlapping files in L1
var l1Files []*SSTableInfo
for _, file := range tc.levels[1] {
// Create a temporary SSTableInfo with the key range
rangeInfo := &SSTableInfo{
FirstKey: minKey,
LastKey: maxKey,
}
if file.Overlaps(rangeInfo) {
l1Files = append(l1Files, file)
}
}
// Create the compaction task
task := &CompactionTask{
InputFiles: map[int][]*SSTableInfo{
0: selectedFiles,
1: l1Files,
},
TargetLevel: 1,
OutputPathTemplate: filepath.Join(tc.sstableDir, "%d_%06d_%020d.sst"),
}
return task, nil
}
// selectPromotionCompaction selects a file to promote to the next level
func (tc *TieredCompactor) selectPromotionCompaction(level int) (*CompactionTask, error) {
// Sort files by sequence number
files := make([]*SSTableInfo, len(tc.levels[level]))
copy(files, tc.levels[level])
sort.Slice(files, func(i, j int) bool {
return files[i].Sequence < files[j].Sequence
})
// Select the oldest file
file := files[0]
// Create task to promote this file to the next level
// No need to merge with any other files since the next level is empty
task := &CompactionTask{
InputFiles: map[int][]*SSTableInfo{
level: {file},
},
TargetLevel: level + 1,
OutputPathTemplate: filepath.Join(tc.sstableDir, "%d_%06d_%020d.sst"),
}
return task, nil
}
// selectOverlappingCompaction selects files for compaction based on key overlap
func (tc *TieredCompactor) selectOverlappingCompaction(level int) (*CompactionTask, error) {
// Sort files by sequence number to start with oldest
files := make([]*SSTableInfo, len(tc.levels[level]))
copy(files, tc.levels[level])
sort.Slice(files, func(i, j int) bool {
return files[i].Sequence < files[j].Sequence
})
// Select an initial file from this level
file := files[0]
// Find all overlapping files in the next level
var nextLevelFiles []*SSTableInfo
for _, nextFile := range tc.levels[level+1] {
if file.Overlaps(nextFile) {
nextLevelFiles = append(nextLevelFiles, nextFile)
}
}
// Create the compaction task
task := &CompactionTask{
InputFiles: map[int][]*SSTableInfo{
level: {file},
level + 1: nextLevelFiles,
},
TargetLevel: level + 1,
OutputPathTemplate: filepath.Join(tc.sstableDir, "%d_%06d_%020d.sst"),
}
return task, nil
}
// Compact performs compaction of the selected files
func (tc *TieredCompactor) Compact(task *CompactionTask) error {
if task == nil {
return nil // Nothing to compact
}
// Perform the compaction
_, err := tc.CompactFiles(task)
if err != nil {
return fmt.Errorf("compaction failed: %w", err)
}
// Gather all input file paths for cleanup
var inputPaths []string
for _, files := range task.InputFiles {
for _, file := range files {
inputPaths = append(inputPaths, file.Path)
}
}
// Delete the original files that were compacted
if err := tc.DeleteCompactedFiles(inputPaths); err != nil {
return fmt.Errorf("failed to clean up compacted files: %w", err)
}
// Reload SSTables to refresh our file list
if err := tc.LoadSSTables(); err != nil {
return fmt.Errorf("failed to reload SSTables: %w", err)
}
return nil
}
// CompactRange performs compaction on a specific key range
func (tc *TieredCompactor) CompactRange(minKey, maxKey []byte) error {
// Create a range info to check for overlaps
rangeInfo := &SSTableInfo{
FirstKey: minKey,
LastKey: maxKey,
}
// Find files overlapping with the given range in each level
task := &CompactionTask{
InputFiles: make(map[int][]*SSTableInfo),
TargetLevel: 0, // Will be updated
OutputPathTemplate: filepath.Join(tc.sstableDir, "%d_%06d_%020d.sst"),
}
// Get the maximum level
var maxLevel int
for level := range tc.levels {
if level > maxLevel {
maxLevel = level
}
}
// Find overlapping files in each level
for level := 0; level <= maxLevel; level++ {
var overlappingFiles []*SSTableInfo
for _, file := range tc.levels[level] {
if file.Overlaps(rangeInfo) {
overlappingFiles = append(overlappingFiles, file)
}
}
if len(overlappingFiles) > 0 {
task.InputFiles[level] = overlappingFiles
}
}
// If no files overlap with the range, no compaction needed
totalInputFiles := 0
for _, files := range task.InputFiles {
totalInputFiles += len(files)
}
if totalInputFiles == 0 {
return nil
}
// Set target level to the maximum level + 1
task.TargetLevel = maxLevel + 1
// Perform the compaction
return tc.Compact(task)
}
// RunCompaction performs a full compaction cycle
func (tc *TieredCompactor) RunCompaction() error {
// Select files for compaction
task, err := tc.SelectCompaction()
if err != nil {
return fmt.Errorf("failed to select files for compaction: %w", err)
}
// If no compaction needed, return
if task == nil {
return nil
}
// Perform the compaction
return tc.Compact(task)
}
// MergeCompact merges iterators from multiple SSTables and writes to new files
func (tc *TieredCompactor) MergeCompact(readers []*sstable.Reader, targetLevel int) ([]string, error) {
// Create iterators for all input files
var iterators []iterator.Iterator
for _, reader := range readers {
iterators = append(iterators, reader.NewIterator())
}
// Create a merged iterator
mergedIter := iterator.NewHierarchicalIterator(iterators)
// Create a new output file
timestamp := time.Now().UnixNano()
outputPath := filepath.Join(tc.sstableDir,
fmt.Sprintf("%d_%06d_%020d.sst",
targetLevel, tc.nextFileSeq, timestamp))
tc.nextFileSeq++
writer, err := sstable.NewWriter(outputPath)
if err != nil {
return nil, fmt.Errorf("failed to create SSTable writer: %w", err)
}
// Write all entries from the merged iterator
var lastKey []byte
var entriesWritten int
mergedIter.SeekToFirst()
for mergedIter.Valid() {
key := mergedIter.Key()
value := mergedIter.Value()
// Skip duplicates
if lastKey != nil && bytes.Equal(key, lastKey) {
mergedIter.Next()
continue
}
// Skip tombstones (nil values) during compaction
if value != nil {
if err := writer.Add(key, value); err != nil {
writer.Abort()
return nil, fmt.Errorf("failed to add entry to SSTable: %w", err)
}
entriesWritten++
}
lastKey = append(lastKey[:0], key...)
mergedIter.Next()
}
// Finish writing
if entriesWritten > 0 {
if err := writer.Finish(); err != nil {
return nil, fmt.Errorf("failed to finish SSTable: %w", err)
}
return []string{outputPath}, nil
}
// No entries written, abort the file
writer.Abort()
return nil, nil
}

pkg/compaction/tombstone.go

@@ -0,0 +1,184 @@
package compaction
import (
"bytes"
"time"
)
// TombstoneTracker tracks tombstones to support garbage collection
type TombstoneTracker struct {
// Map of deleted keys with deletion timestamp
deletions map[string]time.Time
// Retention period for tombstones (after this time, they can be discarded)
retention time.Duration
}
// NewTombstoneTracker creates a new tombstone tracker
func NewTombstoneTracker(retentionPeriod time.Duration) *TombstoneTracker {
return &TombstoneTracker{
deletions: make(map[string]time.Time),
retention: retentionPeriod,
}
}
// AddTombstone records a key deletion
func (t *TombstoneTracker) AddTombstone(key []byte) {
t.deletions[string(key)] = time.Now()
}
// ShouldKeepTombstone checks if a tombstone should be preserved during compaction
func (t *TombstoneTracker) ShouldKeepTombstone(key []byte) bool {
strKey := string(key)
timestamp, exists := t.deletions[strKey]
if !exists {
return false // Not a tracked tombstone
}
// Keep the tombstone if it's still within the retention period
return time.Since(timestamp) < t.retention
}
// CollectGarbage removes expired tombstone records
func (t *TombstoneTracker) CollectGarbage() {
now := time.Now()
for key, timestamp := range t.deletions {
if now.Sub(timestamp) > t.retention {
delete(t.deletions, key)
}
}
}
// TombstoneFilter is an interface for filtering tombstones during compaction
type TombstoneFilter interface {
// ShouldKeep determines if a key-value pair should be kept during compaction
// If value is nil, it's a tombstone marker
ShouldKeep(key, value []byte) bool
}
// BasicTombstoneFilter implements a simple filter that keeps all non-tombstone entries
// and retains tombstones only at the lower levels of compaction
type BasicTombstoneFilter struct {
// The level of compaction (higher levels discard more tombstones)
level int
// The maximum level to retain tombstones
maxTombstoneLevel int
// The tombstone tracker (if any)
tracker *TombstoneTracker
}
// NewBasicTombstoneFilter creates a new tombstone filter
func NewBasicTombstoneFilter(level, maxTombstoneLevel int, tracker *TombstoneTracker) *BasicTombstoneFilter {
return &BasicTombstoneFilter{
level: level,
maxTombstoneLevel: maxTombstoneLevel,
tracker: tracker,
}
}
// ShouldKeep determines if a key-value pair should be kept
func (f *BasicTombstoneFilter) ShouldKeep(key, value []byte) bool {
// Always keep normal entries (non-tombstones)
if value != nil {
return true
}
// For tombstones (value == nil):
// If we have a tracker, use it to determine if the tombstone is still needed
if f.tracker != nil {
return f.tracker.ShouldKeepTombstone(key)
}
// Otherwise use level-based heuristic
// Keep tombstones in lower levels, discard in higher levels
return f.level <= f.maxTombstoneLevel
}
// TimeBasedTombstoneFilter implements a filter that keeps tombstones based on age
type TimeBasedTombstoneFilter struct {
// Map of key to deletion time
deletionTimes map[string]time.Time
// Current time (for testing)
now time.Time
// Retention period
retention time.Duration
}
// NewTimeBasedTombstoneFilter creates a new time-based tombstone filter
func NewTimeBasedTombstoneFilter(deletionTimes map[string]time.Time, retention time.Duration) *TimeBasedTombstoneFilter {
return &TimeBasedTombstoneFilter{
deletionTimes: deletionTimes,
now: time.Now(),
retention: retention,
}
}
// ShouldKeep determines if a key-value pair should be kept
func (f *TimeBasedTombstoneFilter) ShouldKeep(key, value []byte) bool {
// Always keep normal entries
if value != nil {
return true
}
// For tombstones, check if we know when this key was deleted
strKey := string(key)
deleteTime, found := f.deletionTimes[strKey]
if !found {
// If we don't know when it was deleted, keep it to be safe
return true
}
// If the tombstone is older than our retention period, we can discard it
return f.now.Sub(deleteTime) <= f.retention
}
// KeyRangeTombstoneFilter filters tombstones by key range
type KeyRangeTombstoneFilter struct {
// Minimum key in the range (inclusive)
minKey []byte
// Maximum key in the range (exclusive)
maxKey []byte
// Delegate filter
delegate TombstoneFilter
}
// NewKeyRangeTombstoneFilter creates a new key range tombstone filter
func NewKeyRangeTombstoneFilter(minKey, maxKey []byte, delegate TombstoneFilter) *KeyRangeTombstoneFilter {
return &KeyRangeTombstoneFilter{
minKey: minKey,
maxKey: maxKey,
delegate: delegate,
}
}
// ShouldKeep determines if a key-value pair should be kept
func (f *KeyRangeTombstoneFilter) ShouldKeep(key, value []byte) bool {
// Always keep normal entries
if value != nil {
return true
}
// Check if the key is in our targeted range
inRange := true
if f.minKey != nil && bytes.Compare(key, f.minKey) < 0 {
inRange = false
}
if f.maxKey != nil && bytes.Compare(key, f.maxKey) >= 0 {
inRange = false
}
// If not in range, keep the tombstone
if !inRange {
return true
}
// Otherwise, delegate to the wrapped filter
return f.delegate.ShouldKeep(key, value)
}
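A short sketch (illustrative, not part of this commit) of how these filters compose: a KeyRangeTombstoneFilter narrows tombstone dropping to one key range and delegates everything else to the level-based filter. A merge loop would call ShouldKeep(key, value) per entry and skip the write whenever it returns false.

// newExampleFilter is an illustrative helper, not part of this commit.
// It only allows tombstones inside ["a", "m") to be considered for dropping;
// outside that range they are always kept.
func newExampleFilter(tracker *TombstoneTracker) TombstoneFilter {
	// base keeps every non-tombstone; for tombstones it defers to the tracker
	// when one is provided, otherwise it keeps them only at levels <= maxTombstoneLevel.
	base := NewBasicTombstoneFilter(2 /* level */, 1 /* maxTombstoneLevel */, tracker)
	return NewKeyRangeTombstoneFilter([]byte("a"), []byte("m"), base)
}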

pkg/engine/compaction.go

@@ -0,0 +1,145 @@
package engine
import (
"fmt"
"os"
"path/filepath"
"git.canoozie.net/jer/go-storage/pkg/compaction"
"git.canoozie.net/jer/go-storage/pkg/sstable"
)
// setupCompaction initializes the compaction manager for the engine
func (e *Engine) setupCompaction() error {
// Create the compaction manager
e.compactionMgr = compaction.NewCompactionManager(e.cfg, e.sstableDir)
// Start the compaction manager
return e.compactionMgr.Start()
}
// shutdownCompaction stops the compaction manager
func (e *Engine) shutdownCompaction() error {
if e.compactionMgr != nil {
return e.compactionMgr.Stop()
}
return nil
}
// TriggerCompaction forces a compaction cycle
func (e *Engine) TriggerCompaction() error {
e.mu.RLock()
defer e.mu.RUnlock()
if e.closed.Load() {
return ErrEngineClosed
}
if e.compactionMgr == nil {
return fmt.Errorf("compaction manager not initialized")
}
return e.compactionMgr.TriggerCompaction()
}
// CompactRange forces compaction on a specific key range
func (e *Engine) CompactRange(startKey, endKey []byte) error {
e.mu.RLock()
defer e.mu.RUnlock()
if e.closed.Load() {
return ErrEngineClosed
}
if e.compactionMgr == nil {
return fmt.Errorf("compaction manager not initialized")
}
return e.compactionMgr.CompactRange(startKey, endKey)
}
// reloadSSTables reloads all SSTables from disk after compaction
func (e *Engine) reloadSSTables() error {
e.mu.Lock()
defer e.mu.Unlock()
// Close existing SSTable readers
for _, reader := range e.sstables {
if err := reader.Close(); err != nil {
return fmt.Errorf("failed to close SSTable reader: %w", err)
}
}
// Clear the list
e.sstables = e.sstables[:0]
// Find all SSTable files
entries, err := os.ReadDir(e.sstableDir)
if err != nil {
if os.IsNotExist(err) {
return nil // Directory doesn't exist yet
}
return fmt.Errorf("failed to read SSTable directory: %w", err)
}
// Open all SSTable files
for _, entry := range entries {
if entry.IsDir() || filepath.Ext(entry.Name()) != ".sst" {
continue // Skip directories and non-SSTable files
}
path := filepath.Join(e.sstableDir, entry.Name())
reader, err := sstable.OpenReader(path)
if err != nil {
return fmt.Errorf("failed to open SSTable %s: %w", path, err)
}
e.sstables = append(e.sstables, reader)
}
return nil
}
// GetCompactionStats returns statistics about the compaction state
func (e *Engine) GetCompactionStats() (map[string]interface{}, error) {
e.mu.RLock()
defer e.mu.RUnlock()
if e.closed.Load() {
return nil, ErrEngineClosed
}
if e.compactionMgr == nil {
return map[string]interface{}{
"enabled": false,
}, nil
}
stats := e.compactionMgr.GetCompactionStats()
stats["enabled"] = true
// Add memtable information
stats["memtables"] = map[string]interface{}{
"active": len(e.memTablePool.GetMemTables()),
"immutable": len(e.immutableMTs),
"total_size": e.memTablePool.TotalSize(),
}
return stats, nil
}
// maybeScheduleCompaction checks if compaction should be scheduled
func (e *Engine) maybeScheduleCompaction() {
// No immediate action needed - the compaction manager handles it all
// This is just a hook for future expansion
// We could trigger a manual compaction in some cases
if e.compactionMgr != nil && len(e.sstables) > e.cfg.MaxMemTables*2 {
go func() {
err := e.compactionMgr.TriggerCompaction()
if err != nil {
// In a real implementation, we would log this error
}
}()
}
}


@@ -0,0 +1,248 @@
package engine
import (
"bytes"
"fmt"
"os"
"path/filepath"
"testing"
"time"
)
func TestEngine_Compaction(t *testing.T) {
// Create a temp directory for the test
dir, err := os.MkdirTemp("", "engine-compaction-test-*")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
defer os.RemoveAll(dir)
// Create the engine with small thresholds to trigger compaction easily
engine, err := NewEngine(dir)
if err != nil {
t.Fatalf("Failed to create engine: %v", err)
}
// Modify config for testing
engine.cfg.MemTableSize = 1024 // 1KB
engine.cfg.MaxMemTables = 2 // Only allow 2 immutable tables
// Insert several keys to create multiple SSTables
for i := 0; i < 10; i++ {
for j := 0; j < 10; j++ {
key := []byte(fmt.Sprintf("key-%d-%d", i, j))
value := []byte(fmt.Sprintf("value-%d-%d", i, j))
if err := engine.Put(key, value); err != nil {
t.Fatalf("Failed to put key-value: %v", err)
}
}
// Force a flush after each batch to create multiple SSTables
if err := engine.FlushImMemTables(); err != nil {
t.Fatalf("Failed to flush memtables: %v", err)
}
}
// Trigger compaction
if err := engine.TriggerCompaction(); err != nil {
t.Fatalf("Failed to trigger compaction: %v", err)
}
// Sleep to give compaction time to complete
time.Sleep(200 * time.Millisecond)
// Verify that all keys are still accessible
for i := 0; i < 10; i++ {
for j := 0; j < 10; j++ {
key := []byte(fmt.Sprintf("key-%d-%d", i, j))
expectedValue := []byte(fmt.Sprintf("value-%d-%d", i, j))
value, err := engine.Get(key)
if err != nil {
t.Errorf("Failed to get key %s: %v", key, err)
continue
}
if !bytes.Equal(value, expectedValue) {
t.Errorf("Got incorrect value for key %s. Expected: %s, Got: %s",
string(key), string(expectedValue), string(value))
}
}
}
// Test compaction stats
stats, err := engine.GetCompactionStats()
if err != nil {
t.Fatalf("Failed to get compaction stats: %v", err)
}
if stats["enabled"] != true {
t.Errorf("Expected compaction to be enabled")
}
// Close the engine
if err := engine.Close(); err != nil {
t.Fatalf("Failed to close engine: %v", err)
}
}
func TestEngine_CompactRange(t *testing.T) {
// Create a temp directory for the test
dir, err := os.MkdirTemp("", "engine-compact-range-test-*")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
defer os.RemoveAll(dir)
// Create the engine
engine, err := NewEngine(dir)
if err != nil {
t.Fatalf("Failed to create engine: %v", err)
}
// Insert keys with different prefixes
prefixes := []string{"a", "b", "c", "d"}
for _, prefix := range prefixes {
for i := 0; i < 10; i++ {
key := []byte(fmt.Sprintf("%s-key-%d", prefix, i))
value := []byte(fmt.Sprintf("%s-value-%d", prefix, i))
if err := engine.Put(key, value); err != nil {
t.Fatalf("Failed to put key-value: %v", err)
}
}
// Force a flush after each prefix
if err := engine.FlushImMemTables(); err != nil {
t.Fatalf("Failed to flush memtables: %v", err)
}
}
// Compact only the range with prefix "b"
startKey := []byte("b")
endKey := []byte("c")
if err := engine.CompactRange(startKey, endKey); err != nil {
t.Fatalf("Failed to compact range: %v", err)
}
// Sleep to give compaction time to complete
time.Sleep(200 * time.Millisecond)
// Verify that all keys are still accessible
for _, prefix := range prefixes {
for i := 0; i < 10; i++ {
key := []byte(fmt.Sprintf("%s-key-%d", prefix, i))
expectedValue := []byte(fmt.Sprintf("%s-value-%d", prefix, i))
value, err := engine.Get(key)
if err != nil {
t.Errorf("Failed to get key %s: %v", key, err)
continue
}
if !bytes.Equal(value, expectedValue) {
t.Errorf("Got incorrect value for key %s. Expected: %s, Got: %s",
string(key), string(expectedValue), string(value))
}
}
}
// Close the engine
if err := engine.Close(); err != nil {
t.Fatalf("Failed to close engine: %v", err)
}
}
func TestEngine_TombstoneHandling(t *testing.T) {
// Create a temp directory for the test
dir, err := os.MkdirTemp("", "engine-tombstone-test-*")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
defer os.RemoveAll(dir)
// Create the engine
engine, err := NewEngine(dir)
if err != nil {
t.Fatalf("Failed to create engine: %v", err)
}
// Insert some keys
for i := 0; i < 10; i++ {
key := []byte(fmt.Sprintf("key-%d", i))
value := []byte(fmt.Sprintf("value-%d", i))
if err := engine.Put(key, value); err != nil {
t.Fatalf("Failed to put key-value: %v", err)
}
}
// Flush to create an SSTable
if err := engine.FlushImMemTables(); err != nil {
t.Fatalf("Failed to flush memtables: %v", err)
}
// Delete some keys
for i := 0; i < 5; i++ {
key := []byte(fmt.Sprintf("key-%d", i))
if err := engine.Delete(key); err != nil {
t.Fatalf("Failed to delete key: %v", err)
}
}
// Flush again to create another SSTable with tombstones
if err := engine.FlushImMemTables(); err != nil {
t.Fatalf("Failed to flush memtables: %v", err)
}
// Count the number of SSTable files before compaction
sstableFiles, err := filepath.Glob(filepath.Join(engine.sstableDir, "*.sst"))
if err != nil {
t.Fatalf("Failed to list SSTable files: %v", err)
}
// Log how many files we have before compaction
t.Logf("Number of SSTable files before compaction: %d", len(sstableFiles))
// Trigger compaction
if err := engine.TriggerCompaction(); err != nil {
t.Fatalf("Failed to trigger compaction: %v", err)
}
// Sleep to give compaction time to complete
time.Sleep(200 * time.Millisecond)
// Verify deleted keys are still not accessible
for i := 0; i < 5; i++ {
key := []byte(fmt.Sprintf("key-%d", i))
_, err := engine.Get(key)
if err != ErrKeyNotFound {
t.Errorf("Expected key %s to be deleted, but got: %v", key, err)
}
}
// Verify non-deleted keys are still accessible
for i := 5; i < 10; i++ {
key := []byte(fmt.Sprintf("key-%d", i))
expectedValue := []byte(fmt.Sprintf("value-%d", i))
value, err := engine.Get(key)
if err != nil {
t.Errorf("Failed to get key %s: %v", key, err)
continue
}
if !bytes.Equal(value, expectedValue) {
t.Errorf("Got incorrect value for key %s. Expected: %s, Got: %s",
string(key), string(expectedValue), string(value))
}
}
// Close the engine
if err := engine.Close(); err != nil {
t.Fatalf("Failed to close engine: %v", err)
}
}


@@ -9,6 +9,7 @@ import (
"sync/atomic"
"time"
"git.canoozie.net/jer/go-storage/pkg/compaction"
"git.canoozie.net/jer/go-storage/pkg/config"
"git.canoozie.net/jer/go-storage/pkg/memtable"
"git.canoozie.net/jer/go-storage/pkg/sstable"
@@ -45,6 +46,9 @@ type Engine struct {
// Storage layer
sstables []*sstable.Reader
// Compaction
compactionMgr *compaction.CompactionManager
// State management
nextFileNum uint64
lastSeqNum uint64
@@ -118,6 +122,11 @@ func NewEngine(dataDir string) (*Engine, error) {
// Start background flush goroutine
go e.backgroundFlush()
// Initialize compaction
if err := e.setupCompaction(); err != nil {
return nil, fmt.Errorf("failed to set up compaction: %w", err)
}
return e, nil
}
@@ -174,6 +183,10 @@ func (e *Engine) Get(key []byte) ([]byte, error) {
for i := len(e.sstables) - 1; i >= 0; i-- {
val, err := e.sstables[i].Get(key)
if err == nil {
// If val is nil, it's a tombstone marker - the key exists but was deleted
if val == nil {
return nil, ErrKeyNotFound
}
return val, nil
}
if !errors.Is(err, sstable.ErrNotFound) {
@@ -352,6 +365,9 @@ func (e *Engine) flushMemTable(mem *memtable.MemTable) error {
e.mu.Lock()
e.sstables = append(e.sstables, reader)
e.mu.Unlock()
// Maybe trigger compaction after flushing
e.maybeScheduleCompaction()
return nil
}
@@ -452,6 +468,11 @@ func (e *Engine) Close() error {
e.mu.Lock()
defer e.mu.Unlock()
// Shutdown compaction manager
if err := e.shutdownCompaction(); err != nil {
return fmt.Errorf("failed to shutdown compaction: %w", err)
}
// Close WAL first
if err := e.wal.Close(); err != nil {
return fmt.Errorf("failed to close WAL: %w", err)


@@ -163,4 +163,19 @@ func (p *MemTablePool) GetMemTables() []*MemTable {
result = append(result, p.active)
result = append(result, p.immutables...)
return result
}
// TotalSize returns the total approximate size of all memtables in the pool
func (p *MemTablePool) TotalSize() int64 {
p.mu.RLock()
defer p.mu.RUnlock()
var total int64
total += p.active.ApproximateSize()
for _, m := range p.immutables {
total += m.ApproximateSize()
}
return total
}

pkg/sstable/reader.go

@@ -0,0 +1,9 @@
package sstable
// GetKeyCount returns the estimated number of keys in the SSTable
func (r *Reader) GetKeyCount() int {
r.mu.RLock()
defer r.mu.RUnlock()
return int(r.numEntries)
}