From 001934e7b59c4a4aa7323846e63f4f54a42e8697 Mon Sep 17 00:00:00 2001 From: Jeremy Tregunna Date: Mon, 21 Apr 2025 15:59:10 -0600 Subject: [PATCH] refactor: optimize two pass deduping into a single pass across the codebase --- pkg/common/iterator/composite/hierarchical.go | 129 ++++++++-------- pkg/engine/engine.go | 33 ++--- pkg/engine/iterator.go | 138 ++++++------------ 3 files changed, 125 insertions(+), 175 deletions(-) diff --git a/pkg/common/iterator/composite/hierarchical.go b/pkg/common/iterator/composite/hierarchical.go index 29520ff..f8d713f 100644 --- a/pkg/common/iterator/composite/hierarchical.go +++ b/pkg/common/iterator/composite/hierarchical.go @@ -94,48 +94,54 @@ func (h *HierarchicalIterator) Seek(target []byte) bool { iter.Seek(target) } - // For seek, we need to treat it differently than findNextUniqueKey since we want - // keys >= target, not strictly > target - var minKey []byte - var minValue []byte - var seenKeys = make(map[string]bool) + // For seek, we need to find the smallest key >= target + var bestKey []byte + var bestValue []byte + var bestIterIdx int = -1 h.valid = false - // Find the smallest key >= target from all iterators - for _, iter := range h.iterators { + // First pass: find the smallest key >= target + for i, iter := range h.iterators { if !iter.Valid() { continue } key := iter.Key() - value := iter.Value() // Skip keys < target (Seek should return keys >= target) if bytes.Compare(key, target) < 0 { continue } - // Convert key to string for map lookup - keyStr := string(key) - - // Only use this key if we haven't seen it from a newer iterator - if !seenKeys[keyStr] { - // Mark as seen - seenKeys[keyStr] = true - - // Update min key if needed - if minKey == nil || bytes.Compare(key, minKey) < 0 { - minKey = key - minValue = value - h.valid = true - } + // If we haven't found a valid key yet, or this key is smaller than the current best key + if bestIterIdx == -1 || bytes.Compare(key, bestKey) < 0 { + // This becomes our best candidate so far + bestKey = key + bestValue = iter.Value() + bestIterIdx = i } } - // Set the found key/value - if h.valid { - h.key = minKey - h.value = minValue + // Now we need to check if any newer iterators have the same key + if bestIterIdx != -1 { + // Check all newer iterators (earlier in the slice) for the same key + for i := 0; i < bestIterIdx; i++ { + iter := h.iterators[i] + if !iter.Valid() { + continue + } + + // If a newer iterator has the same key, use its value + if bytes.Equal(iter.Key(), bestKey) { + bestValue = iter.Value() + break // Since iterators are in newest-to-oldest order, we can stop at the first match + } + } + + // Set the found key/value + h.key = bestKey + h.value = bestValue + h.valid = true return true } @@ -218,23 +224,20 @@ func (h *HierarchicalIterator) GetSourceIterators() []iterator.Iterator { // Returns true if a valid key was found func (h *HierarchicalIterator) findNextUniqueKey(prevKey []byte) bool { // Find the smallest key among all iterators that is > prevKey - var minKey []byte - var minValue []byte - var seenKeys = make(map[string]bool) + var bestKey []byte + var bestValue []byte + var bestIterIdx int = -1 h.valid = false - // First pass: collect all valid keys and find min key > prevKey - for _, iter := range h.iterators { + // First pass: advance all iterators past prevKey and find the smallest next key + for i, iter := range h.iterators { // Skip invalid iterators if !iter.Valid() { continue } - key := iter.Key() - value := iter.Value() - // Skip keys <= prevKey if we're looking for the next key - if prevKey != nil && bytes.Compare(key, prevKey) <= 0 { + if prevKey != nil && bytes.Compare(iter.Key(), prevKey) <= 0 { // Advance to find a key > prevKey for iter.Valid() && bytes.Compare(iter.Key(), prevKey) <= 0 { if !iter.Next() { @@ -246,38 +249,40 @@ func (h *HierarchicalIterator) findNextUniqueKey(prevKey []byte) bool { if !iter.Valid() { continue } - - // Get the new key after advancing - key = iter.Key() - value = iter.Value() - - // If key is still <= prevKey after advancing, skip this iterator - if bytes.Compare(key, prevKey) <= 0 { - continue - } } - // Convert key to string for map lookup - keyStr := string(key) - - // If this key hasn't been seen before, or this is a newer source for the same key - if !seenKeys[keyStr] { - // Mark this key as seen - it's from the newest source - seenKeys[keyStr] = true - - // Check if this is a new minimum key - if minKey == nil || bytes.Compare(key, minKey) < 0 { - minKey = key - minValue = value - h.valid = true - } + // Get the current key + key := iter.Key() + + // If we haven't found a valid key yet, or this key is smaller than the current best key + if bestIterIdx == -1 || bytes.Compare(key, bestKey) < 0 { + // This becomes our best candidate so far + bestKey = key + bestValue = iter.Value() + bestIterIdx = i } } - // Set the key/value if we found a valid one - if h.valid { - h.key = minKey - h.value = minValue + // Now we need to check if any newer iterators have the same key + if bestIterIdx != -1 { + // Check all newer iterators (earlier in the slice) for the same key + for i := 0; i < bestIterIdx; i++ { + iter := h.iterators[i] + if !iter.Valid() { + continue + } + + // If a newer iterator has the same key, use its value + if bytes.Equal(iter.Key(), bestKey) { + bestValue = iter.Value() + break // Since iterators are in newest-to-oldest order, we can stop at the first match + } + } + + // Set the found key/value + h.key = bestKey + h.value = bestValue + h.valid = true return true } diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index d46e010..a7da6e3 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -509,26 +509,26 @@ func (e *Engine) flushMemTable(mem *memtable.MemTable) error { iter := mem.NewIterator() count := 0 var bytesWritten uint64 - - // Since MemTable's skiplist iterator returns keys in sorted order, - // and the Find() method already returns the most recent entry for each key, - // we can deduplicate in a single pass by tracking the last seen key - var lastKey []byte - - // Process entries in a single pass + + // Since memtable's skiplist returns keys in sorted order, + // but possibly with duplicates (newer versions of same key first), + // we need to track all processed keys (including tombstones) + var processedKeys = make(map[string]struct{}) + for iter.SeekToFirst(); iter.Valid(); iter.Next() { key := iter.Key() + keyStr := string(key) // Use as map key - // Skip deletion markers, only add value entries + // Skip keys we've already processed (including tombstones) + if _, seen := processedKeys[keyStr]; seen { + continue + } + + // Mark this key as processed regardless of whether it's a value or tombstone + processedKeys[keyStr] = struct{}{} + + // Only write non-tombstone entries to the SSTable if value := iter.Value(); value != nil { - // Skip duplicate keys (only add each key once) - // The MemTable iterator returns multiple values for the same key - // but we only need the first one we encounter for each key, which is the latest - if lastKey != nil && bytes.Equal(key, lastKey) { - continue - } - - // Add to SSTable bytesWritten += uint64(len(key) + len(value)) if err := writer.Add(key, value); err != nil { writer.Abort() @@ -536,7 +536,6 @@ func (e *Engine) flushMemTable(mem *memtable.MemTable) error { return fmt.Errorf("failed to add entry to SSTable: %w", err) } count++ - lastKey = append(lastKey[:0], key...) } } diff --git a/pkg/engine/iterator.go b/pkg/engine/iterator.go index 8d5a4d0..b066f06 100644 --- a/pkg/engine/iterator.go +++ b/pkg/engine/iterator.go @@ -440,41 +440,23 @@ func (c *chainedIterator) SeekToFirst() { iter.SeekToFirst() } - // Maps to track the best (newest) source for each key - keyToSource := make(map[string]int) // Key -> best source index - keyToLevel := make(map[string]int) // Key -> best source level (lower is better) - keyToPos := make(map[string][]byte) // Key -> binary key value (for ordering) - - // First pass: Find the best source for each key + // Find the iterator with the smallest key from the newest source + c.current = -1 + + // Find the smallest valid key for i, iter := range c.iterators { if !iter.Valid() { continue } - - // Use string key for map - keyStr := string(iter.Key()) - keyBytes := iter.Key() - level := c.sources[i].GetLevel() - - // If we haven't seen this key yet, or this source is newer - bestLevel, seen := keyToLevel[keyStr] - if !seen || level < bestLevel { - keyToSource[keyStr] = i - keyToLevel[keyStr] = level - keyToPos[keyStr] = keyBytes - } - } - - // Find the smallest key in our deduplicated set - c.current = -1 - var smallestKey []byte - - for keyStr, sourceIdx := range keyToSource { - keyBytes := keyToPos[keyStr] - - if c.current == -1 || bytes.Compare(keyBytes, smallestKey) < 0 { - c.current = sourceIdx - smallestKey = keyBytes + + // If we haven't found a key yet, or this key is smaller than the current smallest + if c.current == -1 || bytes.Compare(iter.Key(), c.iterators[c.current].Key()) < 0 { + c.current = i + } else if bytes.Equal(iter.Key(), c.iterators[c.current].Key()) { + // If keys are equal, prefer the newer source (lower level) + if c.sources[i].GetLevel() < c.sources[c.current].GetLevel() { + c.current = i + } } } } @@ -515,41 +497,23 @@ func (c *chainedIterator) Seek(target []byte) bool { iter.Seek(target) } - // Maps to track the best (newest) source for each key - keyToSource := make(map[string]int) // Key -> best source index - keyToLevel := make(map[string]int) // Key -> best source level (lower is better) - keyToPos := make(map[string][]byte) // Key -> binary key value (for ordering) - - // First pass: Find the best source for each key + // Find the iterator with the smallest key from the newest source + c.current = -1 + + // Find the smallest valid key for i, iter := range c.iterators { if !iter.Valid() { continue } - - // Use string key for map - keyStr := string(iter.Key()) - keyBytes := iter.Key() - level := c.sources[i].GetLevel() - - // If we haven't seen this key yet, or this source is newer - bestLevel, seen := keyToLevel[keyStr] - if !seen || level < bestLevel { - keyToSource[keyStr] = i - keyToLevel[keyStr] = level - keyToPos[keyStr] = keyBytes - } - } - - // Find the smallest key in our deduplicated set - c.current = -1 - var smallestKey []byte - - for keyStr, sourceIdx := range keyToSource { - keyBytes := keyToPos[keyStr] - - if c.current == -1 || bytes.Compare(keyBytes, smallestKey) < 0 { - c.current = sourceIdx - smallestKey = keyBytes + + // If we haven't found a key yet, or this key is smaller than the current smallest + if c.current == -1 || bytes.Compare(iter.Key(), c.iterators[c.current].Key()) < 0 { + c.current = i + } else if bytes.Equal(iter.Key(), c.iterators[c.current].Key()) { + // If keys are equal, prefer the newer source (lower level) + if c.sources[i].GetLevel() < c.sources[c.current].GetLevel() { + c.current = i + } } } @@ -571,46 +535,28 @@ func (c *chainedIterator) Next() bool { } } - // Maps to track the best (newest) source for each key - keyToSource := make(map[string]int) // Key -> best source index - keyToLevel := make(map[string]int) // Key -> best source level (lower is better) - keyToPos := make(map[string][]byte) // Key -> binary key value (for ordering) - - // First pass: Find the best source for each key + // Find the iterator with the smallest key from the newest source + c.current = -1 + + // Find the smallest valid key that is greater than the current key for i, iter := range c.iterators { if !iter.Valid() { continue } - - // Use string key for map - keyStr := string(iter.Key()) - keyBytes := iter.Key() - level := c.sources[i].GetLevel() - - // If this key is the same as current, skip it - if bytes.Equal(keyBytes, currentKey) { + + // Skip if the key is the same as the current key (we've already advanced past it) + if bytes.Equal(iter.Key(), currentKey) { continue } - - // If we haven't seen this key yet, or this source is newer - bestLevel, seen := keyToLevel[keyStr] - if !seen || level < bestLevel { - keyToSource[keyStr] = i - keyToLevel[keyStr] = level - keyToPos[keyStr] = keyBytes - } - } - - // Find the smallest key in our deduplicated set - c.current = -1 - var smallestKey []byte - - for keyStr, sourceIdx := range keyToSource { - keyBytes := keyToPos[keyStr] - - if c.current == -1 || bytes.Compare(keyBytes, smallestKey) < 0 { - c.current = sourceIdx - smallestKey = keyBytes + + // If we haven't found a key yet, or this key is smaller than the current smallest + if c.current == -1 || bytes.Compare(iter.Key(), c.iterators[c.current].Key()) < 0 { + c.current = i + } else if bytes.Equal(iter.Key(), c.iterators[c.current].Key()) { + // If keys are equal, prefer the newer source (lower level) + if c.sources[i].GetLevel() < c.sources[c.current].GetLevel() { + c.current = i + } } }