refactor: optimize two pass deduping into a single pass across the codebase
All checks were successful
Go Tests / Run Tests (1.24.2) (push) Successful in 9m38s

This commit is contained in:
Jeremy Tregunna 2025-04-21 15:59:10 -06:00
parent 1a0de922af
commit 001934e7b5
Signed by: jer
GPG Key ID: 1278B36BA6F5D5E4
3 changed files with 125 additions and 175 deletions

View File

@ -94,48 +94,54 @@ func (h *HierarchicalIterator) Seek(target []byte) bool {
iter.Seek(target)
}
// For seek, we need to treat it differently than findNextUniqueKey since we want
// keys >= target, not strictly > target
var minKey []byte
var minValue []byte
var seenKeys = make(map[string]bool)
// For seek, we need to find the smallest key >= target
var bestKey []byte
var bestValue []byte
var bestIterIdx int = -1
h.valid = false
// Find the smallest key >= target from all iterators
for _, iter := range h.iterators {
// First pass: find the smallest key >= target
for i, iter := range h.iterators {
if !iter.Valid() {
continue
}
key := iter.Key()
value := iter.Value()
// Skip keys < target (Seek should return keys >= target)
if bytes.Compare(key, target) < 0 {
continue
}
// Convert key to string for map lookup
keyStr := string(key)
// Only use this key if we haven't seen it from a newer iterator
if !seenKeys[keyStr] {
// Mark as seen
seenKeys[keyStr] = true
// Update min key if needed
if minKey == nil || bytes.Compare(key, minKey) < 0 {
minKey = key
minValue = value
h.valid = true
}
// If we haven't found a valid key yet, or this key is smaller than the current best key
if bestIterIdx == -1 || bytes.Compare(key, bestKey) < 0 {
// This becomes our best candidate so far
bestKey = key
bestValue = iter.Value()
bestIterIdx = i
}
}
// Set the found key/value
if h.valid {
h.key = minKey
h.value = minValue
// Now we need to check if any newer iterators have the same key
if bestIterIdx != -1 {
// Check all newer iterators (earlier in the slice) for the same key
for i := 0; i < bestIterIdx; i++ {
iter := h.iterators[i]
if !iter.Valid() {
continue
}
// If a newer iterator has the same key, use its value
if bytes.Equal(iter.Key(), bestKey) {
bestValue = iter.Value()
break // Since iterators are in newest-to-oldest order, we can stop at the first match
}
}
// Set the found key/value
h.key = bestKey
h.value = bestValue
h.valid = true
return true
}
@ -218,23 +224,20 @@ func (h *HierarchicalIterator) GetSourceIterators() []iterator.Iterator {
// Returns true if a valid key was found
func (h *HierarchicalIterator) findNextUniqueKey(prevKey []byte) bool {
// Find the smallest key among all iterators that is > prevKey
var minKey []byte
var minValue []byte
var seenKeys = make(map[string]bool)
var bestKey []byte
var bestValue []byte
var bestIterIdx int = -1
h.valid = false
// First pass: collect all valid keys and find min key > prevKey
for _, iter := range h.iterators {
// First pass: advance all iterators past prevKey and find the smallest next key
for i, iter := range h.iterators {
// Skip invalid iterators
if !iter.Valid() {
continue
}
key := iter.Key()
value := iter.Value()
// Skip keys <= prevKey if we're looking for the next key
if prevKey != nil && bytes.Compare(key, prevKey) <= 0 {
if prevKey != nil && bytes.Compare(iter.Key(), prevKey) <= 0 {
// Advance to find a key > prevKey
for iter.Valid() && bytes.Compare(iter.Key(), prevKey) <= 0 {
if !iter.Next() {
@ -246,38 +249,40 @@ func (h *HierarchicalIterator) findNextUniqueKey(prevKey []byte) bool {
if !iter.Valid() {
continue
}
// Get the new key after advancing
key = iter.Key()
value = iter.Value()
// If key is still <= prevKey after advancing, skip this iterator
if bytes.Compare(key, prevKey) <= 0 {
continue
}
}
// Convert key to string for map lookup
keyStr := string(key)
// If this key hasn't been seen before, or this is a newer source for the same key
if !seenKeys[keyStr] {
// Mark this key as seen - it's from the newest source
seenKeys[keyStr] = true
// Check if this is a new minimum key
if minKey == nil || bytes.Compare(key, minKey) < 0 {
minKey = key
minValue = value
h.valid = true
}
// Get the current key
key := iter.Key()
// If we haven't found a valid key yet, or this key is smaller than the current best key
if bestIterIdx == -1 || bytes.Compare(key, bestKey) < 0 {
// This becomes our best candidate so far
bestKey = key
bestValue = iter.Value()
bestIterIdx = i
}
}
// Set the key/value if we found a valid one
if h.valid {
h.key = minKey
h.value = minValue
// Now we need to check if any newer iterators have the same key
if bestIterIdx != -1 {
// Check all newer iterators (earlier in the slice) for the same key
for i := 0; i < bestIterIdx; i++ {
iter := h.iterators[i]
if !iter.Valid() {
continue
}
// If a newer iterator has the same key, use its value
if bytes.Equal(iter.Key(), bestKey) {
bestValue = iter.Value()
break // Since iterators are in newest-to-oldest order, we can stop at the first match
}
}
// Set the found key/value
h.key = bestKey
h.value = bestValue
h.valid = true
return true
}

View File

@ -509,26 +509,26 @@ func (e *Engine) flushMemTable(mem *memtable.MemTable) error {
iter := mem.NewIterator()
count := 0
var bytesWritten uint64
// Since MemTable's skiplist iterator returns keys in sorted order,
// and the Find() method already returns the most recent entry for each key,
// we can deduplicate in a single pass by tracking the last seen key
var lastKey []byte
// Process entries in a single pass
// Since memtable's skiplist returns keys in sorted order,
// but possibly with duplicates (newer versions of same key first),
// we need to track all processed keys (including tombstones)
var processedKeys = make(map[string]struct{})
for iter.SeekToFirst(); iter.Valid(); iter.Next() {
key := iter.Key()
keyStr := string(key) // Use as map key
// Skip deletion markers, only add value entries
// Skip keys we've already processed (including tombstones)
if _, seen := processedKeys[keyStr]; seen {
continue
}
// Mark this key as processed regardless of whether it's a value or tombstone
processedKeys[keyStr] = struct{}{}
// Only write non-tombstone entries to the SSTable
if value := iter.Value(); value != nil {
// Skip duplicate keys (only add each key once)
// The MemTable iterator returns multiple values for the same key
// but we only need the first one we encounter for each key, which is the latest
if lastKey != nil && bytes.Equal(key, lastKey) {
continue
}
// Add to SSTable
bytesWritten += uint64(len(key) + len(value))
if err := writer.Add(key, value); err != nil {
writer.Abort()
@ -536,7 +536,6 @@ func (e *Engine) flushMemTable(mem *memtable.MemTable) error {
return fmt.Errorf("failed to add entry to SSTable: %w", err)
}
count++
lastKey = append(lastKey[:0], key...)
}
}

View File

@ -440,41 +440,23 @@ func (c *chainedIterator) SeekToFirst() {
iter.SeekToFirst()
}
// Maps to track the best (newest) source for each key
keyToSource := make(map[string]int) // Key -> best source index
keyToLevel := make(map[string]int) // Key -> best source level (lower is better)
keyToPos := make(map[string][]byte) // Key -> binary key value (for ordering)
// First pass: Find the best source for each key
// Find the iterator with the smallest key from the newest source
c.current = -1
// Find the smallest valid key
for i, iter := range c.iterators {
if !iter.Valid() {
continue
}
// Use string key for map
keyStr := string(iter.Key())
keyBytes := iter.Key()
level := c.sources[i].GetLevel()
// If we haven't seen this key yet, or this source is newer
bestLevel, seen := keyToLevel[keyStr]
if !seen || level < bestLevel {
keyToSource[keyStr] = i
keyToLevel[keyStr] = level
keyToPos[keyStr] = keyBytes
}
}
// Find the smallest key in our deduplicated set
c.current = -1
var smallestKey []byte
for keyStr, sourceIdx := range keyToSource {
keyBytes := keyToPos[keyStr]
if c.current == -1 || bytes.Compare(keyBytes, smallestKey) < 0 {
c.current = sourceIdx
smallestKey = keyBytes
// If we haven't found a key yet, or this key is smaller than the current smallest
if c.current == -1 || bytes.Compare(iter.Key(), c.iterators[c.current].Key()) < 0 {
c.current = i
} else if bytes.Equal(iter.Key(), c.iterators[c.current].Key()) {
// If keys are equal, prefer the newer source (lower level)
if c.sources[i].GetLevel() < c.sources[c.current].GetLevel() {
c.current = i
}
}
}
}
@ -515,41 +497,23 @@ func (c *chainedIterator) Seek(target []byte) bool {
iter.Seek(target)
}
// Maps to track the best (newest) source for each key
keyToSource := make(map[string]int) // Key -> best source index
keyToLevel := make(map[string]int) // Key -> best source level (lower is better)
keyToPos := make(map[string][]byte) // Key -> binary key value (for ordering)
// First pass: Find the best source for each key
// Find the iterator with the smallest key from the newest source
c.current = -1
// Find the smallest valid key
for i, iter := range c.iterators {
if !iter.Valid() {
continue
}
// Use string key for map
keyStr := string(iter.Key())
keyBytes := iter.Key()
level := c.sources[i].GetLevel()
// If we haven't seen this key yet, or this source is newer
bestLevel, seen := keyToLevel[keyStr]
if !seen || level < bestLevel {
keyToSource[keyStr] = i
keyToLevel[keyStr] = level
keyToPos[keyStr] = keyBytes
}
}
// Find the smallest key in our deduplicated set
c.current = -1
var smallestKey []byte
for keyStr, sourceIdx := range keyToSource {
keyBytes := keyToPos[keyStr]
if c.current == -1 || bytes.Compare(keyBytes, smallestKey) < 0 {
c.current = sourceIdx
smallestKey = keyBytes
// If we haven't found a key yet, or this key is smaller than the current smallest
if c.current == -1 || bytes.Compare(iter.Key(), c.iterators[c.current].Key()) < 0 {
c.current = i
} else if bytes.Equal(iter.Key(), c.iterators[c.current].Key()) {
// If keys are equal, prefer the newer source (lower level)
if c.sources[i].GetLevel() < c.sources[c.current].GetLevel() {
c.current = i
}
}
}
@ -571,46 +535,28 @@ func (c *chainedIterator) Next() bool {
}
}
// Maps to track the best (newest) source for each key
keyToSource := make(map[string]int) // Key -> best source index
keyToLevel := make(map[string]int) // Key -> best source level (lower is better)
keyToPos := make(map[string][]byte) // Key -> binary key value (for ordering)
// First pass: Find the best source for each key
// Find the iterator with the smallest key from the newest source
c.current = -1
// Find the smallest valid key that is greater than the current key
for i, iter := range c.iterators {
if !iter.Valid() {
continue
}
// Use string key for map
keyStr := string(iter.Key())
keyBytes := iter.Key()
level := c.sources[i].GetLevel()
// If this key is the same as current, skip it
if bytes.Equal(keyBytes, currentKey) {
// Skip if the key is the same as the current key (we've already advanced past it)
if bytes.Equal(iter.Key(), currentKey) {
continue
}
// If we haven't seen this key yet, or this source is newer
bestLevel, seen := keyToLevel[keyStr]
if !seen || level < bestLevel {
keyToSource[keyStr] = i
keyToLevel[keyStr] = level
keyToPos[keyStr] = keyBytes
}
}
// Find the smallest key in our deduplicated set
c.current = -1
var smallestKey []byte
for keyStr, sourceIdx := range keyToSource {
keyBytes := keyToPos[keyStr]
if c.current == -1 || bytes.Compare(keyBytes, smallestKey) < 0 {
c.current = sourceIdx
smallestKey = keyBytes
// If we haven't found a key yet, or this key is smaller than the current smallest
if c.current == -1 || bytes.Compare(iter.Key(), c.iterators[c.current].Key()) < 0 {
c.current = i
} else if bytes.Equal(iter.Key(), c.iterators[c.current].Key()) {
// If keys are equal, prefer the newer source (lower level)
if c.sources[i].GetLevel() < c.sources[c.current].GetLevel() {
c.current = i
}
}
}