refactor: optimize two pass deduping into a single pass across the codebase
All checks were successful
Go Tests / Run Tests (1.24.2) (push) Successful in 9m38s
All checks were successful
Go Tests / Run Tests (1.24.2) (push) Successful in 9m38s
This commit is contained in:
parent
1a0de922af
commit
001934e7b5
@ -94,48 +94,54 @@ func (h *HierarchicalIterator) Seek(target []byte) bool {
|
||||
iter.Seek(target)
|
||||
}
|
||||
|
||||
// For seek, we need to treat it differently than findNextUniqueKey since we want
|
||||
// keys >= target, not strictly > target
|
||||
var minKey []byte
|
||||
var minValue []byte
|
||||
var seenKeys = make(map[string]bool)
|
||||
// For seek, we need to find the smallest key >= target
|
||||
var bestKey []byte
|
||||
var bestValue []byte
|
||||
var bestIterIdx int = -1
|
||||
h.valid = false
|
||||
|
||||
// Find the smallest key >= target from all iterators
|
||||
for _, iter := range h.iterators {
|
||||
// First pass: find the smallest key >= target
|
||||
for i, iter := range h.iterators {
|
||||
if !iter.Valid() {
|
||||
continue
|
||||
}
|
||||
|
||||
key := iter.Key()
|
||||
value := iter.Value()
|
||||
|
||||
// Skip keys < target (Seek should return keys >= target)
|
||||
if bytes.Compare(key, target) < 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Convert key to string for map lookup
|
||||
keyStr := string(key)
|
||||
|
||||
// Only use this key if we haven't seen it from a newer iterator
|
||||
if !seenKeys[keyStr] {
|
||||
// Mark as seen
|
||||
seenKeys[keyStr] = true
|
||||
|
||||
// Update min key if needed
|
||||
if minKey == nil || bytes.Compare(key, minKey) < 0 {
|
||||
minKey = key
|
||||
minValue = value
|
||||
h.valid = true
|
||||
}
|
||||
// If we haven't found a valid key yet, or this key is smaller than the current best key
|
||||
if bestIterIdx == -1 || bytes.Compare(key, bestKey) < 0 {
|
||||
// This becomes our best candidate so far
|
||||
bestKey = key
|
||||
bestValue = iter.Value()
|
||||
bestIterIdx = i
|
||||
}
|
||||
}
|
||||
|
||||
// Set the found key/value
|
||||
if h.valid {
|
||||
h.key = minKey
|
||||
h.value = minValue
|
||||
// Now we need to check if any newer iterators have the same key
|
||||
if bestIterIdx != -1 {
|
||||
// Check all newer iterators (earlier in the slice) for the same key
|
||||
for i := 0; i < bestIterIdx; i++ {
|
||||
iter := h.iterators[i]
|
||||
if !iter.Valid() {
|
||||
continue
|
||||
}
|
||||
|
||||
// If a newer iterator has the same key, use its value
|
||||
if bytes.Equal(iter.Key(), bestKey) {
|
||||
bestValue = iter.Value()
|
||||
break // Since iterators are in newest-to-oldest order, we can stop at the first match
|
||||
}
|
||||
}
|
||||
|
||||
// Set the found key/value
|
||||
h.key = bestKey
|
||||
h.value = bestValue
|
||||
h.valid = true
|
||||
return true
|
||||
}
|
||||
|
||||
@ -218,23 +224,20 @@ func (h *HierarchicalIterator) GetSourceIterators() []iterator.Iterator {
|
||||
// Returns true if a valid key was found
|
||||
func (h *HierarchicalIterator) findNextUniqueKey(prevKey []byte) bool {
|
||||
// Find the smallest key among all iterators that is > prevKey
|
||||
var minKey []byte
|
||||
var minValue []byte
|
||||
var seenKeys = make(map[string]bool)
|
||||
var bestKey []byte
|
||||
var bestValue []byte
|
||||
var bestIterIdx int = -1
|
||||
h.valid = false
|
||||
|
||||
// First pass: collect all valid keys and find min key > prevKey
|
||||
for _, iter := range h.iterators {
|
||||
// First pass: advance all iterators past prevKey and find the smallest next key
|
||||
for i, iter := range h.iterators {
|
||||
// Skip invalid iterators
|
||||
if !iter.Valid() {
|
||||
continue
|
||||
}
|
||||
|
||||
key := iter.Key()
|
||||
value := iter.Value()
|
||||
|
||||
// Skip keys <= prevKey if we're looking for the next key
|
||||
if prevKey != nil && bytes.Compare(key, prevKey) <= 0 {
|
||||
if prevKey != nil && bytes.Compare(iter.Key(), prevKey) <= 0 {
|
||||
// Advance to find a key > prevKey
|
||||
for iter.Valid() && bytes.Compare(iter.Key(), prevKey) <= 0 {
|
||||
if !iter.Next() {
|
||||
@ -246,38 +249,40 @@ func (h *HierarchicalIterator) findNextUniqueKey(prevKey []byte) bool {
|
||||
if !iter.Valid() {
|
||||
continue
|
||||
}
|
||||
|
||||
// Get the new key after advancing
|
||||
key = iter.Key()
|
||||
value = iter.Value()
|
||||
|
||||
// If key is still <= prevKey after advancing, skip this iterator
|
||||
if bytes.Compare(key, prevKey) <= 0 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Convert key to string for map lookup
|
||||
keyStr := string(key)
|
||||
|
||||
// If this key hasn't been seen before, or this is a newer source for the same key
|
||||
if !seenKeys[keyStr] {
|
||||
// Mark this key as seen - it's from the newest source
|
||||
seenKeys[keyStr] = true
|
||||
|
||||
// Check if this is a new minimum key
|
||||
if minKey == nil || bytes.Compare(key, minKey) < 0 {
|
||||
minKey = key
|
||||
minValue = value
|
||||
h.valid = true
|
||||
}
|
||||
// Get the current key
|
||||
key := iter.Key()
|
||||
|
||||
// If we haven't found a valid key yet, or this key is smaller than the current best key
|
||||
if bestIterIdx == -1 || bytes.Compare(key, bestKey) < 0 {
|
||||
// This becomes our best candidate so far
|
||||
bestKey = key
|
||||
bestValue = iter.Value()
|
||||
bestIterIdx = i
|
||||
}
|
||||
}
|
||||
|
||||
// Set the key/value if we found a valid one
|
||||
if h.valid {
|
||||
h.key = minKey
|
||||
h.value = minValue
|
||||
// Now we need to check if any newer iterators have the same key
|
||||
if bestIterIdx != -1 {
|
||||
// Check all newer iterators (earlier in the slice) for the same key
|
||||
for i := 0; i < bestIterIdx; i++ {
|
||||
iter := h.iterators[i]
|
||||
if !iter.Valid() {
|
||||
continue
|
||||
}
|
||||
|
||||
// If a newer iterator has the same key, use its value
|
||||
if bytes.Equal(iter.Key(), bestKey) {
|
||||
bestValue = iter.Value()
|
||||
break // Since iterators are in newest-to-oldest order, we can stop at the first match
|
||||
}
|
||||
}
|
||||
|
||||
// Set the found key/value
|
||||
h.key = bestKey
|
||||
h.value = bestValue
|
||||
h.valid = true
|
||||
return true
|
||||
}
|
||||
|
||||
|
@ -509,26 +509,26 @@ func (e *Engine) flushMemTable(mem *memtable.MemTable) error {
|
||||
iter := mem.NewIterator()
|
||||
count := 0
|
||||
var bytesWritten uint64
|
||||
|
||||
// Since MemTable's skiplist iterator returns keys in sorted order,
|
||||
// and the Find() method already returns the most recent entry for each key,
|
||||
// we can deduplicate in a single pass by tracking the last seen key
|
||||
var lastKey []byte
|
||||
|
||||
// Process entries in a single pass
|
||||
|
||||
// Since memtable's skiplist returns keys in sorted order,
|
||||
// but possibly with duplicates (newer versions of same key first),
|
||||
// we need to track all processed keys (including tombstones)
|
||||
var processedKeys = make(map[string]struct{})
|
||||
|
||||
for iter.SeekToFirst(); iter.Valid(); iter.Next() {
|
||||
key := iter.Key()
|
||||
keyStr := string(key) // Use as map key
|
||||
|
||||
// Skip deletion markers, only add value entries
|
||||
// Skip keys we've already processed (including tombstones)
|
||||
if _, seen := processedKeys[keyStr]; seen {
|
||||
continue
|
||||
}
|
||||
|
||||
// Mark this key as processed regardless of whether it's a value or tombstone
|
||||
processedKeys[keyStr] = struct{}{}
|
||||
|
||||
// Only write non-tombstone entries to the SSTable
|
||||
if value := iter.Value(); value != nil {
|
||||
// Skip duplicate keys (only add each key once)
|
||||
// The MemTable iterator returns multiple values for the same key
|
||||
// but we only need the first one we encounter for each key, which is the latest
|
||||
if lastKey != nil && bytes.Equal(key, lastKey) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Add to SSTable
|
||||
bytesWritten += uint64(len(key) + len(value))
|
||||
if err := writer.Add(key, value); err != nil {
|
||||
writer.Abort()
|
||||
@ -536,7 +536,6 @@ func (e *Engine) flushMemTable(mem *memtable.MemTable) error {
|
||||
return fmt.Errorf("failed to add entry to SSTable: %w", err)
|
||||
}
|
||||
count++
|
||||
lastKey = append(lastKey[:0], key...)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -440,41 +440,23 @@ func (c *chainedIterator) SeekToFirst() {
|
||||
iter.SeekToFirst()
|
||||
}
|
||||
|
||||
// Maps to track the best (newest) source for each key
|
||||
keyToSource := make(map[string]int) // Key -> best source index
|
||||
keyToLevel := make(map[string]int) // Key -> best source level (lower is better)
|
||||
keyToPos := make(map[string][]byte) // Key -> binary key value (for ordering)
|
||||
|
||||
// First pass: Find the best source for each key
|
||||
// Find the iterator with the smallest key from the newest source
|
||||
c.current = -1
|
||||
|
||||
// Find the smallest valid key
|
||||
for i, iter := range c.iterators {
|
||||
if !iter.Valid() {
|
||||
continue
|
||||
}
|
||||
|
||||
// Use string key for map
|
||||
keyStr := string(iter.Key())
|
||||
keyBytes := iter.Key()
|
||||
level := c.sources[i].GetLevel()
|
||||
|
||||
// If we haven't seen this key yet, or this source is newer
|
||||
bestLevel, seen := keyToLevel[keyStr]
|
||||
if !seen || level < bestLevel {
|
||||
keyToSource[keyStr] = i
|
||||
keyToLevel[keyStr] = level
|
||||
keyToPos[keyStr] = keyBytes
|
||||
}
|
||||
}
|
||||
|
||||
// Find the smallest key in our deduplicated set
|
||||
c.current = -1
|
||||
var smallestKey []byte
|
||||
|
||||
for keyStr, sourceIdx := range keyToSource {
|
||||
keyBytes := keyToPos[keyStr]
|
||||
|
||||
if c.current == -1 || bytes.Compare(keyBytes, smallestKey) < 0 {
|
||||
c.current = sourceIdx
|
||||
smallestKey = keyBytes
|
||||
|
||||
// If we haven't found a key yet, or this key is smaller than the current smallest
|
||||
if c.current == -1 || bytes.Compare(iter.Key(), c.iterators[c.current].Key()) < 0 {
|
||||
c.current = i
|
||||
} else if bytes.Equal(iter.Key(), c.iterators[c.current].Key()) {
|
||||
// If keys are equal, prefer the newer source (lower level)
|
||||
if c.sources[i].GetLevel() < c.sources[c.current].GetLevel() {
|
||||
c.current = i
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -515,41 +497,23 @@ func (c *chainedIterator) Seek(target []byte) bool {
|
||||
iter.Seek(target)
|
||||
}
|
||||
|
||||
// Maps to track the best (newest) source for each key
|
||||
keyToSource := make(map[string]int) // Key -> best source index
|
||||
keyToLevel := make(map[string]int) // Key -> best source level (lower is better)
|
||||
keyToPos := make(map[string][]byte) // Key -> binary key value (for ordering)
|
||||
|
||||
// First pass: Find the best source for each key
|
||||
// Find the iterator with the smallest key from the newest source
|
||||
c.current = -1
|
||||
|
||||
// Find the smallest valid key
|
||||
for i, iter := range c.iterators {
|
||||
if !iter.Valid() {
|
||||
continue
|
||||
}
|
||||
|
||||
// Use string key for map
|
||||
keyStr := string(iter.Key())
|
||||
keyBytes := iter.Key()
|
||||
level := c.sources[i].GetLevel()
|
||||
|
||||
// If we haven't seen this key yet, or this source is newer
|
||||
bestLevel, seen := keyToLevel[keyStr]
|
||||
if !seen || level < bestLevel {
|
||||
keyToSource[keyStr] = i
|
||||
keyToLevel[keyStr] = level
|
||||
keyToPos[keyStr] = keyBytes
|
||||
}
|
||||
}
|
||||
|
||||
// Find the smallest key in our deduplicated set
|
||||
c.current = -1
|
||||
var smallestKey []byte
|
||||
|
||||
for keyStr, sourceIdx := range keyToSource {
|
||||
keyBytes := keyToPos[keyStr]
|
||||
|
||||
if c.current == -1 || bytes.Compare(keyBytes, smallestKey) < 0 {
|
||||
c.current = sourceIdx
|
||||
smallestKey = keyBytes
|
||||
|
||||
// If we haven't found a key yet, or this key is smaller than the current smallest
|
||||
if c.current == -1 || bytes.Compare(iter.Key(), c.iterators[c.current].Key()) < 0 {
|
||||
c.current = i
|
||||
} else if bytes.Equal(iter.Key(), c.iterators[c.current].Key()) {
|
||||
// If keys are equal, prefer the newer source (lower level)
|
||||
if c.sources[i].GetLevel() < c.sources[c.current].GetLevel() {
|
||||
c.current = i
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -571,46 +535,28 @@ func (c *chainedIterator) Next() bool {
|
||||
}
|
||||
}
|
||||
|
||||
// Maps to track the best (newest) source for each key
|
||||
keyToSource := make(map[string]int) // Key -> best source index
|
||||
keyToLevel := make(map[string]int) // Key -> best source level (lower is better)
|
||||
keyToPos := make(map[string][]byte) // Key -> binary key value (for ordering)
|
||||
|
||||
// First pass: Find the best source for each key
|
||||
// Find the iterator with the smallest key from the newest source
|
||||
c.current = -1
|
||||
|
||||
// Find the smallest valid key that is greater than the current key
|
||||
for i, iter := range c.iterators {
|
||||
if !iter.Valid() {
|
||||
continue
|
||||
}
|
||||
|
||||
// Use string key for map
|
||||
keyStr := string(iter.Key())
|
||||
keyBytes := iter.Key()
|
||||
level := c.sources[i].GetLevel()
|
||||
|
||||
// If this key is the same as current, skip it
|
||||
if bytes.Equal(keyBytes, currentKey) {
|
||||
|
||||
// Skip if the key is the same as the current key (we've already advanced past it)
|
||||
if bytes.Equal(iter.Key(), currentKey) {
|
||||
continue
|
||||
}
|
||||
|
||||
// If we haven't seen this key yet, or this source is newer
|
||||
bestLevel, seen := keyToLevel[keyStr]
|
||||
if !seen || level < bestLevel {
|
||||
keyToSource[keyStr] = i
|
||||
keyToLevel[keyStr] = level
|
||||
keyToPos[keyStr] = keyBytes
|
||||
}
|
||||
}
|
||||
|
||||
// Find the smallest key in our deduplicated set
|
||||
c.current = -1
|
||||
var smallestKey []byte
|
||||
|
||||
for keyStr, sourceIdx := range keyToSource {
|
||||
keyBytes := keyToPos[keyStr]
|
||||
|
||||
if c.current == -1 || bytes.Compare(keyBytes, smallestKey) < 0 {
|
||||
c.current = sourceIdx
|
||||
smallestKey = keyBytes
|
||||
|
||||
// If we haven't found a key yet, or this key is smaller than the current smallest
|
||||
if c.current == -1 || bytes.Compare(iter.Key(), c.iterators[c.current].Key()) < 0 {
|
||||
c.current = i
|
||||
} else if bytes.Equal(iter.Key(), c.iterators[c.current].Key()) {
|
||||
// If keys are equal, prefer the newer source (lower level)
|
||||
if c.sources[i].GetLevel() < c.sources[c.current].GetLevel() {
|
||||
c.current = i
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user