Skip to content

Commit

Permalink
Less linear search in DBIter::Seek() when keys are overwritten a lot
Browse files Browse the repository at this point in the history
Summary:
In one deployment we saw high latencies (presumably from slow iterator operations) and a lot of CPU time reported by perf with this stack:

```
  rocksdb::MergingIterator::Next
  rocksdb::DBIter::FindNextUserEntryInternal
  rocksdb::DBIter::Seek
```

I think what's happening is:
1. we create a snapshot iterator,
2. we do lots of Put()s for the same key x; this creates lots of entries in memtable,
3. we seek the iterator to a key slightly smaller than x,
4. the seek walks over lots of entries in memtable for key x, skipping them because of high sequence numbers.

CC IslamAbdelRahman
Closes facebook#1413

Differential Revision: D4083879

Pulled By: IslamAbdelRahman

fbshipit-source-id: a83ddae
  • Loading branch information
al13n321 authored and Facebook Github Bot committed Nov 28, 2016
1 parent cd7c414 commit 236d4c6
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 72 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ util/build_version.cc
build_tools/VALGRIND_LOGS/
coverage/COVERAGE_REPORT
.gdbhistory
.gdb_history
package/
.phutil_module_cache
unity.a
Expand Down
198 changes: 129 additions & 69 deletions db/db_iter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ void DBIter::Next() {
// saved_value_ => the merged value
//
// NOTE: In between, saved_key_ can point to a user key that has
// a delete marker
// a delete marker or a sequence number higher than sequence_
//
// The prefix_check parameter controls whether we check the iterated
// keys against the prefix of the seeked key. Set to false when
Expand All @@ -356,90 +356,137 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) {
assert(iter_->Valid());
assert(direction_ == kForward);
current_entry_is_merged_ = false;

// How many times in a row we have skipped an entry with user key less than
// or equal to saved_key_. We could skip these entries either because
// sequence numbers were too high or because skipping = true.
// What saved_key_ contains throughout this method:
// - if skipping : saved_key_ contains the key that we need to skip,
// and we haven't seen any keys greater than that,
// - if num_skipped > 0 : saved_key_ contains the key that we have skipped
// num_skipped times, and we haven't seen any keys
// greater than that,
// - none of the above : saved_key_ can contain anything, it doesn't matter.
uint64_t num_skipped = 0;

do {
ParsedInternalKey ikey;

if (ParseKey(&ikey)) {
if (iterate_upper_bound_ != nullptr &&
user_comparator_->Compare(ikey.user_key, *iterate_upper_bound_) >= 0) {
break;
}
if (!ParseKey(&ikey)) {
// Skip corrupted keys.
iter_->Next();
continue;
}

if (prefix_extractor_ && prefix_check &&
prefix_extractor_->Transform(ikey.user_key).compare(prefix_start_key_) != 0) {
break;
}
if (iterate_upper_bound_ != nullptr &&
user_comparator_->Compare(ikey.user_key, *iterate_upper_bound_) >= 0) {
break;
}

if (ikey.sequence <= sequence_) {
if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
num_skipped++; // skip this entry
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
} else {
switch (ikey.type) {
case kTypeDeletion:
case kTypeSingleDeletion:
if (prefix_extractor_ && prefix_check &&
prefix_extractor_->Transform(ikey.user_key)
.compare(prefix_start_key_) != 0) {
break;
}

if (ikey.sequence <= sequence_) {
if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
num_skipped++; // skip this entry
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
} else {
skipping = false; // ikey > saved_key_, i.e. saved_key_ is skipped
num_skipped = 0;
switch (ikey.type) {
case kTypeDeletion:
case kTypeSingleDeletion:
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
saved_key_.SetKey(
ikey.user_key,
!iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
skipping = true;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break;
case kTypeValue:
saved_key_.SetKey(
ikey.user_key,
!iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
if (range_del_agg_.ShouldDelete(ikey)) {
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
saved_key_.SetKey(
ikey.user_key,
!iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
skipping = true;
num_skipped = 0;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break;
case kTypeValue:
saved_key_.SetKey(
ikey.user_key,
!iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
if (range_del_agg_.ShouldDelete(ikey)) {
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
skipping = true;
num_skipped = 0;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
} else {
valid_ = true;
return;
}
break;
case kTypeMerge:
saved_key_.SetKey(
ikey.user_key,
!iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
if (range_del_agg_.ShouldDelete(ikey)) {
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
skipping = true;
num_skipped = 0;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
} else {
// By now, we are sure the current ikey is going to yield a
// value
current_entry_is_merged_ = true;
valid_ = true;
MergeValuesNewToOld(); // Go to a different state machine
return;
}
break;
default:
assert(false);
break;
}
} else {
valid_ = true;
return;
}
break;
case kTypeMerge:
saved_key_.SetKey(
ikey.user_key,
!iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
if (range_del_agg_.ShouldDelete(ikey)) {
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
skipping = true;
num_skipped = 0;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
} else {
// By now, we are sure the current ikey is going to yield a
// value
current_entry_is_merged_ = true;
valid_ = true;
MergeValuesNewToOld(); // Go to a different state machine
return;
}
break;
default:
assert(false);
break;
}
}
} else {
// This key was inserted after our snapshot was taken.
PERF_COUNTER_ADD(internal_recent_skipped_count, 1);

// Here saved_key_ may contain some old key, or the default empty key, or
// key assigned by some random other method. We don't care.
if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
num_skipped++;
} else {
saved_key_.SetKey(
ikey.user_key,
!iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
skipping = false;
num_skipped = 0;
}
}
// If we have sequentially iterated via numerous keys and still not
// found the next user-key, then it is better to seek so that we can
// avoid too many key comparisons. We seek to the last occurrence of
// our current key by looking for sequence number 0 and type deletion
// (the smallest type).
if (skipping && num_skipped > max_skip_) {

// If we have sequentially iterated via numerous equal keys, then it's
// better to seek so that we can avoid too many key comparisons.
if (num_skipped > max_skip_) {
num_skipped = 0;
std::string last_key;
AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0,
kTypeDeletion));
if (skipping) {
// We're looking for the next user-key but all we see are the same
// user-key with decreasing sequence numbers. Fast forward to
// sequence number 0 and type deletion (the smallest type).
AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0,
kTypeDeletion));
// Don't set skipping = false because we may still see more user-keys
// equal to saved_key_.
} else {
// We saw multiple entries with this user key and sequence numbers
// higher than sequence_. Fast forward to sequence_.
// Note that this only covers a case when a higher key was overwritten
// many times since our snapshot was taken, not the case when a lot of
// different keys were inserted after our snapshot was taken.
AppendInternalKey(&last_key,
ParsedInternalKey(saved_key_.GetKey(), sequence_,
kValueTypeForSeek));
}
iter_->Seek(last_key);
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
} else {
Expand Down Expand Up @@ -503,6 +550,7 @@ void DBIter::MergeValuesNewToOld() {
// when complete, add result to operands and continue.
merge_context_.PushOperand(iter_->value(),
iter_->IsValuePinned() /* operand_pinned */);
PERF_COUNTER_ADD(internal_merge_count, 1);
} else {
assert(false);
}
Expand Down Expand Up @@ -564,6 +612,11 @@ void DBIter::ReverseToBackward() {
FindParseableKey(&ikey, kReverse);
while (iter_->Valid() &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) > 0) {
if (ikey.sequence > sequence_) {
PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
} else {
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
}
iter_->Prev();
FindParseableKey(&ikey, kReverse);
}
Expand Down Expand Up @@ -676,6 +729,7 @@ bool DBIter::FindValueForCurrentKey() {
assert(merge_operator_ != nullptr);
merge_context_.PushOperandBack(
iter_->value(), iter_->IsValuePinned() /* operand_pinned */);
PERF_COUNTER_ADD(internal_merge_count, 1);
}
break;
default:
Expand Down Expand Up @@ -760,6 +814,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
ikey.type == kTypeMerge && !range_del_agg_.ShouldDelete(ikey)) {
merge_context_.PushOperand(iter_->value(),
iter_->IsValuePinned() /* operand_pinned */);
PERF_COUNTER_ADD(internal_merge_count, 1);
iter_->Next();
FindParseableKey(&ikey, kForward);
}
Expand Down Expand Up @@ -830,6 +885,11 @@ void DBIter::FindPrevUserKey() {
++num_skipped;
}
}
if (ikey.sequence > sequence_) {
PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
} else {
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
}
iter_->Prev();
FindParseableKey(&ikey, kReverse);
}
Expand Down
58 changes: 57 additions & 1 deletion db/db_iterator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/db_test_util.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/iostats_context.h"
#include "rocksdb/perf_context.h"
#include "port/port.h"

namespace rocksdb {

Expand Down Expand Up @@ -1778,6 +1778,62 @@ TEST_F(DBIteratorTest, ReadAhead) {
delete iter;
}

// Insert a key, create a snapshot iterator, overwrite key lots of times,
// seek to a smaller key. Expect DBIter to fall back to a seek instead of
// going through all the overwrites linearly.
TEST_F(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
options.max_sequential_skip_in_iterations = 3;
options.prefix_extractor = nullptr;
options.write_buffer_size = 1 << 27; // big enough to avoid flush
options.statistics = rocksdb::CreateDBStatistics();
DestroyAndReopen(options);

// Insert.
ASSERT_OK(Put("b", "0"));

// Create iterator.
ReadOptions ro;
std::unique_ptr<Iterator> iter(db_->NewIterator(ro));

// Insert a lot.
for (int i = 0; i < 100; ++i) {
ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
}

#ifndef ROCKSDB_LITE
// Check that memtable wasn't flushed.
std::string val;
ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
EXPECT_EQ("0", val);
#endif

// Seek iterator to a smaller key.
perf_context.Reset();
iter->Seek("a");
ASSERT_TRUE(iter->Valid());
EXPECT_EQ("b", iter->key().ToString());
EXPECT_EQ("0", iter->value().ToString());

// Check that the seek didn't do too much work.
// Checks are not tight, just make sure that everything is well below 100.
EXPECT_LT(perf_context.internal_key_skipped_count, 4);
EXPECT_LT(perf_context.internal_recent_skipped_count, 8);
EXPECT_LT(perf_context.seek_on_memtable_count, 10);
EXPECT_LT(perf_context.next_on_memtable_count, 10);
EXPECT_LT(perf_context.prev_on_memtable_count, 10);

// Check that iterator did something like what we expect.
EXPECT_EQ(perf_context.internal_delete_skipped_count, 0);
EXPECT_EQ(perf_context.internal_merge_count, 0);
EXPECT_GE(perf_context.internal_recent_skipped_count, 2);
EXPECT_GE(perf_context.seek_on_memtable_count, 2);
EXPECT_EQ(1, options.statistics->getTickerCount(
NUMBER_OF_RESEEKS_IN_ITERATION));
}

} // namespace rocksdb

int main(int argc, char** argv) {
Expand Down
2 changes: 2 additions & 0 deletions db/memtable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -322,11 +322,13 @@ class MemTableIterator : public InternalIterator {
valid_ = iter_->Valid();
}
virtual void Next() override {
PERF_COUNTER_ADD(next_on_memtable_count, 1);
assert(Valid());
iter_->Next();
valid_ = iter_->Valid();
}
virtual void Prev() override {
PERF_COUNTER_ADD(prev_on_memtable_count, 1);
assert(Valid());
iter_->Prev();
valid_ = iter_->Valid();
Expand Down
15 changes: 13 additions & 2 deletions include/rocksdb/perf_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ struct PerfContext {
// tombstones are not included in this counter, while previous updates
// hidden by the tombstones will be included here.
// 4. symmetric cases for Prev() and SeekToLast()
// We sometimes also skip entries of more recent updates than the snapshot
// we read from, but they are not included in this counter.
// internal_recent_skipped_count is not included in this counter.
//
uint64_t internal_key_skipped_count;
// Total number of deletes and single deletes skipped over during iteration
Expand All @@ -57,6 +56,13 @@ struct PerfContext {
// still older updates invalidated by the tombstones.
//
uint64_t internal_delete_skipped_count;
// How many times iterators skipped over internal keys that are more recent
// than the snapshot that iterator is using.
//
uint64_t internal_recent_skipped_count;
// How many values were fed into merge operator by iterators.
//
uint64_t internal_merge_count;

uint64_t get_snapshot_time; // total nanos spent on getting snapshot
uint64_t get_from_memtable_time; // total nanos spent on querying memtables
Expand All @@ -67,7 +73,12 @@ struct PerfContext {
// total nanos spent on seeking memtable
uint64_t seek_on_memtable_time;
// number of seeks issued on memtable
// (including SeekForPrev but not SeekToFirst and SeekToLast)
uint64_t seek_on_memtable_count;
// number of Next()s issued on memtable
uint64_t next_on_memtable_count;
// number of Prev()s issued on memtable
uint64_t prev_on_memtable_count;
// total nanos spent on seeking child iters
uint64_t seek_child_seek_time;
// number of seek issued in child iterators
Expand Down
Loading

0 comments on commit 236d4c6

Please sign in to comment.