diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 89a497821dd..bad3532e78d 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -1240,6 +1240,52 @@ Status BlockBasedTable::Get( return s; } +Status BlockBasedTable::Prefetch(const Slice* const begin, + const Slice* const end) { + auto& comparator = rep_->internal_comparator; + // pre-condition + if (begin && end && comparator.Compare(*begin, *end) > 0) { + return Status::InvalidArgument(*begin, *end); + } + + BlockIter iiter; + NewIndexIterator(ReadOptions(), &iiter); + + if (!iiter.status().ok()) { + // error opening index iterator + return iiter.status(); + } + + // indicates if we are on the last page that need to be pre-fetched + bool prefetching_boundary_page = false; + + for (begin ? iiter.Seek(*begin) : iiter.SeekToFirst(); iiter.Valid(); + iiter.Next()) { + Slice block_handle = iiter.value(); + + if (end && comparator.Compare(iiter.key(), *end) >= 0) { + if (prefetching_boundary_page) { + break; + } + + // The index entry represents the last key in the data block. + // We should load this page into memory as well, but no more + prefetching_boundary_page = true; + } + + // Load the block specified by the block_handle into the block cache + BlockIter biter; + NewDataBlockIterator(rep_, ReadOptions(), block_handle, &biter); + + if (!biter.status().ok()) { + // there was an unexpected error while pre-fetching + return biter.status(); + } + } + + return Status::OK(); +} + bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr iiter(NewIndexIterator(options)); diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 0e9b5690b5c..727a0d6329e 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -83,6 +83,11 @@ class BlockBasedTable : public TableReader { Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context) override; + // Pre-fetch the disk blocks that correspond to the key range specified by + // (kbegin, kend). The call will return return error status in the event of + // IO or iteration error. + Status Prefetch(const Slice* begin, const Slice* end) override; + // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were // present in the file). The returned value is in terms of file diff --git a/table/table_reader.h b/table/table_reader.h index d3801442edd..2058b868c80 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -68,6 +68,18 @@ class TableReader { virtual Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context) = 0; + // Prefetch data corresponding to a give range of keys + // Typically this functionality is required for table implementations that + // persists the data on a non volatile storage medium like disk/SSD + virtual Status Prefetch(const Slice* begin = nullptr, + const Slice* end = nullptr) { + (void) begin; + (void) end; + // Default implementation is NOOP. + // The child class should implement functionality when applicable + return Status::OK(); + } + // convert db file to a human readable form virtual Status DumpTable(WritableFile* out_file) { return Status::NotSupported("DumpTable() not supported"); diff --git a/table/table_test.cc b/table/table_test.cc index 264e420ae33..0a91e0d533a 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -47,6 +47,9 @@ #include "util/testutil.h" #include "util/scoped_arena_iterator.h" +using std::vector; +using std::string; + namespace rocksdb { extern const uint64_t kLegacyBlockBasedTableMagicNumber; @@ -1125,6 +1128,131 @@ TEST(BlockBasedTableTest, FilterPolicyNameProperties) { ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); } +// +// BlockBasedTableTest::PrefetchTest +// +void AssertKeysInCache(BlockBasedTable* table_reader, + const vector& keys_in_cache, + const vector& keys_not_in_cache) { + for (auto key : keys_in_cache) { + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); + } + + for (auto key : keys_not_in_cache) { + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); + } +} + +void PrefetchRange(TableConstructor* c, Options* opt, + BlockBasedTableOptions* table_options, + const vector& keys, + const char* key_begin, const char* key_end, + const vector& keys_in_cache, + const vector& keys_not_in_cache, + const Status expected_status = Status::OK()) { + // reset the cache and reopen the table + table_options->block_cache = NewLRUCache(16 * 1024 * 1024); + opt->table_factory.reset(NewBlockBasedTableFactory(*table_options)); + const ImmutableCFOptions ioptions2(*opt); + ASSERT_OK(c->Reopen(ioptions2)); + + // prefetch + auto* table_reader = dynamic_cast(c->GetTableReader()); + // empty string replacement is a trick so we don't crash the test + Slice begin(key_begin ? key_begin : ""); + Slice end(key_end ? key_end : ""); + Status s = table_reader->Prefetch(key_begin ? &begin : nullptr, + key_end ? &end : nullptr); + ASSERT_TRUE(s.code() == expected_status.code()); + + // assert our expectation in cache warmup + AssertKeysInCache(table_reader, keys_in_cache, keys_not_in_cache); +} + + +TEST(BlockBasedTableTest, PrefetchTest) { + // The purpose of this test is to test the prefetching operation built into + // BlockBasedTable. + Options opt; + unique_ptr ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + // big enough so we don't ever lose cached values. + table_options.block_cache = NewLRUCache(16 * 1024 * 1024); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + const ImmutableCFOptions ioptions(opt); + c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); + + // We get the following data spread : + // + // Data block Index + // ======================== + // [ k01 k02 k03 ] k03 + // [ k04 ] k04 + // [ k05 ] k05 + // [ k06 k07 ] k07 + + + // Simple + PrefetchRange(&c, &opt, &table_options, keys, + /*key_range=*/ "k01", "k05", + /*keys_in_cache=*/ {"k01", "k02", "k03", "k04", "k05"}, + /*keys_not_in_cache=*/ {"k06", "k07"}); + PrefetchRange(&c, &opt, &table_options, keys, + "k01", "k01", + {"k01", "k02", "k03"}, + {"k04", "k05", "k06", "k07"}); + // odd + PrefetchRange(&c, &opt, &table_options, keys, + "a", "z", + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, + {}); + PrefetchRange(&c, &opt, &table_options, keys, + "k00", "k00", + {"k01", "k02", "k03"}, + {"k04", "k05", "k06", "k07"}); + // Edge cases + PrefetchRange(&c, &opt, &table_options, keys, + "k00", "k06", + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, + {}); + PrefetchRange(&c, &opt, &table_options, keys, + "k00", "zzz", + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, + {}); + // null keys + PrefetchRange(&c, &opt, &table_options, keys, + nullptr, nullptr, + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, + {}); + PrefetchRange(&c, &opt, &table_options, keys, + "k04", nullptr, + {"k04", "k05", "k06", "k07"}, + {"k01", "k02", "k03"}); + PrefetchRange(&c, &opt, &table_options, keys, + nullptr, "k05", + {"k01", "k02", "k03", "k04", "k05"}, + {"k06", "k07"}); + // invalid + PrefetchRange(&c, &opt, &table_options, keys, + "k06", "k00", {}, {}, + Status::InvalidArgument(Slice("k06 "), Slice("k07"))); +} + + TEST(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { BlockBasedTableOptions table_options; for (int i = 0; i < 4; ++i) {