Skip to content

Commit

Permalink
Enable hash index for block-based table
Browse files Browse the repository at this point in the history
Summary: Building on the previous patches, this diff finally provides the end-to-end mechanism for users to enable the hash index.

Test Plan: Wrote several new unit tests.

Reviewers: sdong, haobo, dhruba

Reviewed By: sdong

CC: leveldb

Differential Revision: https://reviews.facebook.net/D16539
  • Loading branch information
Kai Liu committed Apr 10, 2014
1 parent 7a92537 commit 75b59d5
Show file tree
Hide file tree
Showing 10 changed files with 521 additions and 79 deletions.
42 changes: 37 additions & 5 deletions db/db_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,8 @@ class DBTest {
// Sequence of option configurations to try
enum OptionConfig {
kDefault,
kBlockBasedTableWithPrefixHashIndex,
kBlockBasedTableWithWholeKeyHashIndex,
kPlainTableFirstBytePrefix,
kPlainTableAllBytesPrefix,
kVectorRep,
Expand Down Expand Up @@ -303,7 +305,8 @@ class DBTest {
kSkipDeletesFilterFirst = 1,
kSkipUniversalCompaction = 2,
kSkipMergePut = 4,
kSkipPlainTable = 8
kSkipPlainTable = 8,
kSkipHashIndex = 16
};

DBTest() : option_config_(kDefault),
Expand Down Expand Up @@ -343,6 +346,12 @@ class DBTest {
|| option_config_ == kPlainTableFirstBytePrefix)) {
continue;
}
// Skip the block-based-table hash-index configurations only when the caller
// asked for that via kSkipHashIndex. (Testing kSkipPlainTable here would
// make kSkipHashIndex a no-op and would tie these configurations to the
// unrelated plain-table flag.)
if ((skip_mask & kSkipHashIndex) &&
    (option_config_ == kBlockBasedTableWithPrefixHashIndex ||
     option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) {
  continue;
}

break;
}

Expand Down Expand Up @@ -439,6 +448,20 @@ class DBTest {
case kInfiniteMaxOpenFiles:
options.max_open_files = -1;
break;
case kBlockBasedTableWithPrefixHashIndex: {
BlockBasedTableOptions table_options;
table_options.index_type = BlockBasedTableOptions::kHashSearch;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
break;
}
case kBlockBasedTableWithWholeKeyHashIndex: {
BlockBasedTableOptions table_options;
table_options.index_type = BlockBasedTableOptions::kHashSearch;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.prefix_extractor.reset(NewNoopTransform());
break;
}
default:
break;
}
Expand Down Expand Up @@ -1363,7 +1386,7 @@ TEST(DBTest, KeyMayExist) {

// KeyMayExist function only checks data in block caches, which is not used
// by plain table format.
} while (ChangeOptions(kSkipPlainTable));
} while (ChangeOptions(kSkipPlainTable | kSkipHashIndex));
}

TEST(DBTest, NonBlockingIteration) {
Expand Down Expand Up @@ -6184,7 +6207,9 @@ TEST(DBTest, Randomized) {
int minimum = 0;
if (option_config_ == kHashSkipList ||
option_config_ == kHashLinkList ||
option_config_ == kPlainTableFirstBytePrefix) {
option_config_ == kPlainTableFirstBytePrefix ||
option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
option_config_ == kBlockBasedTableWithPrefixHashIndex) {
minimum = 1;
}
if (p < 45) { // Put
Expand Down Expand Up @@ -6224,8 +6249,15 @@ TEST(DBTest, Randomized) {
}

if ((step % 100) == 0) {
ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
// For DB instances that use the hash index + block-based table, the
// iterator becomes invalid when seeking a non-existent key, rather
// than returning a key that is close to it.
if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
option_config_ != kBlockBasedTableWithPrefixHashIndex) {
ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
}

// Save a snapshot from each DB this time that we'll use next
// time we compare things, to make sure the current state is
// preserved with the snapshot
Expand Down
31 changes: 31 additions & 0 deletions db/dbformat.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "rocksdb/types.h"
#include "util/coding.h"
Expand Down Expand Up @@ -304,4 +305,34 @@ class IterKey {
void operator=(const IterKey&) = delete;
};

// Wraps a SliceTransform so it can be handed slices that go through
// ExtractUserKey() first: every query extracts the user key from its
// argument and then delegates to the wrapped transform.
class InternalKeySliceTransform : public SliceTransform {
 public:
  // `transform` is borrowed: this wrapper stores the pointer and never
  // deletes it.
  explicit InternalKeySliceTransform(const SliceTransform* transform)
      : transform_(transform) {}

  // Reports the wrapped transform's name unchanged.
  virtual const char* Name() const { return transform_->Name(); }

  // Applies the wrapped transform to the user key extracted from `src`.
  virtual Slice Transform(const Slice& src) const {
    return transform_->Transform(ExtractUserKey(src));
  }

  // Asks the wrapped transform whether the user key extracted from `src`
  // is in its domain.
  virtual bool InDomain(const Slice& src) const {
    return transform_->InDomain(ExtractUserKey(src));
  }

  // Asks the wrapped transform whether the user key extracted from `dst`
  // is in its range.
  // NOTE(review): `dst` is also passed through ExtractUserKey() before
  // delegation -- confirm callers pass an internal key here rather than an
  // already-transformed prefix.
  virtual bool InRange(const Slice& dst) const {
    return transform_->InRange(ExtractUserKey(dst));
  }

  // Exposes the wrapped user-key transform.
  const SliceTransform* user_prefix_extractor() const { return transform_; }

 private:
  // Not owned. As with a comparator, the lifetime of the wrapped transform
  // is managed elsewhere.
  const SliceTransform* const transform_;
};

} // namespace rocksdb
6 changes: 6 additions & 0 deletions include/rocksdb/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ struct BlockBasedTableOptions {
// A space efficient index block that is optimized for
// binary-search-based index.
kBinarySearch,

// The hash index, if enabled, will do the hash lookup when
// `ReadOptions.prefix_seek == true`. Users should also specify
// `Options.prefix_extractor` to allow the index block to correctly
// extract the prefix of the given key and perform hash table lookup.
kHashSearch,
};

IndexType index_type = kBinarySearch;
Expand Down
113 changes: 73 additions & 40 deletions table/block.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,20 @@

#include "table/block.h"

#include <vector>
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "rocksdb/comparator.h"
#include "table/block_hash_index.h"
#include "table/format.h"
#include "util/coding.h"
#include "util/logging.h"

namespace rocksdb {

inline uint32_t Block::NumRestarts() const {
// Returns the value decoded by DecodeFixed32() from the last
// sizeof(uint32_t) bytes of the block, i.e. the restart count.
uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  const char* const count_pos = data_ + (size_ - sizeof(uint32_t));
  return DecodeFixed32(count_pos);
}
Expand Down Expand Up @@ -92,6 +96,7 @@ class Block::Iter : public Iterator {
std::string key_;
Slice value_;
Status status_;
BlockHashIndex* hash_index_;

inline int Compare(const Slice& a, const Slice& b) const {
return comparator_->Compare(a, b);
Expand All @@ -118,16 +123,15 @@ class Block::Iter : public Iterator {
}

public:
Iter(const Comparator* comparator,
const char* data,
uint32_t restarts,
uint32_t num_restarts)
Iter(const Comparator* comparator, const char* data, uint32_t restarts,
uint32_t num_restarts, BlockHashIndex* hash_index)
: comparator_(comparator),
data_(data),
restarts_(restarts),
num_restarts_(num_restarts),
current_(restarts_),
restart_index_(num_restarts_) {
restart_index_(num_restarts_),
hash_index_(hash_index) {
assert(num_restarts_ > 0);
}

Expand Down Expand Up @@ -169,45 +173,22 @@ class Block::Iter : public Iterator {
}

virtual void Seek(const Slice& target) {
// Binary search in restart array to find the first restart point
// with a key >= target
uint32_t left = 0;
uint32_t right = num_restarts_ - 1;
while (left < right) {
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
uint32_t shared, non_shared, value_length;
const char* key_ptr = DecodeEntry(data_ + region_offset,
data_ + restarts_,
&shared, &non_shared, &value_length);
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return;
}
Slice mid_key(key_ptr, non_shared);
if (Compare(mid_key, target) < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
left = mid;
} else {
// Key at "mid" is >= "target". Therefore all blocks at or
// after "mid" are uninteresting.
right = mid - 1;
}
}
uint32_t index = 0;
bool ok = hash_index_ ? HashSeek(target, &index)
: BinarySeek(target, 0, num_restarts_ - 1, &index);

if (!ok) {
return;
}
SeekToRestartPoint(index);
// Linear search (within restart block) for first key >= target
SeekToRestartPoint(left);

while (true) {
if (!ParseNextKey()) {
return;
}
if (Compare(key_, target) >= 0) {
if (!ParseNextKey() || Compare(key_, target) >= 0) {
return;
}
}
}

virtual void SeekToFirst() {
SeekToRestartPoint(0);
ParseNextKey();
Expand Down Expand Up @@ -257,6 +238,53 @@ class Block::Iter : public Iterator {
return true;
}
}
// Binary search in restart array to find the first restart point
// with a key >= target
bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
uint32_t* index) {
assert(left <= right);

while (left < right) {
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
uint32_t shared, non_shared, value_length;
const char* key_ptr =
DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
&non_shared, &value_length);
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return false;
}
Slice mid_key(key_ptr, non_shared);
if (Compare(mid_key, target) < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
left = mid;
} else {
// Key at "mid" is >= "target". Therefore all blocks at or
// after "mid" are uninteresting.
right = mid - 1;
}
}

*index = left;
return true;
}

// Uses the hash index to narrow the restart-point range for `target`, then
// binary-searches only that range.
//
// Returns false when the hash index has no entry for `target`; in that case
// current_ is reset to restarts_ (the same value the constructor gives it --
// presumably the state the iterator reports as invalid). Otherwise returns
// the result of BinarySeek(), which stores the chosen restart point in
// `*index`.
bool HashSeek(const Slice& target, uint32_t* index) {
  assert(hash_index_);
  auto restart_index = hash_index_->GetRestartIndex(target);
  if (restart_index == nullptr) {
    current_ = restarts_;
    return false;
  }

  // The restart points in
  //   [first_index, first_index + num_blocks - 1]
  // all share the same prefix, so binary search only in that small range.
  auto left = restart_index->first_index;
  auto right = restart_index->first_index + restart_index->num_blocks - 1;
  return BinarySeek(target, left, right, index);
}
};

Iterator* Block::NewIterator(const Comparator* cmp) {
Expand All @@ -267,8 +295,13 @@ Iterator* Block::NewIterator(const Comparator* cmp) {
if (num_restarts == 0) {
return NewEmptyIterator();
} else {
return new Iter(cmp, data_, restart_offset_, num_restarts);
return new Iter(cmp, data_, restart_offset_, num_restarts,
hash_index_.get());
}
}

// Installs `hash_index` as this block's hash index. The pointer is handed to
// the hash_index_ smart pointer, replacing whatever it held before.
void Block::SetBlockHashIndex(BlockHashIndex* hash_index) {
  hash_index_ = std::unique_ptr<BlockHashIndex>(hash_index);
}

} // namespace rocksdb
18 changes: 14 additions & 4 deletions table/block.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@
#pragma once
#include <stddef.h>
#include <stdint.h>

#include "rocksdb/iterator.h"
#include "rocksdb/options.h"

namespace rocksdb {

struct BlockContents;
class Comparator;
class BlockHashIndex;

class Block {
public:
Expand All @@ -26,20 +28,28 @@ class Block {
~Block();

size_t size() const { return size_; }
bool cachable() const { return cachable_; }
const char* data() const { return data_; }
bool cachable() const { return cachable_; }
uint32_t NumRestarts() const;
CompressionType compression_type() const { return compression_type_; }

// If hash index lookup is enabled and `use_hash_index` is true, this block
// will do hash lookup for the key prefix.
//
// NOTE: for the hash based lookup, if a key prefix doesn't match any key,
// the iterator will simply be set as "invalid", rather than returning
// the key that is just past the target key.
Iterator* NewIterator(const Comparator* comparator);
const char* data() { return data_; }
void SetBlockHashIndex(BlockHashIndex* hash_index);

private:
uint32_t NumRestarts() const;

const char* data_;
size_t size_;
uint32_t restart_offset_; // Offset in data_ of restart array
bool owned_; // Block owns data_[]
bool cachable_;
CompressionType compression_type_;
std::unique_ptr<BlockHashIndex> hash_index_;

// No copying allowed
Block(const Block&);
Expand Down
Loading

0 comments on commit 75b59d5

Please sign in to comment.