Skip to content

Commit

Permalink
In DB::NewIterator(), try to allocate the whole iterator tree in an a…
Browse files Browse the repository at this point in the history
…rena

Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.

Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.

Test Plan: make all check

Reviewers: ljin, haobo

Reviewed By: haobo

Subscribers: leveldb, dhruba, yhchiang, igor

Differential Revision: https://reviews.facebook.net/D18513
  • Loading branch information
siying committed Jun 3, 2014
1 parent 4627966 commit df9069d
Show file tree
Hide file tree
Showing 31 changed files with 532 additions and 110 deletions.
105 changes: 82 additions & 23 deletions db/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3179,18 +3179,34 @@ static void CleanupIteratorState(void* arg1, void* arg2) {

Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
ColumnFamilyData* cfd,
SuperVersion* super_version) {
std::vector<Iterator*> iterator_list;
// Collect iterator for mutable mem
iterator_list.push_back(super_version->mem->NewIterator(options));
// Collect all needed child iterators for immutable memtables
super_version->imm->AddIterators(options, &iterator_list);
// Collect iterators for files in L0 - Ln
super_version->current->AddIterators(options, storage_options_,
&iterator_list);
Iterator* internal_iter = NewMergingIterator(
&cfd->internal_comparator(), &iterator_list[0], iterator_list.size());

SuperVersion* super_version,
Arena* arena) {
Iterator* internal_iter;
if (arena != nullptr) {
// Need to create internal iterator from the arena.
MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena);
// Collect iterator for mutable mem
merge_iter_builder.AddIterator(
super_version->mem->NewIterator(options, false, arena));
// Collect all needed child iterators for immutable memtables
super_version->imm->AddIterators(options, &merge_iter_builder);
// Collect iterators for files in L0 - Ln
super_version->current->AddIterators(options, storage_options_,
&merge_iter_builder);
internal_iter = merge_iter_builder.Finish();
} else {
// Need to create internal iterator using malloc.
std::vector<Iterator*> iterator_list;
// Collect iterator for mutable mem
iterator_list.push_back(super_version->mem->NewIterator(options));
// Collect all needed child iterators for immutable memtables
super_version->imm->AddIterators(options, &iterator_list);
// Collect iterators for files in L0 - Ln
super_version->current->AddIterators(options, storage_options_,
&iterator_list);
internal_iter = NewMergingIterator(&cfd->internal_comparator(),
&iterator_list[0], iterator_list.size());
}
IterState* cleanup = new IterState(this, &mutex_, super_version);
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);

Expand Down Expand Up @@ -3541,34 +3557,77 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options,
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();

Iterator* iter;
if (options.tailing) {
#ifdef ROCKSDB_LITE
// not supported in lite version
return nullptr;
#else
// TODO(ljin): remove tailing iterator
iter = new ForwardIterator(env_, this, options, cfd);
iter = NewDBIterator(env_, *cfd->options(),
cfd->user_comparator(), iter, kMaxSequenceNumber);
//iter = new TailingIterator(env_, this, options, cfd);
auto iter = new ForwardIterator(env_, this, options, cfd);
return NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter,
kMaxSequenceNumber);
// return new TailingIterator(env_, this, options, cfd);
#endif
} else {
SequenceNumber latest_snapshot = versions_->LastSequence();
SuperVersion* sv = nullptr;
sv = cfd->GetReferencedSuperVersion(&mutex_);

iter = NewInternalIterator(options, cfd, sv);

auto snapshot =
options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot;
iter = NewDBIterator(env_, *cfd->options(),
cfd->user_comparator(), iter, snapshot);
}

return iter;
// Try to generate a DB iterator tree in continuous memory area to be
// cache friendly. Here is an example of result:
// +-------------------------------+
// | |
// | ArenaWrappedDBIter |
// | + |
// | +---> Inner Iterator ------------+
// | | | |
// | | +-- -- -- -- -- -- -- --+ |
// | +--- | Arena | |
// | | | |
// | Allocated Memory: | |
// | | +-------------------+ |
// | | | DBIter | <---+
// | | + |
// | | | +-> iter_ ------------+
// | | | | |
// | | +-------------------+ |
// | | | MergingIterator | <---+
// | | + |
// | | | +->child iter1 ------------+
// | | | | | |
// | | +->child iter2 ----------+ |
// | | | | | | |
// | | | +->child iter3 --------+ | |
// | | | | | |
// | | +-------------------+ | | |
// | | | Iterator1 | <--------+
// | | +-------------------+ | |
// | | | Iterator2 | <------+
// | | +-------------------+ |
// | | | Iterator3 | <----+
// | | +-------------------+
// | | |
// +-------+-----------------------+
//
// ArenaWrappedDBIter inlines an arena area where all the iterartor in the
// the iterator tree is allocated in the order of being accessed when
// querying.
// Laying out the iterators in the order of being accessed makes it more
// likely that any iterator pointer is close to the iterator it points to so
// that they are likely to be in the same cache line and/or page.
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
env_, *cfd->options(), cfd->user_comparator(), snapshot);
Iterator* internal_iter =
NewInternalIterator(options, cfd, sv, db_iter->GetArena());
db_iter->SetIterUnderDBIter(internal_iter);

return db_iter;
}
}

Status DBImpl::NewIterators(
Expand Down
4 changes: 3 additions & 1 deletion db/db_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class Version;
class VersionEdit;
class VersionSet;
class CompactionFilterV2;
class Arena;

class DBImpl : public DB {
public:
Expand Down Expand Up @@ -278,7 +279,8 @@ class DBImpl : public DB {
const DBOptions options_;

Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
SuperVersion* super_version);
SuperVersion* super_version,
Arena* arena = nullptr);

private:
friend class DB;
Expand Down
71 changes: 56 additions & 15 deletions db/db_iter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "port/port.h"
#include "util/arena.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/perf_context_imp.h"
Expand All @@ -37,8 +38,6 @@ static void DumpInternalIter(Iterator* iter) {
}
#endif

namespace {

// Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries. DBIter
// combines multiple entries for the same userkey found in the DB
Expand All @@ -57,9 +56,10 @@ class DBIter: public Iterator {
kReverse
};

DBIter(Env* env, const Options& options,
const Comparator* cmp, Iterator* iter, SequenceNumber s)
: env_(env),
DBIter(Env* env, const Options& options, const Comparator* cmp,
Iterator* iter, SequenceNumber s, bool arena_mode)
: arena_mode_(arena_mode),
env_(env),
logger_(options.info_log.get()),
user_comparator_(cmp),
user_merge_operator_(options.merge_operator.get()),
Expand All @@ -74,7 +74,15 @@ class DBIter: public Iterator {
}
virtual ~DBIter() {
RecordTick(statistics_, NO_ITERATORS, -1);
delete iter_;
if (!arena_mode_) {
delete iter_;
} else {
iter_->~Iterator();
}
}
virtual void SetIter(Iterator* iter) {
assert(iter_ == nullptr);
iter_ = iter;
}
virtual bool Valid() const { return valid_; }
virtual Slice key() const {
Expand Down Expand Up @@ -116,11 +124,12 @@ class DBIter: public Iterator {
}
}

bool arena_mode_;
Env* const env_;
Logger* logger_;
const Comparator* const user_comparator_;
const MergeOperator* const user_merge_operator_;
Iterator* const iter_;
Iterator* iter_;
SequenceNumber const sequence_;

Status status_;
Expand Down Expand Up @@ -461,16 +470,48 @@ void DBIter::SeekToLast() {
FindPrevUserEntry();
}

} // anonymous namespace
Iterator* NewDBIterator(Env* env, const Options& options,
const Comparator* user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence) {
return new DBIter(env, options, user_key_comparator, internal_iter, sequence,
false);
}

ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~Iterator(); }

void ArenaWrappedDBIter::SetDBIter(DBIter* iter) { db_iter_ = iter; }

void ArenaWrappedDBIter::SetIterUnderDBIter(Iterator* iter) {
static_cast<DBIter*>(db_iter_)->SetIter(iter);
}

inline bool ArenaWrappedDBIter::Valid() const { return db_iter_->Valid(); }
inline void ArenaWrappedDBIter::SeekToFirst() { db_iter_->SeekToFirst(); }
inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); }
inline void ArenaWrappedDBIter::Seek(const Slice& target) {
db_iter_->Seek(target);
}
inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); }
inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); }
inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); }
inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); }
inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); }
void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
void* arg2) {
db_iter_->RegisterCleanup(function, arg1, arg2);
}

Iterator* NewDBIterator(
Env* env,
const Options& options,
const Comparator *user_key_comparator,
Iterator* internal_iter,
ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const Options& options, const Comparator* user_key_comparator,
const SequenceNumber& sequence) {
return new DBIter(env, options, user_key_comparator,
internal_iter, sequence);
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
Arena* arena = iter->GetArena();
auto mem = arena->AllocateAligned(sizeof(DBIter));
DBIter* db_iter = new (mem)
DBIter(env, options, user_key_comparator, nullptr, sequence, true);
iter->SetDBIter(db_iter);
return iter;
}

} // namespace rocksdb
46 changes: 46 additions & 0 deletions db/db_iter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,14 @@
#include <stdint.h>
#include "rocksdb/db.h"
#include "db/dbformat.h"
#include "util/arena.h"
#include "util/autovector.h"

namespace rocksdb {

class Arena;
class DBIter;

// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
Expand All @@ -24,4 +29,45 @@ extern Iterator* NewDBIterator(
Iterator* internal_iter,
const SequenceNumber& sequence);

// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
// iterator is supposed be allocated. This class is used as an entry point of
// a iterator hierarchy whose memory can be allocated inline. In that way,
// accessing the iterator tree can be more cache friendly. It is also faster
// to allocate.
class ArenaWrappedDBIter : public Iterator {
public:
virtual ~ArenaWrappedDBIter();

// Get the arena to be used to allocate memory for DBIter to be wrapped,
// as well as child iterators in it.
virtual Arena* GetArena() { return &arena_; }

// Set the DB Iterator to be wrapped

virtual void SetDBIter(DBIter* iter);

// Set the internal iterator wrapped inside the DB Iterator. Usually it is
// a merging iterator.
virtual void SetIterUnderDBIter(Iterator* iter);
virtual bool Valid() const override;
virtual void SeekToFirst() override;
virtual void SeekToLast() override;
virtual void Seek(const Slice& target) override;
virtual void Next() override;
virtual void Prev() override;
virtual Slice key() const override;
virtual Slice value() const override;
virtual Status status() const override;
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);

private:
DBIter* db_iter_;
Arena arena_;
};

// Generate the arena wrapped iterator class.
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const Options& options, const Comparator* user_key_comparator,
const SequenceNumber& sequence);

} // namespace rocksdb
31 changes: 24 additions & 7 deletions db/memtable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/slice_transform.h"
#include "table/merger.h"
#include "util/arena.h"
#include "util/coding.h"
#include "util/murmurhash.h"
Expand Down Expand Up @@ -173,15 +174,24 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
class MemTableIterator: public Iterator {
public:
MemTableIterator(const MemTable& mem, const ReadOptions& options,
bool enforce_total_order)
bool enforce_total_order, Arena* arena)
: bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_),
valid_(false) {
valid_(false),
arena_mode_(arena != nullptr) {
if (prefix_extractor_ != nullptr && !enforce_total_order) {
bloom_ = mem.prefix_bloom_.get();
iter_.reset(mem.table_->GetDynamicPrefixIterator());
iter_ = mem.table_->GetDynamicPrefixIterator(arena);
} else {
iter_.reset(mem.table_->GetIterator());
iter_ = mem.table_->GetIterator(arena);
}
}

~MemTableIterator() {
if (arena_mode_) {
iter_->~Iterator();
} else {
delete iter_;
}
}

Expand Down Expand Up @@ -228,17 +238,24 @@ class MemTableIterator: public Iterator {
private:
DynamicBloom* bloom_;
const SliceTransform* const prefix_extractor_;
std::unique_ptr<MemTableRep::Iterator> iter_;
MemTableRep::Iterator* iter_;
bool valid_;
bool arena_mode_;

// No copying allowed
MemTableIterator(const MemTableIterator&);
void operator=(const MemTableIterator&);
};

Iterator* MemTable::NewIterator(const ReadOptions& options,
bool enforce_total_order) {
return new MemTableIterator(*this, options, enforce_total_order);
bool enforce_total_order, Arena* arena) {
if (arena == nullptr) {
return new MemTableIterator(*this, options, enforce_total_order, nullptr);
} else {
auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
return new (mem)
MemTableIterator(*this, options, enforce_total_order, arena);
}
}

port::RWMutex* MemTable::GetLock(const Slice& key) {
Expand Down
Loading

0 comments on commit df9069d

Please sign in to comment.