Skip to content

Commit

Permalink
Introduce PinnedIteratorsManager (Reduce PinData() overhead / Refacto…
Browse files Browse the repository at this point in the history
…r PinData)

Summary:
While trying to reuse PinData() / ReleasePinnedData() .. to optimize away some memcpys I realized that there is a significant overhead for using PinData() / ReleasePinnedData if they were called many times.
This diff refactor the pinning logic by introducing PinnedIteratorsManager a centralized component that will be created once and will be notified whenever we need to Pin an Iterator. This implementation have much less overhead than the original implementation

Test Plan:
make check -j64
COMPILE_WITH_ASAN=1 make check -j64

Reviewers: yhchiang, sdong, andrewkr

Reviewed By: andrewkr

Subscribers: andrewkr, dhruba

Differential Revision: https://reviews.facebook.net/D56493
  • Loading branch information
IslamAbdelRahman committed Apr 26, 2016
1 parent 1995e34 commit d719b09
Show file tree
Hide file tree
Showing 11 changed files with 195 additions and 212 deletions.
61 changes: 20 additions & 41 deletions db/db_iter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/merge_context.h"
#include "db/pinned_iterators_manager.h"
#include "port/port.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
Expand Down Expand Up @@ -103,7 +104,7 @@ class DBIter: public Iterator {
InternalIterator* iter, SequenceNumber s, bool arena_mode,
uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
const Slice* iterate_upper_bound = nullptr,
bool prefix_same_as_start = false)
bool prefix_same_as_start = false, bool pin_data = false)
: arena_mode_(arena_mode),
env_(env),
logger_(ioptions.info_log),
Expand All @@ -118,12 +119,21 @@ class DBIter: public Iterator {
version_number_(version_number),
iterate_upper_bound_(iterate_upper_bound),
prefix_same_as_start_(prefix_same_as_start),
iter_pinned_(false) {
pin_thru_lifetime_(pin_data) {
RecordTick(statistics_, NO_ITERATORS);
prefix_extractor_ = ioptions.prefix_extractor;
max_skip_ = max_sequential_skip_in_iterations;
if (pin_thru_lifetime_) {
pinned_iters_mgr_.StartPinning();
}
if (iter_) {
iter_->SetPinnedItersMgr(&pinned_iters_mgr_);
}
}
virtual ~DBIter() {
if (pin_thru_lifetime_) {
pinned_iters_mgr_.ReleasePinnedIterators();
}
RecordTick(statistics_, NO_ITERATORS, -1);
local_stats_.BumpGlobalStatistics(statistics_);
if (!arena_mode_) {
Expand All @@ -135,9 +145,7 @@ class DBIter: public Iterator {
virtual void SetIter(InternalIterator* iter) {
assert(iter_ == nullptr);
iter_ = iter;
if (iter_ && iter_pinned_) {
iter_->PinData();
}
iter_->SetPinnedItersMgr(&pinned_iters_mgr_);
}
virtual bool Valid() const override { return valid_; }
virtual Slice key() const override {
Expand All @@ -156,28 +164,6 @@ class DBIter: public Iterator {
return status_;
}
}
virtual Status PinData() {
Status s;
if (iter_) {
s = iter_->PinData();
}
if (s.ok()) {
// Even if iter_ is nullptr, we set iter_pinned_ to true so that when
// iter_ is updated using SetIter, we Pin it.
iter_pinned_ = true;
}
return s;
}
virtual Status ReleasePinnedData() {
Status s;
if (iter_) {
s = iter_->ReleasePinnedData();
}
if (s.ok()) {
iter_pinned_ = false;
}
return s;
}

virtual Status GetProperty(std::string prop_name,
std::string* prop) override {
Expand All @@ -192,7 +178,7 @@ class DBIter: public Iterator {
return Status::OK();
} else if (prop_name == "rocksdb.iterator.is-key-pinned") {
if (valid_) {
*prop = (iter_pinned_ && saved_key_.IsKeyPinned()) ? "1" : "0";
*prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
} else {
*prop = "Iterator is not valid.";
}
Expand Down Expand Up @@ -250,10 +236,13 @@ class DBIter: public Iterator {
const Slice* iterate_upper_bound_;
IterKey prefix_start_;
bool prefix_same_as_start_;
bool iter_pinned_;
// Means that we will pin all data blocks we read as long the Iterator
// is not deleted, will be true if ReadOptions::pin_data is true
const bool pin_thru_lifetime_;
// List of operands for merge operator.
MergeContext merge_context_;
LocalStatistics local_stats_;
PinnedIteratorsManager pinned_iters_mgr_;

// No copying allowed
DBIter(const DBIter&);
Expand Down Expand Up @@ -890,10 +879,7 @@ Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
DBIter* db_iter =
new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence,
false, max_sequential_skip_in_iterations, version_number,
iterate_upper_bound, prefix_same_as_start);
if (pin_data) {
db_iter->PinData();
}
iterate_upper_bound, prefix_same_as_start, pin_data);
return db_iter;
}

Expand All @@ -916,14 +902,10 @@ inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); }
inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); }
inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); }
inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); }
inline Status ArenaWrappedDBIter::PinData() { return db_iter_->PinData(); }
inline Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
std::string* prop) {
return db_iter_->GetProperty(prop_name, prop);
}
inline Status ArenaWrappedDBIter::ReleasePinnedData() {
return db_iter_->ReleasePinnedData();
}
void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
void* arg2) {
db_iter_->RegisterCleanup(function, arg1, arg2);
Expand All @@ -941,12 +923,9 @@ ArenaWrappedDBIter* NewArenaWrappedDbIterator(
DBIter* db_iter =
new (mem) DBIter(env, ioptions, user_key_comparator, nullptr, sequence,
true, max_sequential_skip_in_iterations, version_number,
iterate_upper_bound, prefix_same_as_start);
iterate_upper_bound, prefix_same_as_start, pin_data);

iter->SetDBIter(db_iter);
if (pin_data) {
iter->PinData();
}

return iter;
}
Expand Down
2 changes: 0 additions & 2 deletions db/db_iter.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,6 @@ class ArenaWrappedDBIter : public Iterator {
virtual Status status() const override;

void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
virtual Status PinData();
virtual Status ReleasePinnedData();
virtual Status GetProperty(std::string prop_name, std::string* prop) override;

private:
Expand Down
25 changes: 15 additions & 10 deletions db/memtable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "db/dbformat.h"
#include "db/merge_context.h"
#include "db/pinned_iterators_manager.h"
#include "db/writebuffer.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
Expand Down Expand Up @@ -231,13 +232,27 @@ class MemTableIterator : public InternalIterator {
}

~MemTableIterator() {
#ifndef NDEBUG
// Assert that the MemTableIterator is never deleted while
// Pinning is Enabled.
assert(!pinned_iters_mgr_ ||
(pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
#endif
if (arena_mode_) {
iter_->~Iterator();
} else {
delete iter_;
}
}

#ifndef NDEBUG
virtual void SetPinnedItersMgr(
PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
}
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif

virtual bool Valid() const override { return valid_; }
virtual void Seek(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
Expand Down Expand Up @@ -285,16 +300,6 @@ class MemTableIterator : public InternalIterator {

virtual Status status() const override { return Status::OK(); }

virtual Status PinData() override {
// memtable data is always pinned
return Status::OK();
}

virtual Status ReleasePinnedData() override {
// memtable data is always pinned
return Status::OK();
}

virtual bool IsKeyPinned() const override {
// memtable data is always pinned
return true;
Expand Down
66 changes: 66 additions & 0 deletions db/pinned_iterators_manager.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include <algorithm>
#include <memory>
#include <vector>

#include "table/internal_iterator.h"

namespace rocksdb {

// PinnedIteratorsManager will be notified whenever we need to pin an Iterator
// and it will be responsible for deleting pinned Iterators when they are
// not needed anymore.
class PinnedIteratorsManager {
public:
PinnedIteratorsManager() : pinning_enabled(false), pinned_iters_(nullptr) {}
~PinnedIteratorsManager() { assert(!pinning_enabled); }

// Enable Iterators pinning
void StartPinning() {
if (!pinning_enabled) {
pinning_enabled = true;
if (!pinned_iters_) {
pinned_iters_.reset(new std::vector<InternalIterator*>());
}
}
}

// Is pinning enabled ?
bool PinningEnabled() { return pinning_enabled; }

// Take ownership of iter if pinning is enabled and delete it when
// ReleasePinnedIterators() is called
void PinIteratorIfNeeded(InternalIterator* iter) {
if (!pinning_enabled || !iter) {
return;
}
pinned_iters_->push_back(iter);
}

// Release pinned Iterators
void ReleasePinnedIterators() {
if (pinning_enabled) {
pinning_enabled = false;

// Remove duplicate pointers
std::sort(pinned_iters_->begin(), pinned_iters_->end());
std::unique(pinned_iters_->begin(), pinned_iters_->end());

for (auto& iter : *pinned_iters_) {
delete iter;
}
pinned_iters_->clear();
}
}

private:
bool pinning_enabled;
std::unique_ptr<std::vector<InternalIterator*>> pinned_iters_;
};

} // namespace rocksdb
6 changes: 2 additions & 4 deletions include/rocksdb/iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,11 @@ class Iterator : public Cleanable {

// Property "rocksdb.iterator.is-key-pinned":
// If returning "1", this means that the Slice returned by key() is valid
// as long as the iterator is not deleted and ReleasePinnedData() is not
// called.
// as long as the iterator is not deleted.
// It is guaranteed to always return "1" if
// - Iterator created with ReadOptions::pin_data = true
// - DB tables were created with
// BlockBasedTableOptions::use_delta_encoding
// set to false.
// BlockBasedTableOptions::use_delta_encoding = false.
// Property "rocksdb.iterator.super-version-number":
// LSM version used by the iterator. The same format as DB Property
// kCurrentSuperVersionNumber. See its comment for more information.
Expand Down
2 changes: 1 addition & 1 deletion include/rocksdb/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ struct BlockBasedTableOptions {
int index_block_restart_interval = 1;

// Use delta encoding to compress keys in blocks.
// Iterator::PinData() requires this option to be disabled.
// ReadOptions::pin_data requires this option to be disabled.
//
// Default: true
bool use_delta_encoding = true;
Expand Down
22 changes: 13 additions & 9 deletions table/block.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@
#include <malloc.h>
#endif

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "db/dbformat.h"
#include "table/block_prefix_index.h"
#include "table/block_hash_index.h"
#include "table/block_prefix_index.h"
#include "table/internal_iterator.h"

#include "format.h"
Expand Down Expand Up @@ -151,15 +152,18 @@ class BlockIter : public InternalIterator {

virtual void SeekToLast() override;

virtual Status PinData() override {
// block data is always pinned.
return Status::OK();
#ifndef NDEBUG
~BlockIter() {
// Assert that the BlockIter is never deleted while Pinning is Enabled.
assert(!pinned_iters_mgr_ ||
(pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
}

virtual Status ReleasePinnedData() override {
// block data is always pinned.
return Status::OK();
virtual void SetPinnedItersMgr(
PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
}
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif

virtual bool IsKeyPinned() const override { return key_.IsKeyPinned(); }

Expand Down
23 changes: 12 additions & 11 deletions table/internal_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

namespace rocksdb {

class PinnedIteratorsManager;

class InternalIterator : public Cleanable {
public:
InternalIterator() {}
Expand Down Expand Up @@ -61,20 +63,19 @@ class InternalIterator : public Cleanable {
// satisfied without doing some IO, then this returns Status::Incomplete().
virtual Status status() const = 0;

// Make sure that all current and future data blocks used by this iterator
// will be pinned in memory and will not be released except when
// ReleasePinnedData() is called or the iterator is deleted.
virtual Status PinData() { return Status::NotSupported(""); }

// Release all blocks that were pinned because of PinData() and no future
// blocks will be pinned.
virtual Status ReleasePinnedData() { return Status::NotSupported(""); }
// Pass the PinnedIteratorsManager to the Iterator, most Iterators dont
// communicate with PinnedIteratorsManager so default implementation is no-op
// but for Iterators that need to communicate with PinnedIteratorsManager
// they will implement this function and use the passed pointer to communicate
// with PinnedIteratorsManager.
virtual void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) {}

// If true, this means that the Slice returned by key() is valid as long
// as the iterator is not deleted and ReleasePinnedData() is not called.
// If true, this means that the Slice returned by key() is valid as long as
// PinnedIteratorsManager::ReleasePinnedIterators is not called and the
// Iterator is not deleted.
//
// IsKeyPinned() is guaranteed to always return true if
// - PinData() is called
// - Iterator is created with ReadOptions::pin_data = true
// - DB tables were created with BlockBasedTableOptions::use_delta_encoding
// set to false.
virtual bool IsKeyPinned() const { return false; }
Expand Down
Loading

0 comments on commit d719b09

Please sign in to comment.