Skip to content

Commit

Permalink
Blob DB: Improve FIFO eviction
Browse files Browse the repository at this point in the history
Summary:
Improving blob db FIFO eviction with the following changes,
* Change blob_dir_size to max_db_size. Take into account SST file size when computing DB size.
* FIFO now only takes into account live sst files and live blob files. It is normal for disk usage to go over max_db_size because there are obsolete sst files and blob files pending deletion.
* FIFO eviction now also evicts TTL blob files that are still open. It doesn't evict non-TTL blob files.
* If FIFO is triggered, it will pass an expiration and the current sequence number to the compaction filter. The compaction filter will then filter inlined keys to evict those with an earlier expiration and a smaller sequence number. This is the so-called LSM FIFO.
* The compaction filter also filters out those blob indexes whose corresponding blob file is gone.
* Add an event listener to listen for compaction/flush events and update the sst file size.
* Implement DB::Close() to make sure base db, as well as event listener and compaction filter, destruct before blob db.
* More blob db statistics around FIFO.
* Fix some locking issues when accessing a blob file.
Closes facebook#3556

Differential Revision: D7139328

Pulled By: yiwu-arbug

fbshipit-source-id: ea5edb07b33dfceacb2682f4789bea61de28bbfa
  • Loading branch information
Yi Wu authored and facebook-github-bot committed Mar 6, 2018
1 parent 0a2354c commit b864bc9
Show file tree
Hide file tree
Showing 15 changed files with 823 additions and 254 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,7 @@ set(SOURCES
util/transaction_test_util.cc
util/xxhash.cc
utilities/backupable/backupable_db.cc
utilities/blob_db/blob_compaction_filter.cc
utilities/blob_db/blob_db.cc
utilities/blob_db/blob_db_impl.cc
utilities/blob_db/blob_dump_tool.cc
Expand Down
1 change: 1 addition & 0 deletions TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ cpp_library(
"util/transaction_test_util.cc",
"util/xxhash.cc",
"utilities/backupable/backupable_db.cc",
"utilities/blob_db/blob_compaction_filter.cc",
"utilities/blob_db/blob_db.cc",
"utilities/blob_db/blob_db_impl.cc",
"utilities/blob_db/blob_dump_tool.cc",
Expand Down
5 changes: 4 additions & 1 deletion db/compaction_iterator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,13 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
CompactionFilter::ValueType value_type =
ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
: CompactionFilter::ValueType::kBlobIndex;
// Hack: pass internal key to BlobIndexCompactionFilter since it needs
// to get sequence number.
Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_;
{
StopWatchNano timer(env_, true);
filter = compaction_filter_->FilterV2(
compaction_->level(), ikey_.user_key, value_type, value_,
compaction_->level(), filter_key, value_type, value_,
&compaction_filter_value_, compaction_filter_skip_until_.rep());
iter_stats_.total_filter_time +=
env_ != nullptr ? timer.ElapsedNanos() : 0;
Expand Down
18 changes: 16 additions & 2 deletions include/rocksdb/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,16 @@ enum Tickers : uint32_t {
BLOB_DB_BLOB_FILE_SYNCED,
// # of blob index evicted from base DB by BlobDB compaction filter because
// of expiration.
BLOB_DB_BLOB_INDEX_EXPIRED,
BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
// size of blob index evicted from base DB by BlobDB compaction filter
// because of expiration.
BLOB_DB_BLOB_INDEX_EXPIRED_SIZE,
// # of blob index evicted from base DB by BlobDB compaction filter because
// of corresponding file deleted.
BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
// size of blob index evicted from base DB by BlobDB compaction filter
// because of corresponding file deleted.
BLOB_DB_BLOB_INDEX_EVICTED_SIZE,
// # of blob files being garbage collected.
BLOB_DB_GC_NUM_FILES,
// # of blob files generated by garbage collection.
Expand Down Expand Up @@ -417,7 +426,12 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"},
{BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file.bytes.read"},
{BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"},
{BLOB_DB_BLOB_INDEX_EXPIRED, "rocksdb.blobdb.blob.index.expired"},
{BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
"rocksdb.blobdb.blob.index.expired.count"},
{BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, "rocksdb.blobdb.blob.index.expired.size"},
{BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
"rocksdb.blobdb.blob.index.evicted.count"},
{BLOB_DB_BLOB_INDEX_EVICTED_SIZE, "rocksdb.blobdb.blob.index.evicted.size"},
{BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"},
{BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"},
{BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"},
Expand Down
1 change: 1 addition & 0 deletions src.mk
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ LIB_SOURCES = \
util/transaction_test_util.cc \
util/xxhash.cc \
utilities/backupable/backupable_db.cc \
utilities/blob_db/blob_compaction_filter.cc \
utilities/blob_db/blob_db.cc \
utilities/blob_db/blob_db_impl.cc \
utilities/blob_db/blob_file.cc \
Expand Down
4 changes: 2 additions & 2 deletions tools/db_bench_tool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,7 @@ DEFINE_bool(blob_db_enable_gc, false, "Enable BlobDB garbage collection.");

DEFINE_bool(blob_db_is_fifo, false, "Enable FIFO eviction strategy in BlobDB.");

DEFINE_uint64(blob_db_dir_size, 0,
DEFINE_uint64(blob_db_max_db_size, 0,
"Max size limit of the directory where blob files are stored.");

DEFINE_uint64(blob_db_max_ttl_range, 86400,
Expand Down Expand Up @@ -3446,7 +3446,7 @@ void VerifyDBFromDB(std::string& truth_db_name) {
blob_db::BlobDBOptions blob_db_options;
blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
blob_db_options.blob_dir_size = FLAGS_blob_db_dir_size;
blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
Expand Down
117 changes: 117 additions & 0 deletions utilities/blob_db/blob_compaction_filter.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#ifndef ROCKSDB_LITE

#include "utilities/blob_db/blob_compaction_filter.h"

#include <utility>

#include "db/dbformat.h"

namespace rocksdb {
namespace blob_db {

namespace {

// CompactionFilter to delete expired blob index from base DB.
class BlobIndexCompactionFilter : public CompactionFilter {
public:
BlobIndexCompactionFilter(BlobCompactionContext context,
uint64_t current_time, Statistics* statistics)
: context_(context),
current_time_(current_time),
statistics_(statistics) {}

virtual ~BlobIndexCompactionFilter() {
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, expired_count_);
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, expired_size_);
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_COUNT, evicted_count_);
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_SIZE, evicted_size_);
}

virtual const char* Name() const override {
return "BlobIndexCompactionFilter";
}

// Filter expired blob indexes regardless of snapshots.
virtual bool IgnoreSnapshots() const override { return true; }

virtual Decision FilterV2(int /*level*/, const Slice& key,
ValueType value_type, const Slice& value,
std::string* /*new_value*/,
std::string* /*skip_until*/) const override {
if (value_type != kBlobIndex) {
return Decision::kKeep;
}
BlobIndex blob_index;
Status s = blob_index.DecodeFrom(value);
if (!s.ok()) {
// Unable to decode blob index. Keeping the value.
return Decision::kKeep;
}
if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) {
// Expired
expired_count_++;
expired_size_ += key.size() + value.size();
return Decision::kRemove;
}
if (!blob_index.IsInlined() &&
blob_index.file_number() < context_.next_file_number &&
context_.current_blob_files.count(blob_index.file_number()) == 0) {
// Corresponding blob file gone. Could have been garbage collected or
// evicted by FIFO eviction.
evicted_count_++;
evicted_size_ += key.size() + value.size();
return Decision::kRemove;
}
if (context_.fifo_eviction_seq > 0 && blob_index.HasTTL() &&
blob_index.expiration() < context_.evict_expiration_up_to) {
// Hack: Internal key is passed to BlobIndexCompactionFilter for it to
// get sequence number.
ParsedInternalKey ikey;
bool ok = ParseInternalKey(key, &ikey);
// Remove keys that could have been remove by last FIFO eviction.
// If get error while parsing key, ignore and continue.
if (ok && ikey.sequence < context_.fifo_eviction_seq) {
evicted_count_++;
evicted_size_ += key.size() + value.size();
return Decision::kRemove;
}
}
return Decision::kKeep;
}

private:
BlobCompactionContext context_;
const uint64_t current_time_;
Statistics* statistics_;
// It is safe to not using std::atomic since the compaction filter, created
// from a compaction filter factroy, will not be called from multiple threads.
mutable uint64_t expired_count_ = 0;
mutable uint64_t expired_size_ = 0;
mutable uint64_t evicted_count_ = 0;
mutable uint64_t evicted_size_ = 0;
};

} // anonymous namespace

// Builds a BlobIndexCompactionFilter for one compaction run.
//
// Reads the current time from `env_` and a compaction context from
// `blob_db_impl_`, then hands both to a new filter together with
// `statistics_`. Returns nullptr if the current time cannot be obtained.
std::unique_ptr<CompactionFilter>
BlobIndexCompactionFilterFactory::CreateCompactionFilter(
    const CompactionFilter::Context& /*context*/) {
  int64_t current_time = 0;
  Status s = env_->GetCurrentTime(&current_time);
  if (!s.ok()) {
    return nullptr;
  }
  assert(current_time >= 0);

  BlobCompactionContext context;
  blob_db_impl_->GetCompactionContext(&context);

  // `context` is not used again, so move it into the filter instead of
  // copying the set of blob file numbers it carries.
  return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter(
      std::move(context), static_cast<uint64_t>(current_time), statistics_));
}

} // namespace blob_db
} // namespace rocksdb
#endif // ROCKSDB_LITE
69 changes: 13 additions & 56 deletions utilities/blob_db/blob_compaction_filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,82 +5,39 @@
#pragma once
#ifndef ROCKSDB_LITE

#include <unordered_set>

#include "monitoring/statistics.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "utilities/blob_db/blob_db_impl.h"
#include "utilities/blob_db/blob_index.h"

namespace rocksdb {
namespace blob_db {

// CompactionFilter to delete expired blob index from base DB.
class BlobIndexCompactionFilter : public CompactionFilter {
public:
BlobIndexCompactionFilter(uint64_t current_time, Statistics* statistics)
: current_time_(current_time), statistics_(statistics) {}

virtual ~BlobIndexCompactionFilter() {
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED, expired_count_);
}

virtual const char* Name() const override {
return "BlobIndexCompactionFilter";
}

// Filter expired blob indexes regardless of snapshots.
virtual bool IgnoreSnapshots() const override { return true; }

virtual Decision FilterV2(int /*level*/, const Slice& /*key*/,
ValueType value_type, const Slice& value,
std::string* /*new_value*/,
std::string* /*skip_until*/) const override {
if (value_type != kBlobIndex) {
return Decision::kKeep;
}
BlobIndex blob_index;
Status s = blob_index.DecodeFrom(value);
if (!s.ok()) {
// Unable to decode blob index. Keeping the value.
return Decision::kKeep;
}
if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) {
// Expired
expired_count_++;
return Decision::kRemove;
}
return Decision::kKeep;
}

private:
const uint64_t current_time_;
Statistics* statistics_;
// It is safe to not using std::atomic since the compaction filter, created
// from a compaction filter factroy, will not be called from multiple threads.
mutable uint64_t expired_count_ = 0;
// State handed to the blob index compaction filter when it is created.
//
// Scalar members default to zero so that a default-constructed context never
// carries indeterminate values. With all-zero values the filter's
// "blob file gone" check (file_number < next_file_number) and its FIFO check
// (fifo_eviction_seq > 0) are both disabled.
struct BlobCompactionContext {
  // Blob index entries referring to files numbered at or above this value are
  // never treated as pointing to a missing file.
  uint64_t next_file_number = 0;
  // File numbers checked by the filter to decide whether a blob file still
  // exists.
  std::unordered_set<uint64_t> current_blob_files;
  // When non-zero, TTL entries with a smaller sequence number are candidates
  // for FIFO eviction.
  SequenceNumber fifo_eviction_seq = 0;
  // FIFO eviction only applies to TTL entries expiring before this value.
  uint64_t evict_expiration_up_to = 0;
};

class BlobIndexCompactionFilterFactory : public CompactionFilterFactory {
public:
BlobIndexCompactionFilterFactory(Env* env, Statistics* statistics)
: env_(env), statistics_(statistics) {}
BlobIndexCompactionFilterFactory(BlobDBImpl* blob_db_impl, Env* env,
Statistics* statistics)
: blob_db_impl_(blob_db_impl), env_(env), statistics_(statistics) {}

virtual const char* Name() const override {
return "BlobIndexCompactionFilterFactory";
}

virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& /*context*/) override {
int64_t current_time = 0;
Status s = env_->GetCurrentTime(&current_time);
if (!s.ok()) {
return nullptr;
}
assert(current_time >= 0);
return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter(
static_cast<uint64_t>(current_time), statistics_));
}
const CompactionFilter::Context& /*context*/) override;

private:
BlobDBImpl* blob_db_impl_;
Env* env_;
Statistics* statistics_;
};
Expand Down
66 changes: 42 additions & 24 deletions utilities/blob_db/blob_db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,30 +63,48 @@ Status BlobDB::Open(const DBOptions& db_options,
BlobDB::BlobDB() : StackableDB(nullptr) {}

void BlobDBOptions::Dump(Logger* log) const {
ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir: %s",
blob_dir.c_str());
ROCKS_LOG_HEADER(log, " blob_db_options.path_relative: %d",
path_relative);
ROCKS_LOG_HEADER(log, " blob_db_options.is_fifo: %d",
is_fifo);
ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir_size: %" PRIu64,
blob_dir_size);
ROCKS_LOG_HEADER(log, " blob_db_options.ttl_range_secs: %" PRIu32,
ttl_range_secs);
ROCKS_LOG_HEADER(log, " blob_db_options.min_blob_size: %" PRIu64,
min_blob_size);
ROCKS_LOG_HEADER(log, " blob_db_options.bytes_per_sync: %" PRIu64,
bytes_per_sync);
ROCKS_LOG_HEADER(log, " blob_db_options.blob_file_size: %" PRIu64,
blob_file_size);
ROCKS_LOG_HEADER(log, " blob_db_options.ttl_extractor: %p",
ttl_extractor.get());
ROCKS_LOG_HEADER(log, " blob_db_options.compression: %d",
static_cast<int>(compression));
ROCKS_LOG_HEADER(log, "blob_db_options.enable_garbage_collection: %d",
enable_garbage_collection);
ROCKS_LOG_HEADER(log, " blob_db_options.disable_background_tasks: %d",
disable_background_tasks);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.blob_dir: %s",
blob_dir.c_str());
ROCKS_LOG_HEADER(
log, " BlobDBOptions.path_relative: %d",
path_relative);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.is_fifo: %d",
is_fifo);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.max_db_size: %" PRIu64,
max_db_size);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.ttl_range_secs: %" PRIu32,
ttl_range_secs);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.min_blob_size: %" PRIu64,
min_blob_size);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.bytes_per_sync: %" PRIu64,
bytes_per_sync);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.blob_file_size: %" PRIu64,
blob_file_size);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.ttl_extractor: %p",
ttl_extractor.get());
ROCKS_LOG_HEADER(
log, " BlobDBOptions.compression: %d",
static_cast<int>(compression));
ROCKS_LOG_HEADER(
log, " BlobDBOptions.enable_garbage_collection: %d",
enable_garbage_collection);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.garbage_collection_interval_secs: %" PRIu64,
garbage_collection_interval_secs);
ROCKS_LOG_HEADER(
log, "BlobDBOptions.garbage_collection_deletion_size_threshold: %lf",
garbage_collection_deletion_size_threshold);
ROCKS_LOG_HEADER(
log, " BlobDBOptions.disable_background_tasks: %d",
disable_background_tasks);
}

} // namespace blob_db
Expand Down
Loading

0 comments on commit b864bc9

Please sign in to comment.