Skip to content

Commit

Permalink
Log replay integration for secondary instance (facebook#5305)
Browse files Browse the repository at this point in the history
Summary:
RocksDB secondary can replay both MANIFEST and WAL now.
On the one hand, the memory used by memtables will grow after replaying the WAL for some time. On the other hand, replaying the MANIFEST can bring the database's persistent data to a more recent point in time, giving us the opportunity to discard some memtables containing outdated data.
This PR coordinates the MANIFEST and WAL replay, using the updates from MANIFEST replay to update the active memtable and immutable memtable list of each column family.
Pull Request resolved: facebook#5305

Differential Revision: D15386512

Pulled By: riversand963

fbshipit-source-id: a3ea6fc415f8382d8cf624f52a71ebdcffa3e355
  • Loading branch information
riversand963 authored and facebook-github-bot committed May 18, 2019
1 parent f3a7847 commit fb4c6a3
Show file tree
Hide file tree
Showing 7 changed files with 336 additions and 45 deletions.
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
### New Features
* Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
* Add an option `unordered_write` which trades snapshot guarantees for higher write throughput. When used with WRITE_PREPARED transactions, it offers higher throughput without compromising guarantees.
* Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL.

### Performance Improvements
* Reduce binary search when iterator reseek into the same data block.
Expand Down
4 changes: 2 additions & 2 deletions db/db_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1078,8 +1078,8 @@ class DBImpl : public DB {
JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);

// REQUIRES: log_numbers are sorted in ascending order
virtual Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
SequenceNumber* next_sequence, bool read_only);
Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
SequenceNumber* next_sequence, bool read_only);

// The following two methods are used to flush a memtable to
// storage. The first one is used at database RecoveryTime (when the
Expand Down
119 changes: 92 additions & 27 deletions db/db_impl_secondary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
namespace rocksdb {

#ifndef ROCKSDB_LITE

DBImplSecondary::DBImplSecondary(const DBOptions& db_options,
const std::string& dbname)
: DBImpl(db_options, dbname) {
Expand All @@ -35,6 +34,7 @@ Status DBImplSecondary::Recover(
bool /*error_if_data_exists_in_logs*/) {
mutex_.AssertHeld();

JobContext job_context(0);
Status s;
s = static_cast<ReactiveVersionSet*>(versions_.get())
->Recover(column_families, &manifest_reader_, &manifest_reporter_,
Expand All @@ -59,11 +59,29 @@ Status DBImplSecondary::Recover(
single_column_family_mode_ =
versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;

s = FindAndRecoverLogFiles();
std::unordered_set<ColumnFamilyData*> cfds_changed;
s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
}

// TODO: update options_file_number_ needed?

job_context.Clean();
return s;
}

// Find new WAL files and apply them, in order, to the secondary instance.
//
// Both `cfds_changed` and `job_context` must be non-null; they are passed
// through unchanged to RecoverLogFiles(). Returns the status of
// FindNewLogNumbers() when it fails or reports no new logs, otherwise the
// status of RecoverLogFiles().
Status DBImplSecondary::FindAndRecoverLogFiles(
    std::unordered_set<ColumnFamilyData*>* cfds_changed,
    JobContext* job_context) {
  assert(cfds_changed != nullptr);
  assert(job_context != nullptr);

  std::vector<uint64_t> new_log_numbers;
  Status s = FindNewLogNumbers(&new_log_numbers);
  if (!s.ok() || new_log_numbers.empty()) {
    // Either the lookup failed or there is nothing new to replay.
    return s;
  }

  SequenceNumber next_sequence(kMaxSequenceNumber);
  return RecoverLogFiles(new_log_numbers, &next_sequence, cfds_changed,
                         job_context);
}

Expand Down Expand Up @@ -151,7 +169,10 @@ Status DBImplSecondary::MaybeInitLogReader(
// REQUIRES: log_numbers are sorted in ascending order
Status DBImplSecondary::RecoverLogFiles(
const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
bool /*read_only*/) {
std::unordered_set<ColumnFamilyData*>* cfds_changed,
JobContext* job_context) {
assert(nullptr != cfds_changed);
assert(nullptr != job_context);
mutex_.AssertHeld();
Status status;
for (auto log_number : log_numbers) {
Expand Down Expand Up @@ -184,6 +205,39 @@ Status DBImplSecondary::RecoverLogFiles(
continue;
}
WriteBatchInternal::SetContents(&batch, record);
std::vector<uint32_t> column_family_ids;
status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
if (status.ok()) {
SequenceNumber seq = versions_->LastSequence();
for (const auto id : column_family_ids) {
ColumnFamilyData* cfd =
versions_->GetColumnFamilySet()->GetColumnFamily(id);
if (cfd == nullptr) {
continue;
}
if (cfds_changed->count(cfd) == 0) {
cfds_changed->insert(cfd);
}
auto curr_log_num = port::kMaxUint64;
if (cfd_to_current_log_.count(cfd) > 0) {
curr_log_num = cfd_to_current_log_[cfd];
}
// If the active memtable contains records added by replaying an
// earlier WAL, then we need to seal the memtable, add it to the
// immutable memtable list and create a new active memtable.
if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 ||
curr_log_num != log_number)) {
const MutableCFOptions mutable_cf_options =
*cfd->GetLatestMutableCFOptions();
MemTable* new_mem =
cfd->ConstructNewMemtable(mutable_cf_options, seq);
cfd->mem()->SetNextLogNumber(log_number);
cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free);
new_mem->Ref();
cfd->SetMemtable(new_mem);
}
}
}
// do not check sequence number because user may toggle disableWAL
// between writes which breaks sequence number continuity guarantee

Expand All @@ -194,12 +248,30 @@ Status DBImplSecondary::RecoverLogFiles(
// That's why we set ignore missing column families to true
// passing null flush_scheduler will disable memtable flushing which is
// needed for secondary instances
bool has_valid_writes = false;
status = WriteBatchInternal::InsertInto(
&batch, column_family_memtables_.get(), nullptr /* flush_scheduler */,
true, log_number, this, false /* concurrent_memtable_writes */,
next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_);
if (!status.ok()) {
if (status.ok()) {
bool has_valid_writes = false;
status = WriteBatchInternal::InsertInto(
&batch, column_family_memtables_.get(),
nullptr /* flush_scheduler */, true, log_number, this,
false /* concurrent_memtable_writes */, next_sequence,
&has_valid_writes, seq_per_batch_, batch_per_txn_);
}
if (status.ok()) {
for (const auto id : column_family_ids) {
ColumnFamilyData* cfd =
versions_->GetColumnFamilySet()->GetColumnFamily(id);
if (cfd == nullptr) {
continue;
}
std::unordered_map<ColumnFamilyData*, uint64_t>::iterator iter =
cfd_to_current_log_.find(cfd);
if (iter == cfd_to_current_log_.end()) {
cfd_to_current_log_.insert({cfd, log_number});
} else if (log_number > iter->second) {
iter->second = log_number;
}
}
} else {
// We are treating this as a failure while reading since we read valid
// blocks that do not form coherent data
reader->GetReporter()->Corruption(record.size(), status);
Expand Down Expand Up @@ -296,18 +368,6 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
return s;
}

// Find new WAL files and apply them, in order, to the secondary instance.
//
// Returns the status of FindNewLogNumbers() when it fails or reports no new
// logs, otherwise the status of RecoverLogFiles().
Status DBImplSecondary::FindAndRecoverLogFiles() {
  std::vector<uint64_t> new_log_numbers;
  Status s = FindNewLogNumbers(&new_log_numbers);
  if (!s.ok() || new_log_numbers.empty()) {
    // Either the lookup failed or there is nothing new to replay.
    return s;
  }

  SequenceNumber next_sequence(kMaxSequenceNumber);
  return RecoverLogFiles(new_log_numbers, &next_sequence, true /*read_only*/);
}

Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options,
ColumnFamilyHandle* column_family) {
if (read_options.managed) {
Expand Down Expand Up @@ -393,20 +453,25 @@ Status DBImplSecondary::TryCatchUpWithPrimary() {
Status s;
// read the manifest and apply new changes to the secondary instance
std::unordered_set<ColumnFamilyData*> cfds_changed;
JobContext job_context(0, true /*create_superversion*/);
InstrumentedMutexLock lock_guard(&mutex_);
s = static_cast<ReactiveVersionSet*>(versions_.get())
->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed);
// list wal_dir to discover new WALs and apply new changes to the secondary
// instance
if (s.ok()) {
s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
}
if (s.ok()) {
SuperVersionContext sv_context(true /* create_superversion */);
for (auto cfd : cfds_changed) {
sv_context.NewSuperVersion();
cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
&job_context.memtables_to_free);
auto& sv_context = job_context.superversion_contexts.back();
cfd->InstallSuperVersion(&sv_context, &mutex_);
sv_context.NewSuperVersion();
}
sv_context.Clean();
job_context.Clean();
}
// list wal_dir to discover new WALs and apply new changes to the secondary
// instance
s = FindAndRecoverLogFiles();
return s;
}

Expand Down
Loading

0 comments on commit fb4c6a3

Please sign in to comment.