Skip to content

Commit

Permalink
Mempurge support for wal (facebook#8528)
Browse files Browse the repository at this point in the history
Summary:
In this PR, `mempurge` is made compatible with the Write Ahead Log: in case of recovery, the DB is now capable of recovering the data that was "mempurged" and kept in the `imm()` list of immutable memtables.
The twist was to add a uint64_t to the `memtable` struct to store the number of the earliest log file containing entries from the `memtable`. When a `Flush` operation is replaced with a `MemPurge`, the `VersionEdit` (which usually contains the new min log file number to pick up for recovery and the level 0 file path of the newly created SST file) is no longer appended to the manifest log, and every time the `deleteWal` method is called, a check is made on the list of immutable memtables.
This PR also includes a unit test that verifies that no data is lost upon Reopening of the database when the mempurge feature is activated. This extensive unit test includes two column families, with valid data contained in the imm() at time of "crash"/reopening (recovery).

Pull Request resolved: facebook#8528

Reviewed By: pdillinger

Differential Revision: D29701097

Pulled By: bjlemaire

fbshipit-source-id: 072a900fb6ccc1edcf5eef6caf88f3060238edf9
  • Loading branch information
bjlemaire authored and facebook-github-bot committed Jul 16, 2021
1 parent 4e4ec16 commit 206845c
Show file tree
Hide file tree
Showing 17 changed files with 355 additions and 37 deletions.
11 changes: 7 additions & 4 deletions db/column_family.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1059,17 +1059,20 @@ uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
}

MemTable* ColumnFamilyData::ConstructNewMemtable(
const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq,
uint64_t log_number) {
return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
write_buffer_manager_, earliest_seq, id_);
write_buffer_manager_, earliest_seq, id_, log_number);
}

void ColumnFamilyData::CreateNewMemtable(
const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq,
uint64_t log_number) {
if (mem_ != nullptr) {
delete mem_->Unref();
}
SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
SetMemtable(
ConstructNewMemtable(mutable_cf_options, earliest_seq, log_number));
mem_->Ref();
}

Expand Down
5 changes: 3 additions & 2 deletions db/column_family.h
Original file line number Diff line number Diff line change
Expand Up @@ -371,9 +371,10 @@ class ColumnFamilyData {

// See Memtable constructor for explanation of earliest_seq param.
MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
SequenceNumber earliest_seq);
SequenceNumber earliest_seq,
uint64_t log_number = 0);
void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
SequenceNumber earliest_seq);
SequenceNumber earliest_seq, uint64_t log_number = 0);

TableCache* table_cache() const { return table_cache_.get(); }
BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); }
Expand Down
170 changes: 170 additions & 0 deletions db/db_flush_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,176 @@ TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) {
ASSERT_EQ(Get(KEY5), p_v5);
}

// Verify that the mempurge feature is compatible with WAL recovery:
// key-value pairs that were "mempurged" (i.e., kept in the imm() list
// of immutable memtables instead of being flushed to an SST file) must
// survive a DB reopen. Exercised with two column families, before and
// after SST flushes, and repeated for each WAL option combination via
// ChangeWalOptions().
TEST_F(DBFlushTest, MemPurgeWALSupport) {
  Options options = CurrentOptions();

  options.statistics = CreateDBStatistics();
  options.statistics->set_stats_level(StatsLevel::kAll);
  options.create_if_missing = true;
  options.compression = kNoCompression;
  options.inplace_update_support = false;
  options.allow_concurrent_memtable_write = true;

  // Enforce size of a single MemTable to 128KB (128 << 10 bytes).
  options.write_buffer_size = 128 << 10;
  // Activate the MemPurge prototype.
  options.experimental_allow_mempurge = true;
  ASSERT_OK(TryReopen(options));

  const size_t KVSIZE = 10;

  do {
    CreateAndReopenWithCF({"pikachu"}, options);
    ASSERT_OK(Put(1, "foo", "v1"));
    ASSERT_OK(Put(1, "baz", "v5"));

    ReopenWithColumnFamilies({"default", "pikachu"}, options);
    ASSERT_EQ("v1", Get(1, "foo"));

    ASSERT_EQ("v1", Get(1, "foo"));
    ASSERT_EQ("v5", Get(1, "baz"));
    ASSERT_OK(Put(0, "bar", "v2"));
    ASSERT_OK(Put(1, "bar", "v2"));
    ASSERT_OK(Put(1, "foo", "v3"));
    // Count successful mempurges and SST file creations via sync
    // points, so we can later assert that flushes were replaced by
    // mempurges (mempurge_count > 0, sst_count == 0).
    uint32_t mempurge_count = 0;
    uint32_t sst_count = 0;
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "DBImpl::FlushJob:MemPurgeSuccessful",
        [&](void* /*arg*/) { mempurge_count++; });
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

    std::vector<std::string> keys;
    for (size_t k = 0; k < KVSIZE; k++) {
      keys.push_back("IamKey" + std::to_string(k));
    }

    std::string RNDKEY, RNDVALUE;
    const std::string NOT_FOUND = "NOT_FOUND";

    // Heavy overwrite workload,
    // more than would fit in maximum allowed memtables.
    Random rnd(719);
    const size_t NUM_REPEAT = 100;
    const size_t RAND_KEY_LENGTH = 8192;
    const size_t RAND_VALUES_LENGTH = 1024;
    std::vector<std::string> values_default(KVSIZE), values_pikachu(KVSIZE);

    // Insert a very first set of keys that will be
    // mempurged at least once.
    for (size_t k = 0; k < KVSIZE / 2; k++) {
      values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
      values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
    }

    // Insert keys[0:KVSIZE/2] to
    // both 'default' and 'pikachu' CFs.
    for (size_t k = 0; k < KVSIZE / 2; k++) {
      ASSERT_OK(Put(0, keys[k], values_default[k]));
      ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
    }

    // Check that the insertion was seamless.
    for (size_t k = 0; k < KVSIZE / 2; k++) {
      ASSERT_EQ(Get(0, keys[k]), values_default[k]);
      ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
    }

    // Insertion of K-V pairs, multiple times (overwrites)
    // into 'default' CF. Will trigger mempurge.
    for (size_t j = 0; j < NUM_REPEAT; j++) {
      // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
        values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
      }

      // Insert K-V into default CF.
      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
        ASSERT_OK(Put(0, keys[k], values_default[k]));
      }

      // Check key validity, for all keys, both in
      // default and pikachu CFs.
      for (size_t k = 0; k < KVSIZE; k++) {
        ASSERT_EQ(Get(0, keys[k]), values_default[k]);
      }
      // Note that at this point, only keys[0:KVSIZE/2]
      // have been inserted into Pikachu.
      for (size_t k = 0; k < KVSIZE / 2; k++) {
        ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
      }
    }

    // Insertion of K-V pairs, multiple times (overwrites)
    // into 'pikachu' CF. Will trigger mempurge.
    // Check that we keep the older logs for 'default' imm().
    for (size_t j = 0; j < NUM_REPEAT; j++) {
      // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
        values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
      }

      // Insert K-V into pikachu CF.
      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
        ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
      }

      // Check key validity, for all keys,
      // both in default and pikachu.
      for (size_t k = 0; k < KVSIZE; k++) {
        ASSERT_EQ(Get(0, keys[k]), values_default[k]);
        ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
      }
    }

    // Check that there was at least one mempurge
    const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
    // Check that no SST files were created during flush.
    const uint32_t EXPECTED_SST_COUNT = 0;

    EXPECT_GE(mempurge_count, EXPECTED_MIN_MEMPURGE_COUNT);
    EXPECT_EQ(sst_count, EXPECTED_SST_COUNT);

    ReopenWithColumnFamilies({"default", "pikachu"}, options);
    // Check that there was no data corruption anywhere,
    // not in 'default' nor in 'Pikachu' CFs.
    ASSERT_EQ("v3", Get(1, "foo"));
    ASSERT_OK(Put(1, "foo", "v4"));
    ASSERT_EQ("v4", Get(1, "foo"));
    ASSERT_EQ("v2", Get(1, "bar"));
    ASSERT_EQ("v5", Get(1, "baz"));
    // Check keys in 'Default' and 'Pikachu'.
    // keys[0:KVSIZE/2] were for sure contained
    // in the imm() at Reopen/recovery time.
    for (size_t k = 0; k < KVSIZE; k++) {
      ASSERT_EQ(Get(0, keys[k]), values_default[k]);
      ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
    }
    // Insertion of random K-V pairs to trigger
    // a flush in the Pikachu CF.
    for (size_t j = 0; j < NUM_REPEAT; j++) {
      RNDKEY = rnd.RandomString(RAND_KEY_LENGTH);
      RNDVALUE = rnd.RandomString(RAND_VALUES_LENGTH);
      ASSERT_OK(Put(1, RNDKEY, RNDVALUE));
    }
    // Assert that there was at least one flush to storage.
    EXPECT_GT(sst_count, EXPECTED_SST_COUNT);
    ReopenWithColumnFamilies({"default", "pikachu"}, options);
    ASSERT_EQ("v4", Get(1, "foo"));
    ASSERT_EQ("v2", Get(1, "bar"));
    ASSERT_EQ("v5", Get(1, "baz"));
    // Since values in default are held in mutable mem()
    // and imm(), check if the flush in pikachu didn't
    // affect these values.
    for (size_t k = 0; k < KVSIZE; k++) {
      ASSERT_EQ(Get(0, keys[k]), values_default[k]);
      ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
    }
    // RNDKEY/RNDVALUE hold the last random pair written above;
    // verify it survived the reopen.
    ASSERT_EQ(Get(1, RNDKEY), RNDVALUE);
  } while (ChangeWalOptions());
}

TEST_P(DBFlushDirectIOTest, DirectIO) {
Options options;
options.create_if_missing = true;
Expand Down
16 changes: 16 additions & 0 deletions db/db_impl/db_impl_compaction_flush.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2397,6 +2397,17 @@ void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
assert(flush_req.size() == 1);
ColumnFamilyData* cfd = flush_req[0].first;
assert(cfd);
// Note: SchedulePendingFlush is always preceded
// with an imm()->FlushRequested() call. However,
// we want to make this code snippet more resilient to
// future changes. Therefore, we add the following if
// statement - note that calling it twice (or more)
// doesn't break anything.
if (immutable_db_options_.experimental_allow_mempurge) {
// If imm() contains silent memtables,
// requesting a flush will mark the imm_needed as true.
cfd->imm()->FlushRequested();
}
if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) {
cfd->Ref();
cfd->set_queued_for_flush(true);
Expand Down Expand Up @@ -2538,6 +2549,11 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,

for (const auto& iter : flush_req) {
ColumnFamilyData* cfd = iter.first;
if (immutable_db_options_.experimental_allow_mempurge) {
// If imm() contains silent memtables,
// requesting a flush will mark the imm_needed as true.
cfd->imm()->FlushRequested();
}
if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
// can't flush this CF, try next one
column_families_not_to_flush.push_back(cfd);
Expand Down
7 changes: 4 additions & 3 deletions db/db_impl/db_impl_open.cc
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@ Status DBImpl::Recover(
// Clear memtables if recovery failed
for (auto cfd : *versions_->GetColumnFamilySet()) {
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
kMaxSequenceNumber);
kMaxSequenceNumber, cfd->GetLogNumber());
}
}
}
Expand Down Expand Up @@ -1066,7 +1066,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
flushed = true;

cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
*next_sequence);
*next_sequence, cfd->GetLogNumber());
}
}
}
Expand Down Expand Up @@ -1204,7 +1204,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
flushed = true;

cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
versions_->LastSequence());
versions_->LastSequence(),
cfd->GetLogNumber());
}
data_seen = true;
}
Expand Down
4 changes: 2 additions & 2 deletions db/db_impl/db_impl_secondary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,8 @@ Status DBImplSecondary::RecoverLogFiles(
curr_log_num != log_number)) {
const MutableCFOptions mutable_cf_options =
*cfd->GetLatestMutableCFOptions();
MemTable* new_mem =
cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch);
MemTable* new_mem = cfd->ConstructNewMemtable(
mutable_cf_options, seq_of_batch, log_number);
cfd->mem()->SetNextLogNumber(log_number);
cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free);
new_mem->Ref();
Expand Down
3 changes: 2 additions & 1 deletion db/db_impl/db_impl_write.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1805,7 +1805,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
}
if (s.ok()) {
SequenceNumber seq = versions_->LastSequence();
new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
new_mem =
cfd->ConstructNewMemtable(mutable_cf_options, seq, new_log_number);
context->superversion_context.NewSuperVersion();
}
ROCKS_LOG_INFO(immutable_db_options_.info_log,
Expand Down
50 changes: 39 additions & 11 deletions db/flush_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,10 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker,
s = cfd_->imm()->TryInstallMemtableFlushResults(
cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_,
meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
log_buffer_, &committed_flush_jobs_info_, &tmp_io_s);
log_buffer_, &committed_flush_jobs_info_, &tmp_io_s,
!(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted),
but 'false' if mempurge successful: no new min log number
or new level 0 file path to write to manifest. */);
if (!tmp_io_s.ok()) {
io_status_ = tmp_io_s;
}
Expand Down Expand Up @@ -330,6 +333,17 @@ void FlushJob::Cancel() {
base_->Unref();
}

uint64_t FlushJob::ExtractEarliestLogFileNumber() {
uint64_t earliest_logno = 0;
for (MemTable* m : mems_) {
uint64_t logno = m->GetEarliestLogFileNumber();
if (logno > 0 && (earliest_logno == 0 || logno < earliest_logno)) {
earliest_logno = logno;
}
}
return earliest_logno;
}

Status FlushJob::MemPurge() {
Status s;
db_mutex_->AssertHeld();
Expand All @@ -356,12 +370,25 @@ Status FlushJob::MemPurge() {
}

assert(!memtables.empty());
SequenceNumber first_seqno = mems_[0]->GetFirstSequenceNumber();
SequenceNumber earliest_seqno = mems_[0]->GetEarliestSequenceNumber();
SequenceNumber first_seqno = kMaxSequenceNumber;
SequenceNumber earliest_seqno = kMaxSequenceNumber;
// Pick first and earliest seqno as min of all first_seqno
// and earliest_seqno of the mempurged memtables.
for (const auto& mem : mems_) {
first_seqno = mem->GetFirstSequenceNumber() < first_seqno
? mem->GetFirstSequenceNumber()
: first_seqno;
earliest_seqno = mem->GetEarliestSequenceNumber() < earliest_seqno
? mem->GetEarliestSequenceNumber()
: earliest_seqno;
}

ScopedArenaIterator iter(
NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(),
static_cast<int>(memtables.size()), &arena));

uint64_t earliest_logno = ExtractEarliestLogFileNumber();

auto* ioptions = cfd_->ioptions();

// Place iterator at the First (meaning most recent) key node.
Expand Down Expand Up @@ -400,12 +427,9 @@ Status FlushJob::MemPurge() {
}
}

// mems are ordered by increasing ID, so mems_[0]->GetID
// returns the smallest memtable ID.
new_mem =
new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()),
mutable_cf_options_, cfd_->write_buffer_mgr(),
mems_[0]->GetEarliestSequenceNumber(), cfd_->GetID());
new_mem = new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()),
mutable_cf_options_, cfd_->write_buffer_mgr(),
earliest_seqno, cfd_->GetID(), earliest_logno);
assert(new_mem != nullptr);

Env* env = db_options_.env;
Expand Down Expand Up @@ -530,8 +554,12 @@ Status FlushJob::MemPurge() {
// only if it filled at less than 60% capacity (arbitrary heuristic).
if (new_mem->ApproximateMemoryUsage() < maxSize) {
db_mutex_->Lock();
cfd_->imm()
->Add(new_mem, &job_context_->memtables_to_free, false /* trigger_flush. Adding this memtable will not trigger any flush */);
cfd_->imm()->Add(new_mem,
&job_context_->memtables_to_free,
false /* -> trigger_flush=false:
* adding this memtable
* will not trigger a flush.
*/);
new_mem->Ref();
db_mutex_->Unlock();
} else {
Expand Down
1 change: 1 addition & 0 deletions db/flush_job.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ class FlushJob {
// recommend all users not to set this flag as true given that the MemPurge
// process has not matured yet.
Status MemPurge();
uint64_t ExtractEarliestLogFileNumber();
#ifndef ROCKSDB_LITE
std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
#endif // !ROCKSDB_LITE
Expand Down
Loading

0 comments on commit 206845c

Please sign in to comment.