Skip to content

Commit

Permalink
Attempt to recover from db with missing table files (facebook#6334)
Browse files Browse the repository at this point in the history
Summary:
There are situations in which RocksDB tries to recover, but the db is in an inconsistent state because SST files referenced in the MANIFEST are missing. In this case, previous versions of RocksDB simply fail the recovery and return a non-ok status.
This PR enables another possibility. During recovery, RocksDB checks possible MANIFEST files and tries to recover to the most recent state that does not reference any missing table file. `VersionSet::Recover()` applies version edits incrementally and "materializes" a version only when this version does not reference any missing table file. After processing the entire MANIFEST, the version created last will be the latest version.
`DBImpl::Recover()` calls `VersionSet::Recover()`. Afterwards, WAL replay will *not* be performed.
To use this capability, set `options.best_efforts_recovery = true` when opening the db. Best-efforts recovery is currently incompatible with atomic flush.

Test plan (on devserver):
```
$make check
$COMPILE_WITH_ASAN=1 make all && make check
```
Pull Request resolved: facebook#6334

Reviewed By: anand1976

Differential Revision: D19778960

Pulled By: riversand963

fbshipit-source-id: c27ea80f29bc952e7d3311ecf5ee9c54393b40a8
  • Loading branch information
riversand963 authored and facebook-github-bot committed Mar 21, 2020
1 parent 4fc2166 commit fb09ef0
Show file tree
Hide file tree
Showing 27 changed files with 2,014 additions and 66 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,7 @@ set(SOURCES
db/trim_history_scheduler.cc
db/version_builder.cc
db/version_edit.cc
db/version_edit_handler.cc
db/version_set.cc
db/wal_manager.cc
db/write_batch.cc
Expand Down
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
### New Features
* Basic support for user timestamp in iterator. Seek/SeekToFirst/Next and lower/upper bounds are supported. Reverse iteration is not supported. Merge is not considered.
* When a file lock fails because the lock is already held by the current process, return the acquiring time and thread ID in the error message.
* Added a new option, best_efforts_recovery (default: false), to allow a database to open in a db dir with missing table files. During best-efforts recovery, missing table files are ignored, and the database recovers to the most recent state that does not reference any missing table file. Cross-column-family consistency is not guaranteed even if WAL is enabled.

## 6.8.0 (02/24/2020)
### Java API Changes
Expand Down
1 change: 1 addition & 0 deletions TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ cpp_library(
"db/trim_history_scheduler.cc",
"db/version_builder.cc",
"db/version_edit.cc",
"db/version_edit_handler.cc",
"db/version_set.cc",
"db/wal_manager.cc",
"db/write_batch.cc",
Expand Down
8 changes: 4 additions & 4 deletions db/column_family.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1397,8 +1397,8 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
const ImmutableDBOptions* db_options,
const FileOptions& file_options,
Cache* table_cache,
WriteBufferManager* write_buffer_manager,
WriteController* write_controller,
WriteBufferManager* _write_buffer_manager,
WriteController* _write_controller,
BlockCacheTracer* const block_cache_tracer)
: max_column_family_(0),
dummy_cfd_(new ColumnFamilyData(
Expand All @@ -1410,8 +1410,8 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
db_options_(db_options),
file_options_(file_options),
table_cache_(table_cache),
write_buffer_manager_(write_buffer_manager),
write_controller_(write_controller),
write_buffer_manager_(_write_buffer_manager),
write_controller_(_write_controller),
block_cache_tracer_(block_cache_tracer) {
// initialize linked list
dummy_cfd_->prev_ = dummy_cfd_;
Expand Down
8 changes: 6 additions & 2 deletions db/column_family.h
Original file line number Diff line number Diff line change
Expand Up @@ -647,8 +647,8 @@ class ColumnFamilySet {
ColumnFamilySet(const std::string& dbname,
const ImmutableDBOptions* db_options,
const FileOptions& file_options, Cache* table_cache,
WriteBufferManager* write_buffer_manager,
WriteController* write_controller,
WriteBufferManager* _write_buffer_manager,
WriteController* _write_controller,
BlockCacheTracer* const block_cache_tracer);
~ColumnFamilySet();

Expand Down Expand Up @@ -678,6 +678,10 @@ class ColumnFamilySet {

// Returns the table cache pointer held in table_cache_.
Cache* get_table_cache() { return table_cache_; }

// Returns the write buffer manager pointer held in write_buffer_manager_.
WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }

// Returns the write controller pointer held in write_controller_.
WriteController* write_controller() { return write_controller_; }

private:
friend class ColumnFamilyData;
// helper function that gets called from cfd destructor
Expand Down
152 changes: 152 additions & 0 deletions db/db_basic_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1734,6 +1734,158 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
keys.data(), values.data(), statuses.data(), true);
}

// Checks that opening with best_efforts_recovery returns the complete
// contents of every column family when no table file is missing.
//
// All writes are issued with the WAL disabled and every column family is
// flushed before the database is closed, so whatever is read back after the
// reopen must have come from table files rather than from WAL replay.
TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
  size_t num_cfs = handles_.size();
  ASSERT_EQ(3, num_cfs);
  WriteOptions write_opts;
  write_opts.disableWAL = true;
  for (size_t cf = 0; cf != num_cfs; ++cf) {
    for (size_t i = 0; i != 10000; ++i) {
      std::string key_str = Key(static_cast<int>(i));
      std::string value_str = std::to_string(cf) + "_" + std::to_string(i);

      // The write options must be passed explicitly; without them the write
      // silently uses the defaults and the WAL stays enabled.
      ASSERT_OK(Put(static_cast<int>(cf), key_str, value_str, write_opts));
      // Flush periodically so that each column family ends up with several
      // table files instead of a single one.
      if (0 == (i % 1000)) {
        ASSERT_OK(Flush(static_cast<int>(cf)));
      }
    }
  }
  // Persist whatever is still sitting in the memtables.
  for (size_t cf = 0; cf != num_cfs; ++cf) {
    ASSERT_OK(Flush(static_cast<int>(cf)));
  }
  Close();
  options.best_efforts_recovery = true;
  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
                           options);
  num_cfs = handles_.size();
  ASSERT_EQ(3, num_cfs);
  // Every key written above must be readable with its original value.
  for (size_t cf = 0; cf != num_cfs; ++cf) {
    for (int i = 0; i != 10000; ++i) {
      std::string key_str = Key(i);
      std::string expected_value_str =
          std::to_string(cf) + "_" + std::to_string(i);
      ASSERT_EQ(expected_value_str, Get(static_cast<int>(cf), key_str));
    }
  }
}

namespace {
// Event listener used by the tests below to remember, per column family
// name, the path of every table file reported through OnTableFileCreated().
class TableFileListener : public EventListener {
 public:
  // Appends the reported file path to the list kept for info.cf_name.
  void OnTableFileCreated(const TableFileCreationInfo& info) override {
    InstrumentedMutexLock guard(&mutex_);
    std::vector<std::string>& recorded = cf_to_paths_[info.cf_name];
    recorded.push_back(info.file_path);
  }

  // Returns the list of paths recorded for cf_name, creating an empty list if
  // none has been recorded yet. The returned reference outlives the lock
  // taken here, so it must not be used while table files may still be
  // reported concurrently.
  std::vector<std::string>& GetFiles(const std::string& cf_name) {
    InstrumentedMutexLock guard(&mutex_);
    std::vector<std::string>& recorded = cf_to_paths_[cf_name];
    return recorded;
  }

 private:
  // Guards cf_to_paths_.
  InstrumentedMutex mutex_;
  // Column family name -> paths of the table files created for it, in the
  // order they were reported.
  std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
};
}  // namespace

// Exercises best-efforts recovery after table files have been deleted.
//
// Each of the three column families gets three table files holding the keys
// "a", "b" and "c" respectively. For the column family at index i, the files
// at positions i..2 are then deleted, so the default column family loses
// everything, "pikachu" keeps only "a", and "eevee" keeps "a" and "b". After
// reopening with best_efforts_recovery, each column family must expose
// exactly the keys whose table files survived.
TEST_F(DBBasicTest, RecoverWithMissingFiles) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  TableFileListener* listener = new TableFileListener();
  // Without auto compaction the only table files created are the flush
  // outputs, which keeps the recorded file lists predictable.
  options.disable_auto_compactions = true;
  options.listeners.emplace_back(listener);
  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
  std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
                                           "eevee"};
  size_t num_cfs = handles_.size();
  ASSERT_EQ(3, num_cfs);
  for (size_t cf = 0; cf != num_cfs; ++cf) {
    const int cf_index = static_cast<int>(cf);
    // One key per flush, so each key lands in its own table file.
    for (const char* key : {"a", "b", "c"}) {
      ASSERT_OK(Put(cf_index, key, "0_value"));
      ASSERT_OK(Flush(cf_index));
    }
  }

  // Delete files: for the column family at index `cf`, remove the recorded
  // files at positions cf..2, newest first.
  for (size_t cf = 0; cf < all_cf_names.size(); ++cf) {
    std::vector<std::string>& files = listener->GetFiles(all_cf_names[cf]);
    ASSERT_EQ(3, files.size());
    for (size_t pos = files.size(); pos > cf; --pos) {
      ASSERT_OK(env_->DeleteFile(files[pos - 1]));
    }
  }
  options.best_efforts_recovery = true;
  ReopenWithColumnFamilies(all_cf_names, options);

  // Verify data
  ReadOptions read_opts;
  read_opts.total_order_seek = true;
  {
    // Default column family: all files were deleted, nothing is left.
    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
    iter->SeekToFirst();
    ASSERT_FALSE(iter->Valid());
  }
  {
    // "pikachu": only the file holding "a" survived.
    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[1]));
    iter->SeekToFirst();
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ("a", iter->key());
    iter->Next();
    ASSERT_FALSE(iter->Valid());
  }
  {
    // "eevee": the files holding "a" and "b" survived.
    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[2]));
    iter->SeekToFirst();
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ("a", iter->key());
    iter->Next();
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ("b", iter->key());
    iter->Next();
    ASSERT_FALSE(iter->Valid());
  }
}

// Checks that best-efforts recovery does not bring back data that was never
// flushed to a table file.
//
// Both column families flush key "a" and then write key "b" without flushing
// it. The single table file of the default column family is deleted while the
// one of "pikachu" is kept. After reopening with best_efforts_recovery, the
// default column family must be empty and "pikachu" must contain only "a";
// "b" must not reappear in either of them.
TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  TableFileListener* listener = new TableFileListener();
  options.listeners.emplace_back(listener);
  CreateAndReopenWithCF({"pikachu"}, options);
  std::vector<std::string> cf_names = {kDefaultColumnFamilyName, "pikachu"};
  size_t num_cfs = handles_.size();
  ASSERT_EQ(2, num_cfs);
  const int cf_count = static_cast<int>(cf_names.size());
  for (int cf = 0; cf != cf_count; ++cf) {
    ASSERT_OK(Put(cf, "a", "0_value"));
    ASSERT_OK(Flush(cf));
    // Deliberately left unflushed.
    ASSERT_OK(Put(cf, "b", "0_value"));
  }

  // Delete files: every column family has exactly one recorded file, so only
  // the column family at index 0 has anything removed.
  for (size_t cf = 0; cf != cf_names.size(); ++cf) {
    std::vector<std::string>& files = listener->GetFiles(cf_names[cf]);
    ASSERT_EQ(1, files.size());
    for (size_t pos = files.size(); pos > cf; --pos) {
      ASSERT_OK(env_->DeleteFile(files[pos - 1]));
    }
  }
  options.best_efforts_recovery = true;
  ReopenWithColumnFamilies(cf_names, options);

  // Verify WAL is not applied
  ReadOptions read_opts;
  read_opts.total_order_seek = true;
  {
    // Default column family: its only table file is gone.
    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
    iter->SeekToFirst();
    ASSERT_FALSE(iter->Valid());
  }
  {
    // "pikachu": the flushed "a" is present, the unflushed "b" is not.
    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[1]));
    iter->SeekToFirst();
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ("a", iter->key());
    iter->Next();
    ASSERT_FALSE(iter->Valid());
  }
}

class DBBasicTestWithParallelIO
: public DBTestBase,
public testing::WithParamInterface<std::tuple<bool, bool, bool, bool>> {
Expand Down
11 changes: 11 additions & 0 deletions db/db_impl/db_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,17 @@ class DBImpl : public DB {

virtual bool OwnTablesAndLogs() const { return true; }

// REQUIRES: db mutex held when calling this function, but the db mutex can
// be released and re-acquired. Db mutex will be held when the function
// returns.
// Currently, this function should be called only in best-efforts recovery
// mode.
// After best-efforts recovery, there may be SST files in db/cf paths that are
// not referenced in the MANIFEST. We delete these SST files. In the
// meantime, we find out the largest file number present in the paths, and
// bump up the version set's next_file_number_ to be 1 + largest_file_number.
Status CleanupFilesAfterRecovery();

private:
friend class DB;
friend class ErrorHandler;
Expand Down
52 changes: 52 additions & 0 deletions db/db_impl/db_impl_files.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "db/event_helpers.h"
#include "db/memtable_list.h"
#include "file/file_util.h"
#include "file/filename.h"
#include "file/sst_file_manager_impl.h"
#include "util/autovector.h"

Expand Down Expand Up @@ -664,4 +665,55 @@ uint64_t PrecomputeMinLogNumberToKeep(
return min_log_number_to_keep;
}

// Deletes table files found in the db/cf paths whose file number is at or
// above the version set's current next file number, and advances the version
// set's next file number past the largest file number found on disk.
//
// REQUIRES: db mutex held on entry. The mutex is released while files are
// being deleted and is held again when the function returns.
//
// Returns OK when every selected file was deleted. Otherwise returns the
// status of the first deletion that failed; files after it are not attempted.
Status DBImpl::CleanupFilesAfterRecovery() {
  mutex_.AssertHeld();

  // Collect every directory that may hold files of this db.
  std::vector<std::string> paths;
  paths.push_back(dbname_);
  for (const auto& db_path : immutable_db_options_.db_paths) {
    paths.push_back(db_path.path);
  }
  for (const auto* cfd : *versions_->GetColumnFamilySet()) {
    for (const auto& cf_path : cfd->ioptions()->cf_paths) {
      paths.push_back(cf_path.path);
    }
  }
  // Dedup paths
  std::sort(paths.begin(), paths.end());
  paths.erase(std::unique(paths.begin(), paths.end()), paths.end());

  const uint64_t next_file_number = versions_->current_next_file_number();
  uint64_t largest_file_number = next_file_number;
  // True once any file on disk carries a number >= next_file_number, i.e. a
  // number the version set could otherwise hand out again.
  bool file_number_in_use = false;
  // A set, so that a file reachable through more than one path spelling is
  // deleted only once.
  std::set<std::string> files_to_delete;
  for (const auto& path : paths) {
    std::vector<std::string> files;
    // NOTE(review): the listing status is ignored, as in the original code;
    // a directory that cannot be listed contributes no files. Confirm that
    // this best-effort behavior is intended.
    env_->GetChildren(path, &files);
    for (const auto& fname : files) {
      uint64_t number = 0;
      FileType type;
      if (!ParseFileName(fname, &number, &type)) {
        continue;
      }
      if (number < next_file_number) {
        continue;
      }
      file_number_in_use = true;
      largest_file_number = std::max(largest_file_number, number);
      if (type == kTableFile) {
        // fname is parsed as a bare file name above, so it has to be joined
        // to its directory with a separator; concatenating the two directly
        // produces a path that does not exist unless `path` happens to end
        // with one.
        files_to_delete.insert(NormalizePath(path + "/" + fname));
      }
    }
  }
  // Bump the counter whenever a number at or above it is already taken,
  // including the case where the largest number equals next_file_number.
  if (file_number_in_use) {
    versions_->next_file_number_.store(largest_file_number + 1);
  }

  // File deletion does not need the db mutex.
  mutex_.Unlock();
  Status s;
  for (const auto& fname : files_to_delete) {
    s = env_->DeleteFile(fname);
    if (!s.ok()) {
      break;
    }
  }
  mutex_.Lock();
  return s;
}

} // namespace ROCKSDB_NAMESPACE
22 changes: 20 additions & 2 deletions db/db_impl/db_impl_open.cc
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,12 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) {
"atomic_flush is incompatible with enable_pipelined_write");
}

// TODO remove this restriction
if (db_options.atomic_flush && db_options.best_efforts_recovery) {
return Status::InvalidArgument(
"atomic_flush is currently incompatible with best-efforts recovery");
}

return Status::OK();
}

Expand Down Expand Up @@ -419,7 +425,17 @@ Status DBImpl::Recover(
}
}
assert(db_id_.empty());
Status s = versions_->Recover(column_families, read_only, &db_id_);
Status s;
bool missing_table_file = false;
if (!immutable_db_options_.best_efforts_recovery) {
s = versions_->Recover(column_families, read_only, &db_id_);
} else {
s = versions_->TryRecover(column_families, read_only, &db_id_,
&missing_table_file);
if (s.ok()) {
s = CleanupFilesAfterRecovery();
}
}
if (!s.ok()) {
return s;
}
Expand Down Expand Up @@ -499,7 +515,9 @@ Status DBImpl::Recover(
// attention to it in case we are recovering a database
// produced by an older version of rocksdb.
std::vector<std::string> filenames;
s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
if (!immutable_db_options_.best_efforts_recovery) {
s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
}
if (s.IsNotFound()) {
return Status::InvalidArgument("wal_dir not found",
immutable_db_options_.wal_dir);
Expand Down
22 changes: 22 additions & 0 deletions db/version_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -528,4 +528,26 @@ Status VersionBuilder::LoadTableHandlers(
is_initial_load, prefix_extractor);
}

// Creates a VersionBuilder on top of the storage info of cfd's current
// version, and takes a reference on that version via Ref(). The destructor
// calls Unref() on the same version.
BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
ColumnFamilyData* cfd)
: version_builder_(new VersionBuilder(
cfd->current()->version_set()->file_options(), cfd->table_cache(),
cfd->current()->storage_info(), cfd->ioptions()->info_log)),
version_(cfd->current()) {
version_->Ref();
}

// Creates a VersionBuilder on top of the storage info of the given version v
// instead of cfd's current version. File options, table cache and info log
// are still taken from cfd. v must not be cfd's current version (asserted).
//
// NOTE(review): unlike the single-argument constructor, this one does not
// call Ref() on v, while the destructor calls Unref() on it unconditionally.
// Presumably the caller hands over a reference it already holds; confirm.
BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
ColumnFamilyData* cfd, Version* v)
: version_builder_(new VersionBuilder(
cfd->current()->version_set()->file_options(), cfd->table_cache(),
v->storage_info(), cfd->ioptions()->info_log)),
version_(v) {
assert(version_ != cfd->current());
}

// Drops one reference on the version this builder was constructed with.
BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() {
version_->Unref();
}

} // namespace ROCKSDB_NAMESPACE
Loading

0 comments on commit fb09ef0

Please sign in to comment.