Skip to content

Commit

Permalink
Provide an option so that SST ingestion won't fall back to copy after…
Browse files Browse the repository at this point in the history
… hard linking fails (facebook#5333)

Summary:
RocksDB always tries to perform a hard link operation on the external SST file to ingest. This operation can fail if the external SST resides on a different device/FS, or the underlying FS does not support hard link. Currently RocksDB assumes that if the link fails, the user is willing to perform file copy, which is not true according to the post. This commit provides an option named  'failed_move_fall_back_to_copy' for users to choose which behavior they want.
Pull Request resolved: facebook#5333

Differential Revision: D15457597

Pulled By: HaoyuHuang

fbshipit-source-id: f3626e13f845db4f7ed970a53ec8a2b1f0d62214
  • Loading branch information
HaoyuHuang authored and facebook-github-bot committed May 24, 2019
1 parent 6a54278 commit 74a334a
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 27 deletions.
3 changes: 2 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
* Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees.
* Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL.
* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error.

### Performance Improvements
* Reduce binary search when iterator reseek into the same data block.
Expand All @@ -20,7 +21,7 @@

### Bug Fixes


## 6.2.0 (4/30/2019)
### New Features
* Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`.
Expand Down
19 changes: 10 additions & 9 deletions db/external_sst_file_ingestion_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,26 +92,27 @@ Status ExternalSstFileIngestionJob::Prepare(
// Copy/Move external files into DB
for (IngestedFileInfo& f : files_to_ingest_) {
f.fd = FileDescriptor(next_file_number++, 0, f.file_size);

f.copy_file = false;
const std::string path_outside_db = f.external_file_path;
const std::string path_inside_db =
TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(),
f.fd.GetPathId());

if (ingestion_options_.move_files) {
status = env_->LinkFile(path_outside_db, path_inside_db);
if (status.IsNotSupported()) {
// Original file is on a different FS, use copy instead of hard linking
status = CopyFile(env_, path_outside_db, path_inside_db, 0,
db_options_.use_fsync);
if (status.IsNotSupported() &&
ingestion_options_.failed_move_fall_back_to_copy) {
// Original file is on a different FS, use copy instead of hard linking.
f.copy_file = true;
} else {
f.copy_file = false;
}
} else {
f.copy_file = true;
}

if (f.copy_file) {
TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
nullptr);
status = CopyFile(env_, path_outside_db, path_inside_db, 0,
db_options_.use_fsync);
f.copy_file = true;
}
TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
if (!status.ok()) {
Expand Down
109 changes: 92 additions & 17 deletions db/external_sst_file_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,54 @@

namespace rocksdb {

// A test environment that can be configured to fail the Link operation.
class ExternalSSTTestEnv : public EnvWrapper {
public:
ExternalSSTTestEnv(Env* t, bool fail_link)
: EnvWrapper(t), fail_link_(fail_link) {}

Status LinkFile(const std::string& s, const std::string& t) override {
if (fail_link_) {
return Status::NotSupported("Link failed");
}
return target()->LinkFile(s, t);
}

void set_fail_link(bool fail_link) { fail_link_ = fail_link; }

private:
bool fail_link_;
};

class ExternSSTFileLinkFailFallbackTest
: public DBTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
public:
ExternSSTFileLinkFailFallbackTest()
: DBTestBase("/external_sst_file_test"),
test_env_(new ExternalSSTTestEnv(env_, true)) {
sst_files_dir_ = dbname_ + "/sst_files/";
test::DestroyDir(env_, sst_files_dir_);
env_->CreateDir(sst_files_dir_);
options_ = CurrentOptions();
options_.disable_auto_compactions = true;
options_.env = test_env_;
}

void TearDown() override {
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, options_));
delete test_env_;
test_env_ = nullptr;
}

protected:
std::string sst_files_dir_;
Options options_;
ExternalSSTTestEnv* test_env_;
};

class ExternalSSTFileTest
: public DBTestBase,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
Expand Down Expand Up @@ -2014,17 +2062,23 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) {
}

/*
* Test and verify the functionality of ingestion_options.move_files.
* Test and verify the functionality of ingestion_options.move_files and
* ingestion_options.failed_move_fall_back_to_copy
*/
TEST_F(ExternalSSTFileTest, LinkExternalSst) {
Options options = CurrentOptions();
options.disable_auto_compactions = true;
DestroyAndReopen(options);
TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) {
const bool fail_link = std::get<0>(GetParam());
const bool failed_move_fall_back_to_copy = std::get<1>(GetParam());
test_env_->set_fail_link(fail_link);
const EnvOptions env_options;
DestroyAndReopen(options_);
const int kNumKeys = 10000;
IngestExternalFileOptions ifo;
ifo.move_files = true;
ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;

std::string file_path = sst_files_dir_ + "file1.sst";
// Create SstFileWriter for default column family
SstFileWriter sst_file_writer(EnvOptions(), options);
SstFileWriter sst_file_writer(env_options, options_);
ASSERT_OK(sst_file_writer.Open(file_path));
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value"));
Expand All @@ -2033,9 +2087,13 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) {
uint64_t file_size = 0;
ASSERT_OK(env_->GetFileSize(file_path, &file_size));

IngestExternalFileOptions ifo;
ifo.move_files = true;
ASSERT_OK(db_->IngestExternalFile({file_path}, ifo));
bool copyfile = false;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"ExternalSstFileIngestionJob::Prepare:CopyFile",
[&](void* /* arg */) { copyfile = true; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing();

const Status s = db_->IngestExternalFile({file_path}, ifo);

ColumnFamilyHandleImpl* cfh =
static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
Expand All @@ -2049,18 +2107,29 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) {
bytes_copied += stats.bytes_written;
bytes_moved += stats.bytes_moved;
}
// If bytes_moved > 0, it means external sst resides on the same FS
// supporting hard link operation. Therefore,
// 0 bytes should be copied, and the bytes_moved == file_size.
// Otherwise, FS does not support hard link, or external sst file resides on
// a different file system, then the bytes_copied should be equal to
// file_size.
if (bytes_moved > 0) {

if (!fail_link) {
// Link operation succeeds. External SST should be moved.
ASSERT_OK(s);
ASSERT_EQ(0, bytes_copied);
ASSERT_EQ(file_size, bytes_moved);
ASSERT_FALSE(copyfile);
} else {
ASSERT_EQ(file_size, bytes_copied);
// Link operation fails.
ASSERT_EQ(0, bytes_moved);
if (failed_move_fall_back_to_copy) {
ASSERT_OK(s);
// Copy file is true since a failed link falls back to copy file.
ASSERT_TRUE(copyfile);
ASSERT_EQ(file_size, bytes_copied);
} else {
ASSERT_TRUE(s.IsNotSupported());
// Copy file is false since a failed link does not fall back to copy file.
ASSERT_FALSE(copyfile);
ASSERT_EQ(0, bytes_copied);
}
}
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
}

class TestIngestExternalFileListener : public EventListener {
Expand Down Expand Up @@ -2666,6 +2735,12 @@ INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
std::make_tuple(true, false),
std::make_tuple(true, true)));

INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
ExternSSTFileLinkFailFallbackTest,
testing::Values(std::make_tuple(true, false),
std::make_tuple(true, true),
std::make_tuple(false, false)));

} // namespace rocksdb

int main(int argc, char** argv) {
Expand Down
2 changes: 2 additions & 0 deletions include/rocksdb/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -1398,6 +1398,8 @@ struct CompactRangeOptions {
struct IngestExternalFileOptions {
// Can be set to true to move the files instead of copying them.
bool move_files = false;
// If set to true, ingestion falls back to copy when move fails.
bool failed_move_fall_back_to_copy = true;
// If set to false, an ingested file keys could appear in existing snapshots
// that where created before the file was ingested.
bool snapshot_consistency = true;
Expand Down

0 comments on commit 74a334a

Please sign in to comment.