Skip to content

Commit

Permalink
Optionally wait on bytes_per_sync to smooth I/O (facebook#5183)
Browse files Browse the repository at this point in the history
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.

My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.

Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.

There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).

The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: facebook#5183

Differential Revision: D14953553

Pulled By: siying

fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
  • Loading branch information
ajkr authored and facebook-github-bot committed Apr 22, 2019
1 parent df38c1c commit 8272a6d
Show file tree
Hide file tree
Showing 15 changed files with 189 additions and 46 deletions.
3 changes: 3 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Rocksdb Change Log
## Unreleased
### New Features
* Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`.

## Unreleased
### New Features
Expand Down
1 change: 1 addition & 0 deletions env/env.cc
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,7 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
env_options->writable_file_max_buffer_size =
options.writable_file_max_buffer_size;
env_options->allow_fallocate = options.allow_fallocate;
env_options->strict_bytes_per_sync = options.strict_bytes_per_sync;
}

}
Expand Down
12 changes: 8 additions & 4 deletions env/env_hdfs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,12 @@ class HdfsWritableFile: public WritableFile {
hdfsFile hfile_;

public:
HdfsWritableFile(hdfsFS fileSys, const std::string& fname)
: fileSys_(fileSys), filename_(fname) , hfile_(nullptr) {
HdfsWritableFile(hdfsFS fileSys, const std::string& fname,
const EnvOptions& options)
: WritableFile(options),
fileSys_(fileSys),
filename_(fname),
hfile_(nullptr) {
ROCKS_LOG_DEBUG(mylog, "[hdfs] HdfsWritableFile opening %s\n",
filename_.c_str());
hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0);
Expand Down Expand Up @@ -419,7 +423,7 @@ Status HdfsEnv::NewWritableFile(const std::string& fname,
const EnvOptions& /*options*/) {
result->reset();
Status s;
HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options);
if (f == nullptr || !f->isValid()) {
delete f;
*result = nullptr;
Expand Down Expand Up @@ -586,7 +590,7 @@ Status HdfsEnv::UnlockFile(FileLock* /*lock*/) { return Status::OK(); }

Status HdfsEnv::NewLogger(const std::string& fname,
std::shared_ptr<Logger>* result) {
HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options);
if (f == nullptr || !f->isValid()) {
delete f;
*result = nullptr;
Expand Down
75 changes: 65 additions & 10 deletions env/io_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
}

namespace {

size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
#ifdef OS_LINUX
struct stat buf;
Expand Down Expand Up @@ -125,7 +126,45 @@ size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
#endif
return kDefaultPageSize;
}
} // namespace

#ifdef ROCKSDB_RANGESYNC_PRESENT

#if !defined(ZFS_SUPER_MAGIC)
// The magic number for ZFS was not exposed until recently. It should be fixed
// forever so we can just copy the magic number here.
#define ZFS_SUPER_MAGIC 0x2fc12fc1
#endif

bool IsSyncFileRangeSupported(int __attribute__((__unused__)) fd) {
// `fstatfs` is only available on Linux, but so is `sync_file_range`, so
// `defined(ROCKSDB_RANGESYNC_PRESENT)` should imply `defined(OS_LINUX)`.
struct statfs buf;
int ret = fstatfs(fd, &buf);
assert(ret == 0);
if (ret != 0) {
// We don't know whether the filesystem properly supports `sync_file_range`.
// Even if it doesn't, we don't know of any safety issue with trying to call
// it anyways. So, to preserve the same behavior as before this `fstatfs`
// check was introduced, we assume `sync_file_range` is usable.
return true;
}
if (buf.f_type == ZFS_SUPER_MAGIC) {
// Testing on ZFS showed the writeback did not happen asynchronously when
// `sync_file_range` was called, even though it returned success. Avoid it
// and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
// even though this'll incur extra I/O for metadata.
return false;
}
// No known problems with other filesystems' implementations of
// `sync_file_range`, so allow them to use it.
return true;
}

#undef ZFS_SUPER_MAGIC

#endif // ROCKSDB_RANGESYNC_PRESENT

} // anonymous namespace

/*
* DirectIOHelper
Expand Down Expand Up @@ -734,7 +773,8 @@ Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
*/
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
const EnvOptions& options)
: filename_(fname),
: WritableFile(options),
filename_(fname),
use_direct_io_(options.use_direct_writes),
fd_(fd),
filesize_(0),
Expand All @@ -743,6 +783,9 @@ PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
allow_fallocate_ = options.allow_fallocate;
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
#ifdef ROCKSDB_RANGESYNC_PRESENT
sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
#endif // ROCKSDB_RANGESYNC_PRESENT
assert(!options.use_mmap_writes);
}

Expand Down Expand Up @@ -945,20 +988,32 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) {
}
#endif

#ifdef ROCKSDB_RANGESYNC_PRESENT
Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) {
#ifdef ROCKSDB_RANGESYNC_PRESENT
assert(offset <= std::numeric_limits<off_t>::max());
assert(nbytes <= std::numeric_limits<off_t>::max());
if (sync_file_range(fd_, static_cast<off_t>(offset),
static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE) == 0) {
if (sync_file_range_supported_) {
int ret;
if (strict_bytes_per_sync_) {
// Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
// that spans all bytes written so far tells `sync_file_range` to wait for
// any outstanding writeback requests to finish before issuing a new one.
ret =
sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
} else {
ret = sync_file_range(fd_, static_cast<off_t>(offset),
static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
}
if (ret != 0) {
return IOError("While sync_file_range returned " + ToString(ret),
filename_, errno);
}
return Status::OK();
} else {
return IOError("While sync_file_range offset " + ToString(offset) +
" bytes " + ToString(nbytes),
filename_, errno);
}
#endif // ROCKSDB_RANGESYNC_PRESENT
return WritableFile::RangeSync(offset, nbytes);
}
#endif

#ifdef OS_LINUX
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
Expand Down
7 changes: 5 additions & 2 deletions env/io_posix.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ class PosixWritableFile : public WritableFile {
bool allow_fallocate_;
bool fallocate_with_keep_size_;
#endif
#ifdef ROCKSDB_RANGESYNC_PRESENT
// Even if the syscall is present, the filesystem may still not properly
// support it, so we need to do a dynamic check too.
bool sync_file_range_supported_;
#endif // ROCKSDB_RANGESYNC_PRESENT

public:
explicit PosixWritableFile(const std::string& fname, int fd,
Expand All @@ -144,9 +149,7 @@ class PosixWritableFile : public WritableFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(uint64_t offset, uint64_t len) override;
#endif
#ifdef ROCKSDB_RANGESYNC_PRESENT
virtual Status RangeSync(uint64_t offset, uint64_t nbytes) override;
#endif
#ifdef OS_LINUX
virtual size_t GetUniqueId(char* id, size_t max_size) const override;
#endif
Expand Down
32 changes: 31 additions & 1 deletion include/rocksdb/env.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,23 @@ struct EnvOptions {
// Default: 0
uint64_t bytes_per_sync = 0;

// When true, guarantees the file has at most `bytes_per_sync` bytes submitted
// for writeback at any given time.
//
// - If `sync_file_range` is supported it achieves this by waiting for any
// prior `sync_file_range`s to finish before proceeding. In this way,
// processing (compression, etc.) can proceed uninhibited in the gap
// between `sync_file_range`s, and we block only when I/O falls behind.
// - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
// always blocks, thus preventing the interleaving of I/O and processing.
//
// Note: Enabling this option does not provide any additional persistence
// guarantees, as it may use `sync_file_range`, which does not write out
// metadata.
//
// Default: false
bool strict_bytes_per_sync = false;

// If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
// means that file size won't change as part of preallocation.
// If false, preallocation will also change the file size. This option will
Expand Down Expand Up @@ -629,7 +646,16 @@ class WritableFile {
: last_preallocated_block_(0),
preallocation_block_size_(0),
io_priority_(Env::IO_TOTAL),
write_hint_(Env::WLTH_NOT_SET) {}
write_hint_(Env::WLTH_NOT_SET),
strict_bytes_per_sync_(false) {}

explicit WritableFile(const EnvOptions& options)
: last_preallocated_block_(0),
preallocation_block_size_(0),
io_priority_(Env::IO_TOTAL),
write_hint_(Env::WLTH_NOT_SET),
strict_bytes_per_sync_(options.strict_bytes_per_sync) {}

virtual ~WritableFile();

// Append data to the end of the file
Expand Down Expand Up @@ -744,6 +770,9 @@ class WritableFile {
// without waiting for completion.
// Default implementation does nothing.
virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) {
if (strict_bytes_per_sync_) {
return Sync();
}
return Status::OK();
}

Expand Down Expand Up @@ -792,6 +821,7 @@ class WritableFile {
protected:
Env::IOPriority io_priority_;
Env::WriteLifeTimeHint write_hint_;
const bool strict_bytes_per_sync_;
};

// A file abstraction for random reading and writing.
Expand Down
21 changes: 21 additions & 0 deletions include/rocksdb/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,27 @@ struct DBOptions {
// Dynamically changeable through SetDBOptions() API.
uint64_t wal_bytes_per_sync = 0;

// When true, guarantees WAL files have at most `wal_bytes_per_sync`
// bytes submitted for writeback at any given time, and SST files have at most
// `bytes_per_sync` bytes pending writeback at any given time. This can be
// used to handle cases where processing speed exceeds I/O speed during file
// generation, which can lead to a huge sync when the file is finished, even
// with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
//
// - If `sync_file_range` is supported it achieves this by waiting for any
// prior `sync_file_range`s to finish before proceeding. In this way,
// processing (compression, etc.) can proceed uninhibited in the gap
// between `sync_file_range`s, and we block only when I/O falls behind.
// - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
// always blocks, thus preventing the interleaving of I/O and processing.
//
// Note: Enabling this option does not provide any additional persistence
// guarantees, as it may use `sync_file_range`, which does not write out
// metadata.
//
// Default: false
bool strict_bytes_per_sync = false;

// A vector of EventListeners whose callback functions will be called
// when specific RocksDB event happens.
std::vector<std::shared_ptr<EventListener>> listeners;
Expand Down
5 changes: 5 additions & 0 deletions options/db_options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ MutableDBOptions::MutableDBOptions()
max_open_files(-1),
bytes_per_sync(0),
wal_bytes_per_sync(0),
strict_bytes_per_sync(false),
compaction_readahead_size(0) {}

MutableDBOptions::MutableDBOptions(const DBOptions& options)
Expand All @@ -258,6 +259,7 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options)
max_open_files(options.max_open_files),
bytes_per_sync(options.bytes_per_sync),
wal_bytes_per_sync(options.wal_bytes_per_sync),
strict_bytes_per_sync(options.strict_bytes_per_sync),
compaction_readahead_size(options.compaction_readahead_size) {}

void MutableDBOptions::Dump(Logger* log) const {
Expand Down Expand Up @@ -293,6 +295,9 @@ void MutableDBOptions::Dump(Logger* log) const {
ROCKS_LOG_HEADER(log,
" Options.wal_bytes_per_sync: %" PRIu64,
wal_bytes_per_sync);
ROCKS_LOG_HEADER(log,
" Options.strict_bytes_per_sync: %d",
strict_bytes_per_sync);
ROCKS_LOG_HEADER(log,
" Options.compaction_readahead_size: %" ROCKSDB_PRIszt,
compaction_readahead_size);
Expand Down
1 change: 1 addition & 0 deletions options/db_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ struct MutableDBOptions {
int max_open_files;
uint64_t bytes_per_sync;
uint64_t wal_bytes_per_sync;
bool strict_bytes_per_sync;
size_t compaction_readahead_size;
};

Expand Down
5 changes: 5 additions & 0 deletions options/options_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
mutable_db_options.max_background_compactions;
options.bytes_per_sync = mutable_db_options.bytes_per_sync;
options.wal_bytes_per_sync = mutable_db_options.wal_bytes_per_sync;
options.strict_bytes_per_sync = mutable_db_options.strict_bytes_per_sync;
options.max_subcompactions = immutable_db_options.max_subcompactions;
options.max_background_flushes = immutable_db_options.max_background_flushes;
options.max_log_file_size = immutable_db_options.max_log_file_size;
Expand Down Expand Up @@ -1551,6 +1552,10 @@ std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct DBOptions, wal_bytes_per_sync), OptionType::kUInt64T,
OptionVerificationType::kNormal, true,
offsetof(struct MutableDBOptions, wal_bytes_per_sync)}},
{"strict_bytes_per_sync",
{offsetof(struct DBOptions, strict_bytes_per_sync), OptionType::kBoolean,
OptionVerificationType::kNormal, true,
offsetof(struct MutableDBOptions, strict_bytes_per_sync)}},
{"stats_dump_period_sec",
{offsetof(struct DBOptions, stats_dump_period_sec), OptionType::kUInt,
OptionVerificationType::kNormal, true,
Expand Down
1 change: 1 addition & 0 deletions options/options_settable_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"paranoid_checks=true;"
"is_fd_close_on_exec=false;"
"bytes_per_sync=4295013613;"
"strict_bytes_per_sync=true;"
"enable_thread_tracking=false;"
"recycle_log_file_num=0;"
"create_missing_column_families=true;"
Expand Down
2 changes: 2 additions & 0 deletions options/options_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
{"writable_file_max_buffer_size", "314159"},
{"bytes_per_sync", "47"},
{"wal_bytes_per_sync", "48"},
{"strict_bytes_per_sync", "true"},
};

ColumnFamilyOptions base_cf_opt;
Expand Down Expand Up @@ -277,6 +278,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
ASSERT_EQ(new_db_opt.writable_file_max_buffer_size, 314159);
ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true);

db_options_map["max_open_files"] = "hello";
ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt));
Expand Down
Loading

0 comments on commit 8272a6d

Please sign in to comment.