Skip to content

Commit 33c7d4c

Browse files
shligit authored and facebook-github-bot committed
Make writable_file_max_buffer_size dynamic
Summary: `DBOptions::writable_file_max_buffer_size` can now be changed dynamically. Closes facebook#3053. Differential Revision: D6152720. Pulled By: shligit. fbshipit-source-id: aa0c0cfcfae6a54eb17faadb148d904797c68681
1 parent c1be8d8 commit 33c7d4c

15 files changed

+105
-23
lines changed

HISTORY.md

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* Return an error on write if write_options.sync = true and write_options.disableWAL = true, to warn the user of inconsistent options. Previously we would not write to the WAL and would not respect the sync option in this case.
77

88
### New Features
9+
* `DBOptions::writable_file_max_buffer_size` can now be changed dynamically.
910
* `DBOptions::bytes_per_sync` and `DBOptions::wal_bytes_per_sync` can now be changed dynamically. Changing `DBOptions::wal_bytes_per_sync` will flush all memtables and switch to a new WAL file.
1011
* Support dynamic adjustment of rate limit according to demand for background I/O. It can be enabled by passing `true` to the `auto_tuned` parameter in `NewGenericRateLimiter()`. The value passed as `rate_bytes_per_sec` will still be respected as an upper-bound.
1112
* Support dynamically changing `ColumnFamilyOptions::compaction_options_fifo`.

db/c.cc

+5
Original file line numberDiff line numberDiff line change
@@ -2276,6 +2276,11 @@ void rocksdb_options_set_bytes_per_sync(
22762276
opt->rep.bytes_per_sync = v;
22772277
}
22782278

2279+
void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt,
2280+
uint64_t v) {
2281+
opt->rep.writable_file_max_buffer_size = v;
2282+
}
2283+
22792284
void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt,
22802285
unsigned char v) {
22812286
opt->rep.allow_concurrent_memtable_write = v;

db/db_impl.cc

+1
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,7 @@ Status DBImpl::SetDBOptions(
574574
env_options_for_compaction_ = env_->OptimizeForCompactionTableWrite(
575575
env_options_for_compaction_,
576576
immutable_db_options_);
577+
versions_->ChangeEnvOptions(mutable_db_options_);
577578
write_thread_.EnterUnbatched(&w, &mutex_);
578579
if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
579580
Status purge_wal_status = SwitchWAL(&write_context);

db/db_options_test.cc

+52
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,58 @@ TEST_F(DBOptionsTest, SetWalBytesPerSync) {
208208
ASSERT_GT(low_bytes_per_sync, counter);
209209
}
210210

211+
TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
212+
Options options;
213+
options.create_if_missing = true;
214+
options.writable_file_max_buffer_size = 1024 * 1024;
215+
options.level0_file_num_compaction_trigger = 3;
216+
options.max_manifest_file_size = 1;
217+
options.env = env_;
218+
int buffer_size = 1024 * 1024;
219+
Reopen(options);
220+
ASSERT_EQ(buffer_size,
221+
dbfull()->GetDBOptions().writable_file_max_buffer_size);
222+
223+
std::atomic<int> match_cnt(0);
224+
std::atomic<int> unmatch_cnt(0);
225+
rocksdb::SyncPoint::GetInstance()->SetCallBack(
226+
"WritableFileWriter::WritableFileWriter:0", [&](void* arg) {
227+
int value = static_cast<int>(reinterpret_cast<uintptr_t>(arg));
228+
if (value == buffer_size) {
229+
match_cnt++;
230+
} else {
231+
unmatch_cnt++;
232+
}
233+
});
234+
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
235+
int i = 0;
236+
for (; i < 3; i++) {
237+
ASSERT_OK(Put("foo", ToString(i)));
238+
ASSERT_OK(Put("bar", ToString(i)));
239+
Flush();
240+
}
241+
dbfull()->TEST_WaitForCompact();
242+
ASSERT_EQ(unmatch_cnt, 0);
243+
ASSERT_GE(match_cnt, 11);
244+
245+
buffer_size = 512 * 1024;
246+
match_cnt = 0;
247+
unmatch_cnt = 0;
248+
ASSERT_OK(
249+
dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}}));
250+
ASSERT_EQ(buffer_size,
251+
dbfull()->GetDBOptions().writable_file_max_buffer_size);
252+
i = 0;
253+
for (; i < 3; i++) {
254+
ASSERT_OK(Put("foo", ToString(i)));
255+
ASSERT_OK(Put("bar", ToString(i)));
256+
Flush();
257+
}
258+
dbfull()->TEST_WaitForCompact();
259+
ASSERT_EQ(unmatch_cnt, 0);
260+
ASSERT_GE(match_cnt, 11);
261+
}
262+
211263
TEST_F(DBOptionsTest, SetOptionsAndReopen) {
212264
Random rnd(1044);
213265
auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd);

db/version_set.cc

+16-12
Original file line numberDiff line numberDiff line change
@@ -576,7 +576,7 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
576576
auto table_cache = cfd_->table_cache();
577577
auto ioptions = cfd_->ioptions();
578578
Status s = table_cache->GetTableProperties(
579-
vset_->env_options_, cfd_->internal_comparator(), file_meta->fd,
579+
env_options_, cfd_->internal_comparator(), file_meta->fd,
580580
tp, true /* no io */);
581581
if (s.ok()) {
582582
return s;
@@ -599,7 +599,7 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
599599
TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
600600
file_meta->fd.GetPathId());
601601
}
602-
s = ioptions->env->NewRandomAccessFile(file_name, &file, vset_->env_options_);
602+
s = ioptions->env->NewRandomAccessFile(file_name, &file, env_options_);
603603
if (!s.ok()) {
604604
return s;
605605
}
@@ -711,7 +711,7 @@ size_t Version::GetMemoryUsageByTableReaders() {
711711
for (auto& file_level : storage_info_.level_files_brief_) {
712712
for (size_t i = 0; i < file_level.num_files; i++) {
713713
total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
714-
vset_->env_options_, cfd_->internal_comparator(),
714+
env_options_, cfd_->internal_comparator(),
715715
file_level.files[i].fd);
716716
}
717717
}
@@ -936,7 +936,7 @@ VersionStorageInfo::VersionStorageInfo(
936936
}
937937

938938
Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
939-
uint64_t version_number)
939+
const EnvOptions& env_opt, uint64_t version_number)
940940
: env_(vset->env_),
941941
cfd_(column_family_data),
942942
info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log),
@@ -959,6 +959,7 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
959959
next_(this),
960960
prev_(this),
961961
refs_(0),
962+
env_options_(env_opt),
962963
version_number_(version_number) {}
963964

964965
void Version::Get(const ReadOptions& read_options, const LookupKey& k,
@@ -2532,7 +2533,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
25322533
LogAndApplyCFHelper(w.edit_list.front());
25332534
batch_edits.push_back(w.edit_list.front());
25342535
} else {
2535-
v = new Version(column_family_data, this, current_version_number_++);
2536+
v = new Version(column_family_data, this, env_options_,
2537+
current_version_number_++);
25362538
builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data));
25372539
auto* builder = builder_guard->version_builder();
25382540
for (const auto& writer : manifest_writers_) {
@@ -2577,7 +2579,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
25772579
// Unlock during expensive operations. New writes cannot get here
25782580
// because &w is ensuring that all new writes get queued.
25792581
{
2580-
2582+
EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_);
25812583
mu->Unlock();
25822584

25832585
TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest");
@@ -2599,7 +2601,6 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
25992601
ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
26002602
pending_manifest_file_number_);
26012603
unique_ptr<WritableFile> descriptor_file;
2602-
EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_);
26032604
s = NewWritableFile(
26042605
env_, DescriptorFileName(dbname_, pending_manifest_file_number_),
26052606
&descriptor_file, opt_env_opts);
@@ -3064,7 +3065,8 @@ Status VersionSet::Recover(
30643065
false /* prefetch_index_and_filter_in_cache */);
30653066
}
30663067

3067-
Version* v = new Version(cfd, this, current_version_number_++);
3068+
Version* v =
3069+
new Version(cfd, this, env_options_, current_version_number_++);
30683070
builder->SaveTo(v->storage_info());
30693071

30703072
// Install recovered version
@@ -3422,7 +3424,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
34223424
assert(builders_iter != builders.end());
34233425
auto builder = builders_iter->second->version_builder();
34243426

3425-
Version* v = new Version(cfd, this, current_version_number_++);
3427+
Version* v =
3428+
new Version(cfd, this, env_options_, current_version_number_++);
34263429
builder->SaveTo(v->storage_info());
34273430
v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false);
34283431

@@ -3634,7 +3637,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
36343637
// approximate offset of "key" within the table.
36353638
TableReader* table_reader_ptr;
36363639
InternalIterator* iter = v->cfd_->table_cache()->NewIterator(
3637-
ReadOptions(), env_options_, v->cfd_->internal_comparator(), f.fd,
3640+
ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), f.fd,
36383641
nullptr /* range_del_agg */, &table_reader_ptr);
36393642
if (table_reader_ptr != nullptr) {
36403643
result = table_reader_ptr->ApproximateOffsetOf(key);
@@ -3865,15 +3868,16 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
38653868
const ColumnFamilyOptions& cf_options, VersionEdit* edit) {
38663869
assert(edit->is_column_family_add_);
38673870

3868-
Version* dummy_versions = new Version(nullptr, this);
3871+
Version* dummy_versions = new Version(nullptr, this, env_options_);
38693872
// Ref() dummy version once so that later we can call Unref() to delete it
38703873
// by avoiding calling "delete" explicitly (~Version is private)
38713874
dummy_versions->Ref();
38723875
auto new_cfd = column_family_set_->CreateColumnFamily(
38733876
edit->column_family_name_, edit->column_family_, dummy_versions,
38743877
cf_options);
38753878

3876-
Version* v = new Version(new_cfd, this, current_version_number_++);
3879+
Version* v =
3880+
new Version(new_cfd, this, env_options_, current_version_number_++);
38773881

38783882
// Fill level target base information.
38793883
v->storage_info()->CalculateBaseBytes(*new_cfd->ioptions(),

db/version_set.h

+8-2
Original file line numberDiff line numberDiff line change
@@ -663,12 +663,14 @@ class Version {
663663
Version* next_; // Next version in linked list
664664
Version* prev_; // Previous version in linked list
665665
int refs_; // Number of live refs to this version
666+
const EnvOptions env_options_;
666667

667668
// A version number that uniquely represents this version. This is
668669
// used for debugging and logging purposes only.
669670
uint64_t version_number_;
670671

671-
Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
672+
Version(ColumnFamilyData* cfd, VersionSet* vset, const EnvOptions& env_opt,
673+
uint64_t version_number = 0);
672674

673675
~Version();
674676

@@ -844,6 +846,10 @@ class VersionSet {
844846

845847
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
846848
const EnvOptions& env_options() { return env_options_; }
849+
void ChangeEnvOptions(const MutableDBOptions& new_options) {
850+
env_options_.writable_file_max_buffer_size =
851+
new_options.writable_file_max_buffer_size;
852+
}
847853

848854
static uint64_t GetNumLiveVersions(Version* dummy_versions);
849855

@@ -908,7 +914,7 @@ class VersionSet {
908914
std::vector<std::string> obsolete_manifests_;
909915

910916
// env options for all reads and writes except compactions
911-
const EnvOptions& env_options_;
917+
EnvOptions env_options_;
912918

913919
// env options used for compactions. This is a copy of
914920
// env_options_ but with readaheads set to readahead_compactions_.

env/env.cc

+2
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,8 @@ EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options,
333333
const DBOptions& db_options) const {
334334
EnvOptions optimized_env_options(env_options);
335335
optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync;
336+
optimized_env_options.writable_file_max_buffer_size =
337+
db_options.writable_file_max_buffer_size;
336338
return optimized_env_options;
337339
}
338340

env/env_posix.cc

+2
Original file line numberDiff line numberDiff line change
@@ -832,6 +832,8 @@ class PosixEnv : public Env {
832832
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
833833
// test and make this false
834834
optimized.fallocate_with_keep_size = true;
835+
optimized.writable_file_max_buffer_size =
836+
db_options.writable_file_max_buffer_size;
835837
return optimized;
836838
}
837839

include/rocksdb/c.h

+2
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync(
857857
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync(
858858
rocksdb_options_t*, uint64_t);
859859
extern ROCKSDB_LIBRARY_API void
860+
rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t);
861+
extern ROCKSDB_LIBRARY_API void
860862
rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*,
861863
unsigned char);
862864
extern ROCKSDB_LIBRARY_API void

options/db_options.cc

+5-4
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
6464
options.new_table_reader_for_compaction_inputs),
6565
compaction_readahead_size(options.compaction_readahead_size),
6666
random_access_max_buffer_size(options.random_access_max_buffer_size),
67-
writable_file_max_buffer_size(options.writable_file_max_buffer_size),
6867
use_adaptive_mutex(options.use_adaptive_mutex),
6968
listeners(options.listeners),
7069
enable_thread_tracking(options.enable_thread_tracking),
@@ -175,9 +174,6 @@ void ImmutableDBOptions::Dump(Logger* log) const {
175174
ROCKS_LOG_HEADER(
176175
log, " Options.random_access_max_buffer_size: %" ROCKSDB_PRIszt,
177176
random_access_max_buffer_size);
178-
ROCKS_LOG_HEADER(
179-
log, " Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt,
180-
writable_file_max_buffer_size);
181177
ROCKS_LOG_HEADER(log, " Options.use_adaptive_mutex: %d",
182178
use_adaptive_mutex);
183179
ROCKS_LOG_HEADER(log, " Options.rate_limiter: %p",
@@ -230,6 +226,7 @@ MutableDBOptions::MutableDBOptions()
230226
base_background_compactions(-1),
231227
max_background_compactions(-1),
232228
avoid_flush_during_shutdown(false),
229+
writable_file_max_buffer_size(1024 * 1024),
233230
delayed_write_rate(2 * 1024U * 1024U),
234231
max_total_wal_size(0),
235232
delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000),
@@ -243,6 +240,7 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options)
243240
base_background_compactions(options.base_background_compactions),
244241
max_background_compactions(options.max_background_compactions),
245242
avoid_flush_during_shutdown(options.avoid_flush_during_shutdown),
243+
writable_file_max_buffer_size(options.writable_file_max_buffer_size),
246244
delayed_write_rate(options.delayed_write_rate),
247245
max_total_wal_size(options.max_total_wal_size),
248246
delete_obsolete_files_period_micros(
@@ -259,6 +257,9 @@ void MutableDBOptions::Dump(Logger* log) const {
259257
max_background_compactions);
260258
ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_shutdown: %d",
261259
avoid_flush_during_shutdown);
260+
ROCKS_LOG_HEADER(
261+
log, " Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt,
262+
writable_file_max_buffer_size);
262263
ROCKS_LOG_HEADER(log, " Options.delayed_write_rate : %" PRIu64,
263264
delayed_write_rate);
264265
ROCKS_LOG_HEADER(log, " Options.max_total_wal_size: %" PRIu64,

options/db_options.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ struct ImmutableDBOptions {
5757
bool new_table_reader_for_compaction_inputs;
5858
size_t compaction_readahead_size;
5959
size_t random_access_max_buffer_size;
60-
size_t writable_file_max_buffer_size;
6160
bool use_adaptive_mutex;
6261
std::vector<std::shared_ptr<EventListener>> listeners;
6362
bool enable_thread_tracking;
@@ -93,6 +92,7 @@ struct MutableDBOptions {
9392
int base_background_compactions;
9493
int max_background_compactions;
9594
bool avoid_flush_during_shutdown;
95+
size_t writable_file_max_buffer_size;
9696
uint64_t delayed_write_rate;
9797
uint64_t max_total_wal_size;
9898
uint64_t delete_obsolete_files_period_micros;

options/options_helper.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
9191
options.random_access_max_buffer_size =
9292
immutable_db_options.random_access_max_buffer_size;
9393
options.writable_file_max_buffer_size =
94-
immutable_db_options.writable_file_max_buffer_size;
94+
mutable_db_options.writable_file_max_buffer_size;
9595
options.use_adaptive_mutex = immutable_db_options.use_adaptive_mutex;
9696
options.listeners = immutable_db_options.listeners;
9797
options.enable_thread_tracking = immutable_db_options.enable_thread_tracking;

options/options_helper.h

+4-3
Original file line numberDiff line numberDiff line change
@@ -220,9 +220,6 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
220220
{"random_access_max_buffer_size",
221221
{offsetof(struct DBOptions, random_access_max_buffer_size),
222222
OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
223-
{"writable_file_max_buffer_size",
224-
{offsetof(struct DBOptions, writable_file_max_buffer_size),
225-
OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
226223
{"use_adaptive_mutex",
227224
{offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean,
228225
OptionVerificationType::kNormal, false, 0}},
@@ -351,6 +348,10 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
351348
{offsetof(struct DBOptions, avoid_flush_during_shutdown),
352349
OptionType::kBoolean, OptionVerificationType::kNormal, true,
353350
offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}},
351+
{"writable_file_max_buffer_size",
352+
{offsetof(struct DBOptions, writable_file_max_buffer_size),
353+
OptionType::kSizeT, OptionVerificationType::kNormal, true,
354+
offsetof(struct MutableDBOptions, writable_file_max_buffer_size)}},
354355
{"allow_ingest_behind",
355356
{offsetof(struct DBOptions, allow_ingest_behind), OptionType::kBoolean,
356357
OptionVerificationType::kNormal, false,

port/win/env_win.cc

+2
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,8 @@ EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options,
786786
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
787787
// test and make this false
788788
optimized.fallocate_with_keep_size = true;
789+
optimized.writable_file_max_buffer_size =
790+
db_options.writable_file_max_buffer_size;
789791
return optimized;
790792
}
791793

util/file_reader_writer.h

+3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "rocksdb/env.h"
1414
#include "rocksdb/rate_limiter.h"
1515
#include "util/aligned_buffer.h"
16+
#include "util/sync_point.h"
1617

1718
namespace rocksdb {
1819

@@ -151,6 +152,8 @@ class WritableFileWriter {
151152
bytes_per_sync_(options.bytes_per_sync),
152153
rate_limiter_(options.rate_limiter),
153154
stats_(stats) {
155+
TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0",
156+
reinterpret_cast<void*>(max_buffer_size_));
154157
buf_.Alignment(writable_file_->GetRequiredBufferAlignment());
155158
buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_));
156159
}

0 commit comments

Comments
 (0)