Skip to content

Commit faa0f97

Browse files
jay-zhuang authored and facebook-github-bot committed
Tiered compaction: integrate Seqno time mapping with per key placement (facebook#10370)
Summary: Use the sequence-number-to-time mapping to decide whether a key is hot during compaction, and place it in the corresponding level. Note: the feature is not complete; level compaction will run indefinitely until all penultimate-level data is cold and small enough not to trigger compaction. Pull Request resolved: facebook#10370 Test Plan: CI * Run basic db_bench for universal compaction manually Reviewed By: siying Differential Revision: D37892338 Pulled By: jay-zhuang fbshipit-source-id: 792bbd91b1ccc2f62b5d14c53118434bcaac4bbe
1 parent 7506c1a commit faa0f97

17 files changed

+330
-80
lines changed

db/compaction/compaction.cc

+9-4
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,11 @@ bool Compaction::IsTrivialMove() const {
440440
}
441441
}
442442

443+
// PerKeyPlacement compaction should never be trivial move.
444+
if (SupportsPerKeyPlacement()) {
445+
return false;
446+
}
447+
443448
return true;
444449
}
445450

@@ -741,10 +746,10 @@ int Compaction::EvaluatePenultimateLevel(
741746
return kInvalidLevel;
742747
}
743748

744-
// TODO: will add public like `options.preclude_last_level_data_seconds` for
745-
// per_key_placement feature, will check that option here. Currently, only
746-
// set by unittest
747-
bool supports_per_key_placement = false;
749+
bool supports_per_key_placement =
750+
immutable_options.preclude_last_level_data_seconds > 0;
751+
752+
// it could be overridden by unittest
748753
TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
749754
&supports_per_key_placement);
750755
if (!supports_per_key_placement) {

db/compaction/compaction_iterator.cc

+10-10
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ CompactionIterator::CompactionIterator(
3434
const std::atomic<bool>* shutting_down,
3535
const std::shared_ptr<Logger> info_log,
3636
const std::string* full_history_ts_low,
37-
const SequenceNumber max_seqno_allow_zero_out)
37+
const SequenceNumber penultimate_level_cutoff_seqno)
3838
: CompactionIterator(
3939
input, cmp, merge_helper, last_sequence, snapshots,
4040
earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
@@ -44,7 +44,7 @@ CompactionIterator::CompactionIterator(
4444
std::unique_ptr<CompactionProxy>(
4545
compaction ? new RealCompaction(compaction) : nullptr),
4646
compaction_filter, shutting_down, info_log, full_history_ts_low,
47-
max_seqno_allow_zero_out) {}
47+
penultimate_level_cutoff_seqno) {}
4848

4949
CompactionIterator::CompactionIterator(
5050
InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
@@ -61,7 +61,7 @@ CompactionIterator::CompactionIterator(
6161
const std::atomic<bool>* shutting_down,
6262
const std::shared_ptr<Logger> info_log,
6363
const std::string* full_history_ts_low,
64-
const SequenceNumber max_seqno_allow_zero_out)
64+
const SequenceNumber penultimate_level_cutoff_seqno)
6565
: input_(input, cmp,
6666
!compaction || compaction->DoesInputReferenceBlobFiles()),
6767
cmp_(cmp),
@@ -96,7 +96,7 @@ CompactionIterator::CompactionIterator(
9696
current_key_committed_(false),
9797
cmp_with_history_ts_low_(0),
9898
level_(compaction_ == nullptr ? 0 : compaction_->level()),
99-
max_seqno_allow_zero_out_(max_seqno_allow_zero_out) {
99+
penultimate_level_cutoff_seqno_(penultimate_level_cutoff_seqno) {
100100
assert(snapshots_ != nullptr);
101101
bottommost_level_ = compaction_ == nullptr
102102
? false
@@ -1081,18 +1081,18 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
10811081

10821082
void CompactionIterator::DecideOutputLevel() {
10831083
#ifndef NDEBUG
1084-
// TODO: will be set by sequence number or key range, for now, it will only be
1085-
// set by unittest
1084+
// Could be overridden by unittest
10861085
PerKeyPlacementContext context(level_, ikey_.user_key, value_,
10871086
ikey_.sequence);
10881087
TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
10891088
&context);
10901089
output_to_penultimate_level_ = context.output_to_penultimate_level;
10911090
#endif /* !NDEBUG */
10921091

1093-
// if the key is within the earliest snapshot, it has to output to the
1094-
// penultimate level.
1095-
if (ikey_.sequence > earliest_snapshot_) {
1092+
// if the key is newer than the cutoff sequence or within the earliest
1093+
// snapshot, it should output to the penultimate level.
1094+
if (ikey_.sequence > penultimate_level_cutoff_seqno_ ||
1095+
ikey_.sequence > earliest_snapshot_) {
10961096
output_to_penultimate_level_ = true;
10971097
}
10981098

@@ -1153,7 +1153,7 @@ void CompactionIterator::PrepareOutput() {
11531153
DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
11541154
ikey_.type != kTypeMerge && current_key_committed_ &&
11551155
!output_to_penultimate_level_ &&
1156-
ikey_.sequence < max_seqno_allow_zero_out_) {
1156+
ikey_.sequence < penultimate_level_cutoff_seqno_) {
11571157
if (ikey_.type == kTypeDeletion ||
11581158
(ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
11591159
ROCKS_LOG_FATAL(

db/compaction/compaction_iterator.h

+5-6
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ class CompactionIterator {
196196
const std::atomic<bool>* shutting_down = nullptr,
197197
const std::shared_ptr<Logger> info_log = nullptr,
198198
const std::string* full_history_ts_low = nullptr,
199-
const SequenceNumber max_seqno_allow_zero_out = kMaxSequenceNumber);
199+
const SequenceNumber penultimate_level_cutoff_seqno = kMaxSequenceNumber);
200200

201201
// Constructor with custom CompactionProxy, used for tests.
202202
CompactionIterator(
@@ -214,7 +214,7 @@ class CompactionIterator {
214214
const std::atomic<bool>* shutting_down = nullptr,
215215
const std::shared_ptr<Logger> info_log = nullptr,
216216
const std::string* full_history_ts_low = nullptr,
217-
const SequenceNumber max_seqno_allow_zero_out = kMaxSequenceNumber);
217+
const SequenceNumber penultimate_level_cutoff_seqno = kMaxSequenceNumber);
218218

219219
~CompactionIterator();
220220

@@ -444,10 +444,9 @@ class CompactionIterator {
444444
// output to.
445445
bool output_to_penultimate_level_{false};
446446

447-
// any key later than this sequence number, need to keep the sequence number
448-
// and not zeroed out. The sequence number is kept to track it's approximate
449-
// time.
450-
const SequenceNumber max_seqno_allow_zero_out_ = kMaxSequenceNumber;
447+
// any key later than this sequence number should have
448+
// output_to_penultimate_level_ set to true
449+
const SequenceNumber penultimate_level_cutoff_seqno_ = kMaxSequenceNumber;
451450

452451
void AdvanceInputIter() { input_.Next(); }
453452

db/compaction/compaction_job.cc

+4-3
Original file line numberDiff line numberDiff line change
@@ -282,9 +282,9 @@ void CompactionJob::Prepare() {
282282
ROCKS_LOG_WARN(db_options_.info_log,
283283
"Failed to get current time in compaction: Status: %s",
284284
status.ToString().c_str());
285-
max_seqno_allow_zero_out_ = 0;
285+
penultimate_level_cutoff_seqno_ = 0;
286286
} else {
287-
max_seqno_allow_zero_out_ =
287+
penultimate_level_cutoff_seqno_ =
288288
seqno_time_mapping_.TruncateOldEntries(_current_time);
289289
}
290290
}
@@ -1026,7 +1026,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
10261026
blob_file_builder.get(), db_options_.allow_data_in_errors,
10271027
db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
10281028
sub_compact->compaction, compaction_filter, shutting_down_,
1029-
db_options_.info_log, full_history_ts_low, max_seqno_allow_zero_out_);
1029+
db_options_.info_log, full_history_ts_low,
1030+
penultimate_level_cutoff_seqno_);
10301031
c_iter->SeekToFirst();
10311032

10321033
// Assign range delete aggregator to the target output level, which makes sure

db/compaction/compaction_job.h

+6-3
Original file line numberDiff line numberDiff line change
@@ -304,9 +304,12 @@ class CompactionJob {
304304
// it also collects the smallest_seqno -> oldest_ancester_time from the SST.
305305
SeqnoToTimeMapping seqno_time_mapping_;
306306

307-
// If a sequence number larger than max_seqno_allow_zero_out_, it won't be
308-
// zeroed out. The sequence number is kept to get approximate time of the key.
309-
SequenceNumber max_seqno_allow_zero_out_ = kMaxSequenceNumber;
307+
// cutoff sequence number for penultimate level, only set when
308+
// per_key_placement feature is enabled.
309+
// If a key has a sequence number larger than penultimate_level_cutoff_seqno_,
310+
// it will be placed on the penultimate_level and its sequence number won't be
311+
// zeroed out.
312+
SequenceNumber penultimate_level_cutoff_seqno_ = kMaxSequenceNumber;
310313

311314
// Get table file name in where it's outputting to, which should also be in
312315
// `output_directory_`.

db/compaction/tiered_compaction_test.cc

+13-17
Original file line numberDiff line numberDiff line change
@@ -53,26 +53,17 @@ class TieredCompactionTest : public DBTestBase {
5353
InternalStats::CompactionOutputsStats kBasicPerLevelStats;
5454
InternalStats::CompactionStats kBasicFlushStats;
5555

56+
std::atomic_bool enable_per_key_placement = true;
57+
5658
void SetUp() override {
5759
SyncPoint::GetInstance()->SetCallBack(
5860
"Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
5961
auto supports_per_key_placement = static_cast<bool*>(arg);
60-
*supports_per_key_placement = true;
62+
*supports_per_key_placement = enable_per_key_placement;
6163
});
6264
SyncPoint::GetInstance()->EnableProcessing();
6365
}
6466

65-
#ifndef ROCKSDB_LITE
66-
uint64_t GetSstSizeHelper(Temperature temperature) {
67-
std::string prop;
68-
EXPECT_TRUE(dbfull()->GetProperty(
69-
DB::Properties::kLiveSstFilesSizeAtTemperature +
70-
std::to_string(static_cast<uint8_t>(temperature)),
71-
&prop));
72-
return static_cast<uint64_t>(std::atoi(prop.c_str()));
73-
}
74-
#endif // ROCKSDB_LITE
75-
7667
const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
7768
VersionSet* const versions = dbfull()->GetVersionSet();
7869
assert(versions);
@@ -1054,12 +1045,14 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
10541045
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
10551046
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
10561047

1048+
latest_cold_seq = seq_history[2];
1049+
10571050
MoveFilesToLevel(kLastLevel);
10581051

10591052
// move forward the cold_seq again with range delete, take a snapshot to keep
10601053
// the range dels in bottommost
10611054
auto snap = db_->GetSnapshot();
1062-
latest_cold_seq = seq_history[2];
1055+
10631056
std::string start = Key(25), end = Key(35);
10641057
ASSERT_OK(
10651058
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
@@ -1104,9 +1097,12 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
11041097

11051098
db_->ReleaseSnapshot(snap);
11061099

1100+
// TODO: it should push the data to last level, but penultimate level file is
1101+
// already bottommost, it's a conflict between bottommost_temperature and
1102+
// tiered compaction which only applies to last level compaction.
11071103
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
1108-
ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
1109-
ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
1104+
ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
1105+
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
11101106
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
11111107

11121108
// 3 range dels dropped, the first one is double counted as expected, which is
@@ -1123,8 +1119,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
11231119
// input range
11241120
latest_cold_seq = seq_history[1];
11251121
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
1126-
ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
1127-
ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
1122+
ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
1123+
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
11281124
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
11291125
}
11301126

db/db_compaction_test.cc

-11
Original file line numberDiff line numberDiff line change
@@ -78,17 +78,6 @@ class DBCompactionTest : public DBTestBase {
7878
: DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
7979

8080
protected:
81-
#ifndef ROCKSDB_LITE
82-
uint64_t GetSstSizeHelper(Temperature temperature) {
83-
std::string prop;
84-
EXPECT_TRUE(dbfull()->GetProperty(
85-
DB::Properties::kLiveSstFilesSizeAtTemperature +
86-
std::to_string(static_cast<uint8_t>(temperature)),
87-
&prop));
88-
return static_cast<uint64_t>(std::atoi(prop.c_str()));
89-
}
90-
#endif // ROCKSDB_LITE
91-
9281
/*
9382
* Verifies compaction stats of cfd are valid.
9483
*

db/db_impl/db_impl.h

+2
Original file line numberDiff line numberDiff line change
@@ -2594,6 +2594,8 @@ class DBImpl : public DB {
25942594
// Pointer to WriteBufferManager stalling interface.
25952595
std::unique_ptr<StallInterface> wbm_stall_;
25962596

2597+
// seqno_time_mapping_ stores the sequence number to time mapping, it's not
2598+
// thread safe, both read and write need the db mutex to be held.
25972599
SeqnoToTimeMapping seqno_time_mapping_;
25982600
};
25992601

db/db_test2.cc

-12
Original file line numberDiff line numberDiff line change
@@ -33,18 +33,6 @@ namespace ROCKSDB_NAMESPACE {
3333
class DBTest2 : public DBTestBase {
3434
public:
3535
DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
36-
37-
protected:
38-
#ifndef ROCKSDB_LITE
39-
uint64_t GetSstSizeHelper(Temperature temperature) {
40-
std::string prop;
41-
EXPECT_TRUE(dbfull()->GetProperty(
42-
DB::Properties::kLiveSstFilesSizeAtTemperature +
43-
std::to_string(static_cast<uint8_t>(temperature)),
44-
&prop));
45-
return static_cast<uint64_t>(std::atoi(prop.c_str()));
46-
}
47-
#endif // ROCKSDB_LITE
4836
};
4937

5038
#ifndef ROCKSDB_LITE

db/db_test_util.cc

+9
Original file line numberDiff line numberDiff line change
@@ -1676,6 +1676,15 @@ uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
16761676
}
16771677
return result;
16781678
}
1679+
1680+
uint64_t DBTestBase::GetSstSizeHelper(Temperature temperature) {
1681+
std::string prop;
1682+
EXPECT_TRUE(dbfull()->GetProperty(
1683+
DB::Properties::kLiveSstFilesSizeAtTemperature +
1684+
std::to_string(static_cast<uint8_t>(temperature)),
1685+
&prop));
1686+
return static_cast<uint64_t>(std::atoi(prop.c_str()));
1687+
}
16791688
#endif // ROCKSDB_LITE
16801689

16811690
void VerifySstUniqueIds(const TablePropertiesCollection& props) {

db/db_test_util.h

+2
Original file line numberDiff line numberDiff line change
@@ -1345,6 +1345,8 @@ class DBTestBase : public testing::Test {
13451345
#ifndef ROCKSDB_LITE
13461346
uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
13471347
std::string column_family_name);
1348+
1349+
uint64_t GetSstSizeHelper(Temperature temperature);
13481350
#endif // ROCKSDB_LITE
13491351

13501352
uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {

db/event_helpers.cc

+13-2
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,19 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
148148
<< table_properties.fast_compression_estimated_data_size
149149
<< "db_id" << table_properties.db_id << "db_session_id"
150150
<< table_properties.db_session_id << "orig_file_number"
151-
<< table_properties.orig_file_number << "seqno_to_time_mapping"
152-
<< table_properties.seqno_to_time_mapping;
151+
<< table_properties.orig_file_number << "seqno_to_time_mapping";
152+
153+
if (table_properties.seqno_to_time_mapping.empty()) {
154+
jwriter << "N/A";
155+
} else {
156+
SeqnoToTimeMapping tmp;
157+
Status status = tmp.Add(table_properties.seqno_to_time_mapping);
158+
if (status.ok()) {
159+
jwriter << tmp.ToHumanString();
160+
} else {
161+
jwriter << "Invalid";
162+
}
163+
}
153164

154165
// user collected properties
155166
for (const auto& prop : table_properties.readable_properties) {

db/external_sst_file_basic_test.cc

-10
Original file line numberDiff line numberDiff line change
@@ -187,16 +187,6 @@ class ExternalSSTFileBasicTest
187187
std::string sst_files_dir_;
188188
std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_;
189189
bool random_rwfile_supported_;
190-
#ifndef ROCKSDB_LITE
191-
uint64_t GetSstSizeHelper(Temperature temperature) {
192-
std::string prop;
193-
EXPECT_TRUE(dbfull()->GetProperty(
194-
DB::Properties::kLiveSstFilesSizeAtTemperature +
195-
std::to_string(static_cast<uint8_t>(temperature)),
196-
&prop));
197-
return static_cast<uint64_t>(std::atoi(prop.c_str()));
198-
}
199-
#endif // ROCKSDB_LITE
200190
};
201191

202192
TEST_F(ExternalSSTFileBasicTest, Basic) {

db/flush_job.cc

+2
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,8 @@ Status FlushJob::WriteLevel0Table() {
818818

819819
SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber();
820820
if (!db_impl_seqno_time_mapping_.Empty()) {
821+
// make a local copy for use while not holding the db_mutex, as the
822+
// seqno_time_mapping from db_impl is not thread safe.
821823
seqno_to_time_mapping_ = db_impl_seqno_time_mapping_.Copy(smallest_seqno);
822824
}
823825

0 commit comments

Comments
 (0)