Skip to content

Commit ad0c374

Browse files
author
Lei Jin
committed
cache SuperVersion in thread local storage to avoid mutex lock
Summary: as title Test Plan: asan_check will post results later Reviewers: haobo, igor, dhruba, sdong Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D16257
1 parent e41c060 commit ad0c374

File tree

8 files changed

+173
-45
lines changed

8 files changed

+173
-45
lines changed

db/db_impl.cc

+113-33
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "db/version_set.h"
3939
#include "db/write_batch_internal.h"
4040
#include "port/port.h"
41+
#include "port/likely.h"
4142
#include "rocksdb/compaction_filter.h"
4243
#include "rocksdb/db.h"
4344
#include "rocksdb/env.h"
@@ -270,6 +271,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
270271
logfile_number_(0),
271272
super_version_(nullptr),
272273
super_version_number_(0),
274+
local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
273275
tmp_batch_(),
274276
bg_compaction_scheduled_(0),
275277
bg_manual_only_(0),
@@ -288,7 +290,8 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
288290
delayed_writes_(0),
289291
storage_options_(options),
290292
bg_work_gate_closed_(false),
291-
refitting_level_(false) {
293+
refitting_level_(false),
294+
opened_successfully_(false) {
292295
mem_->Ref();
293296
env_->GetAbsolutePath(dbname, &db_absolute_path_);
294297

@@ -319,19 +322,46 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
319322
}
320323

321324
DBImpl::~DBImpl() {
322-
autovector<MemTable*> to_delete;
323-
324325
// Wait for background work to finish
325326
if (flush_on_destroy_ && mem_->GetFirstSequenceNumber() != 0) {
326327
FlushMemTable(FlushOptions());
327328
}
329+
328330
mutex_.Lock();
329331
shutting_down_.Release_Store(this); // Any non-nullptr value is ok
330332
while (bg_compaction_scheduled_ ||
331333
bg_flush_scheduled_ ||
332334
bg_logstats_scheduled_) {
333335
bg_cv_.Wait();
334336
}
337+
mutex_.Unlock();
338+
339+
// Release SuperVersion reference kept in ThreadLocalPtr.
340+
// This must be done outside of mutex_ since unref handler can lock mutex.
341+
// It also needs to be done after FlushMemTable, which can trigger local_sv_
342+
// access.
343+
delete local_sv_;
344+
345+
mutex_.Lock();
346+
if (options_.allow_thread_local) {
347+
// Clean up obsolete files due to SuperVersion release.
348+
// (1) Need to delete obsolete files before closing because RepairDB()
349+
// scans all existing files in the file system and builds manifest file.
350+
// Keeping obsolete files confuses the repair process.
351+
// (2) Need to check if we Open()/Recover() the DB successfully before
352+
// deleting because if VersionSet recover fails (may be due to corrupted
353+
// manifest file), it is not able to identify live files correctly. As a
354+
// result, all "live" files can get deleted by accident. However, corrupted
355+
// manifest is recoverable by RepairDB().
356+
if (opened_successfully_) {
357+
DeletionState deletion_state;
358+
FindObsoleteFiles(deletion_state, true);
359+
// manifest number starting from 2
360+
deletion_state.manifest_file_number = 1;
361+
PurgeObsoleteFiles(deletion_state);
362+
}
363+
}
364+
335365
if (super_version_ != nullptr) {
336366
bool is_last_reference __attribute__((unused));
337367
is_last_reference = super_version_->Unref();
@@ -349,6 +379,7 @@ DBImpl::~DBImpl() {
349379
delete mem_->Unref();
350380
}
351381

382+
autovector<MemTable*> to_delete;
352383
imm_.current()->Unref(&to_delete);
353384
for (MemTable* m: to_delete) {
354385
delete m;
@@ -1286,6 +1317,10 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
12861317

12871318
if (s.ok()) {
12881319
InstallSuperVersion(deletion_state);
1320+
// Reset SuperVersions cached in thread local storage
1321+
if (options_.allow_thread_local) {
1322+
ResetThreadLocalSuperVersions(&deletion_state);
1323+
}
12891324
if (madeProgress) {
12901325
*madeProgress = 1;
12911326
}
@@ -2811,26 +2846,21 @@ Status DBImpl::Get(const ReadOptions& options,
28112846
// DeletionState gets created and destructed outside of the lock -- we
28122847
// use this conveniently to:
28132848
// * malloc one SuperVersion() outside of the lock -- new_superversion
2814-
// * delete one SuperVersion() outside of the lock -- superversion_to_free
2849+
// * delete SuperVersion()s outside of the lock -- superversions_to_free
28152850
//
28162851
// However, if InstallSuperVersion() gets called twice with the same
28172852
// deletion_state, we can't reuse the SuperVersion() that got malloced because
28182853
// first call already used it. In that rare case, we take a hit and create a
2819-
// new SuperVersion() inside of the mutex. We do similar thing
2820-
// for superversion_to_free
2854+
// new SuperVersion() inside of the mutex.
28212855
void DBImpl::InstallSuperVersion(DeletionState& deletion_state) {
2856+
mutex_.AssertHeld();
28222857
// if new_superversion == nullptr, it means somebody already used it
28232858
SuperVersion* new_superversion =
28242859
(deletion_state.new_superversion != nullptr) ?
28252860
deletion_state.new_superversion : new SuperVersion();
28262861
SuperVersion* old_superversion = InstallSuperVersion(new_superversion);
28272862
deletion_state.new_superversion = nullptr;
2828-
if (deletion_state.superversion_to_free != nullptr) {
2829-
// somebody already put it there
2830-
delete old_superversion;
2831-
} else {
2832-
deletion_state.superversion_to_free = old_superversion;
2833-
}
2863+
deletion_state.superversions_to_free.push_back(old_superversion);
28342864
}
28352865

28362866
DBImpl::SuperVersion* DBImpl::InstallSuperVersion(
@@ -2839,14 +2869,31 @@ DBImpl::SuperVersion* DBImpl::InstallSuperVersion(
28392869
new_superversion->Init(mem_, imm_.current(), versions_->current());
28402870
SuperVersion* old_superversion = super_version_;
28412871
super_version_ = new_superversion;
2872+
super_version_->db = this;
28422873
++super_version_number_;
2874+
super_version_->version_number = super_version_number_;
2875+
28432876
if (old_superversion != nullptr && old_superversion->Unref()) {
28442877
old_superversion->Cleanup();
28452878
return old_superversion; // will let caller delete outside of mutex
28462879
}
28472880
return nullptr;
28482881
}
28492882

2883+
// Releases the SuperVersion references cached through local_sv_.
//
// REQUIRES: mutex_ is held (asserted below).
//
// Every pointer handed back by local_sv_->Scrape() is unreferenced. When
// that was the last reference, the SuperVersion is cleaned up here and
// appended to deletion_state->superversions_to_free; nothing is deleted in
// this function itself.
// NOTE(review): Scrape() presumably also clears each thread's cached slot;
// confirm against util/thread_local.h.
void DBImpl::ResetThreadLocalSuperVersions(DeletionState* deletion_state) {
  mutex_.AssertHeld();

  autovector<void*> scraped;
  local_sv_->Scrape(&scraped);

  for (void* raw : scraped) {
    assert(raw);
    SuperVersion* cached = static_cast<SuperVersion*>(raw);
    if (!cached->Unref()) {
      // Still referenced elsewhere; nothing more to do for this entry.
      continue;
    }
    // Last reference dropped: clean up now, let the caller's DeletionState
    // take care of the actual delete.
    cached->Cleanup();
    deletion_state->superversions_to_free.push_back(cached);
  }
}
2896+
28502897
Status DBImpl::GetImpl(const ReadOptions& options,
28512898
const Slice& key,
28522899
std::string* value,
@@ -2864,10 +2911,41 @@ Status DBImpl::GetImpl(const ReadOptions& options,
28642911
snapshot = versions_->LastSequence();
28652912
}
28662913

2867-
// This can be replaced by using atomics and spinlock instead of big mutex
2868-
mutex_.Lock();
2869-
SuperVersion* get_version = super_version_->Ref();
2870-
mutex_.Unlock();
2914+
// Acquire SuperVersion
2915+
SuperVersion* sv = nullptr;
2916+
if (LIKELY(options_.allow_thread_local)) {
2917+
// The SuperVersion is cached in thread local storage to avoid acquiring
2918+
// mutex when SuperVersion does not change since the last use. When a new
2919+
// SuperVersion is installed, the compaction or flush thread cleans up
2920+
// cached SuperVersion in all existing thread local storage. To avoid
2921+
// acquiring mutex for this operation, we use atomic Swap() on the thread
2922+
// local pointer to guarantee exclusive access. If the thread local pointer
2923+
// is being used while a new SuperVersion is installed, the cached
2924+
// SuperVersion can become stale. It will eventually get refreshed either
2925+
// on the next GetImpl() call or next SuperVersion installation.
2926+
sv = static_cast<SuperVersion*>(local_sv_->Swap(nullptr));
2927+
if (!sv || sv->version_number !=
2928+
super_version_number_.load(std::memory_order_relaxed)) {
2929+
RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_UPDATES);
2930+
SuperVersion* sv_to_delete = nullptr;
2931+
2932+
if (sv && sv->Unref()) {
2933+
mutex_.Lock();
2934+
sv->Cleanup();
2935+
sv_to_delete = sv;
2936+
} else {
2937+
mutex_.Lock();
2938+
}
2939+
sv = super_version_->Ref();
2940+
mutex_.Unlock();
2941+
2942+
delete sv_to_delete;
2943+
}
2944+
} else {
2945+
mutex_.Lock();
2946+
sv = super_version_->Ref();
2947+
mutex_.Unlock();
2948+
}
28712949

28722950
bool have_stat_update = false;
28732951
Version::GetStats stats;
@@ -2880,18 +2958,18 @@ Status DBImpl::GetImpl(const ReadOptions& options,
28802958
// merge_operands will contain the sequence of merges in the latter case.
28812959
LookupKey lkey(key, snapshot);
28822960
BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
2883-
if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) {
2961+
if (sv->mem->Get(lkey, value, &s, merge_context, options_)) {
28842962
// Done
28852963
RecordTick(options_.statistics.get(), MEMTABLE_HIT);
2886-
} else if (get_version->imm->Get(lkey, value, &s, merge_context, options_)) {
2964+
} else if (sv->imm->Get(lkey, value, &s, merge_context, options_)) {
28872965
// Done
28882966
RecordTick(options_.statistics.get(), MEMTABLE_HIT);
28892967
} else {
28902968
StopWatchNano from_files_timer(env_, false);
28912969
StartPerfTimer(&from_files_timer);
28922970

2893-
get_version->current->Get(options, lkey, value, &s, &merge_context, &stats,
2894-
options_, value_found);
2971+
sv->current->Get(options, lkey, value, &s, &merge_context, &stats,
2972+
options_, value_found);
28952973
have_stat_update = true;
28962974
BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer);
28972975
RecordTick(options_.statistics.get(), MEMTABLE_MISS);
@@ -2900,31 +2978,32 @@ Status DBImpl::GetImpl(const ReadOptions& options,
29002978
StopWatchNano post_process_timer(env_, false);
29012979
StartPerfTimer(&post_process_timer);
29022980

2903-
bool delete_get_version = false;
29042981
if (!options_.disable_seek_compaction && have_stat_update) {
29052982
mutex_.Lock();
2906-
if (get_version->current->UpdateStats(stats)) {
2983+
if (sv->current->UpdateStats(stats)) {
29072984
MaybeScheduleFlushOrCompaction();
29082985
}
2909-
if (get_version->Unref()) {
2910-
get_version->Cleanup();
2911-
delete_get_version = true;
2912-
}
29132986
mutex_.Unlock();
2987+
}
2988+
2989+
// Release SuperVersion
2990+
if (LIKELY(options_.allow_thread_local)) {
2991+
// Put the SuperVersion back
2992+
local_sv_->Reset(static_cast<void*>(sv));
29142993
} else {
2915-
if (get_version->Unref()) {
2994+
bool delete_sv = false;
2995+
if (sv->Unref()) {
29162996
mutex_.Lock();
2917-
get_version->Cleanup();
2997+
sv->Cleanup();
29182998
mutex_.Unlock();
2919-
delete_get_version = true;
2999+
delete_sv = true;
3000+
}
3001+
if (delete_sv) {
3002+
delete sv;
29203003
}
2921-
}
2922-
if (delete_get_version) {
2923-
delete get_version;
29243004
}
29253005

29263006
// Note, tickers are atomic now - no lock protection needed any more.
2927-
29283007
RecordTick(options_.statistics.get(), NUMBER_KEYS_READ);
29293008
RecordTick(options_.statistics.get(), BYTES_READ, value->size());
29303009
BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
@@ -3772,6 +3851,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
37723851
impl->mutex_.Unlock();
37733852

37743853
if (s.ok()) {
3854+
impl->opened_successfully_ = true;
37753855
*dbptr = impl;
37763856
} else {
37773857
delete impl;

db/db_impl.h

+27-4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "rocksdb/transaction_log.h"
2727
#include "util/autovector.h"
2828
#include "util/stats_logger.h"
29+
#include "util/thread_local.h"
2930
#include "db/internal_stats.h"
3031

3132
namespace rocksdb {
@@ -152,6 +153,9 @@ class DBImpl : public DB {
152153
// all memtables that we need to free through this vector. We then
153154
// delete all those memtables outside of mutex, during destruction
154155
autovector<MemTable*> to_delete;
156+
// Version number of the current SuperVersion
157+
uint64_t version_number;
158+
DBImpl* db;
155159

156160
// should be called outside the mutex
157161
SuperVersion() = default;
@@ -170,6 +174,16 @@ class DBImpl : public DB {
170174
Version* new_current);
171175
};
172176

177+
static void SuperVersionUnrefHandle(void* ptr) {
178+
DBImpl::SuperVersion* sv = static_cast<DBImpl::SuperVersion*>(ptr);
179+
if (sv->Unref()) {
180+
sv->db->mutex_.Lock();
181+
sv->Cleanup();
182+
sv->db->mutex_.Unlock();
183+
delete sv;
184+
}
185+
}
186+
173187
// needed for CleanupIteratorState
174188
struct DeletionState {
175189
inline bool HaveSomethingToDelete() const {
@@ -195,7 +209,7 @@ class DBImpl : public DB {
195209
// a list of memtables to be free
196210
autovector<MemTable*> memtables_to_free;
197211

198-
SuperVersion* superversion_to_free; // if nullptr nothing to free
212+
autovector<SuperVersion*> superversions_to_free;
199213

200214
SuperVersion* new_superversion; // if nullptr no new superversion
201215

@@ -207,7 +221,6 @@ class DBImpl : public DB {
207221
manifest_file_number = 0;
208222
log_number = 0;
209223
prev_log_number = 0;
210-
superversion_to_free = nullptr;
211224
new_superversion =
212225
create_superversion ? new SuperVersion() : nullptr;
213226
}
@@ -217,8 +230,10 @@ class DBImpl : public DB {
217230
for (auto m : memtables_to_free) {
218231
delete m;
219232
}
220-
// free superversion. if nullptr, this will be noop
221-
delete superversion_to_free;
233+
// free superversions
234+
for (auto s : superversions_to_free) {
235+
delete s;
236+
}
222237
// if new_superversion was not used, it will be non-nullptr and needs
223238
// to be freed here
224239
delete new_superversion;
@@ -400,6 +415,9 @@ class DBImpl : public DB {
400415
// InstallSuperVersion(), i.e. incremented every time super_version_
401416
// changes.
402417
std::atomic<uint64_t> super_version_number_;
418+
// Thread's local copy of SuperVersion pointer
419+
// This needs to be destructed after mutex_
420+
ThreadLocalPtr* local_sv_;
403421

404422
std::string host_name_;
405423

@@ -489,6 +507,9 @@ class DBImpl : public DB {
489507
// Guard against multiple concurrent refitting
490508
bool refitting_level_;
491509

510+
// Indicate DB was opened successfully
511+
bool opened_successfully_;
512+
492513
// No copying allowed
493514
DBImpl(const DBImpl&);
494515
void operator=(const DBImpl&);
@@ -515,6 +536,8 @@ class DBImpl : public DB {
515536
// deletion_state which can have new_superversion already allocated.
516537
void InstallSuperVersion(DeletionState& deletion_state);
517538

539+
void ResetThreadLocalSuperVersions(DeletionState* deletion_state);
540+
518541
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props)
519542
override;
520543

include/rocksdb/options.h

+4
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,10 @@ struct Options {
714714
//
715715
// Default: 0 (disabled)
716716
size_t max_successive_merges;
717+
718+
// Allow RocksDB to use thread local storage to optimize performance.
719+
// Default: true
720+
bool allow_thread_local;
717721
};
718722

719723
//

0 commit comments

Comments
 (0)