Skip to content

Commit

Permalink
Be able to decrease background thread's CPU priority when creating da…
Browse files Browse the repository at this point in the history
…tabase backup (facebook#6602)

Summary:
When creating a database backup, the background threads will not only consume IO resources by copying files, but also consuming CPU such as by computing checksums. During peak times, the CPU consumption by the background threads might affect online queries.

This PR makes it possible to decrease CPU priority of these threads when creating a new backup.
Pull Request resolved: facebook#6602

Test Plan: make check

Reviewed By: siying, zhichao-cao

Differential Revision: D20683216

Pulled By: cheng-chang

fbshipit-source-id: 9978b9ed9488e8ce135e90ca083e5b4b7221fd84
  • Loading branch information
Cheng Chang authored and facebook-github-bot committed Mar 29, 2020
1 parent 3a35542 commit ee50b8d
Show file tree
Hide file tree
Showing 18 changed files with 324 additions and 98 deletions.
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* Fix spelling so that API now has correctly spelled transaction state name `COMMITTED`, while the old misspelled `COMMITED` is still available as an alias.
* Updated default format_version in BlockBasedTableOptions from 2 to 4. SST files generated with the new default can be read by RocksDB versions 5.16 and newer, and use more efficient encoding of keys in index blocks.
* `Cache::Insert` now expects clients to pass in function objects implementing the `Cache::Deleter` interface as deleters instead of plain function pointers.
* A new parameter `CreateBackupOptions` is added to both `BackupEngine::CreateNewBackup` and `BackupEngine::CreateNewBackupWithMetadata`, you can decrease CPU priority of `BackupEngine`'s background threads by setting `decrease_background_thread_cpu_priority` and `background_thread_cpu_priority` in `CreateBackupOptions`.

### Bug Fixes
* Fix a bug where range tombstone blocks in ingested files were cached incorrectly during ingestion. If range tombstones were read from those incorrectly cached blocks, the keys they covered would be exposed.
Expand Down
2 changes: 1 addition & 1 deletion db/db_impl/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ Status DBImpl::ResumeImpl() {
}

// Make sure the IO Status stored in version set is set to OK.
if(s.ok()) {
if (s.ok()) {
versions_->SetIOStatusOK();
}

Expand Down
13 changes: 7 additions & 6 deletions db/db_impl/db_impl_compaction_flush.cc
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,8 @@ Status DBImpl::FlushMemTableToOutputFile(
}

if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
if (!io_s.ok()&& !io_s.IsShutdownInProgress() && !io_s.IsColumnFamilyDropped()) {
if (!io_s.ok() && !io_s.IsShutdownInProgress() &&
!io_s.IsColumnFamilyDropped()) {
error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush);
} else {
Status new_bg_error = s;
Expand Down Expand Up @@ -2876,11 +2877,11 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
} else {
ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
status.ToString().c_str());
if (!io_s.ok()) {
error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction);
} else {
error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
}
if (!io_s.ok()) {
error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction);
} else {
error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
}
if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) {
// Put this cfd back in the compaction queue so we can retry after some
// time
Expand Down
2 changes: 1 addition & 1 deletion db/error_handler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err,
Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
bg_error_ = bg_err;
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
db_mutex_, &auto_recovery);
db_mutex_, &auto_recovery);
return bg_error_;
} else if (bg_io_err.GetRetryable()) {
// Second, check if the error is a retryable IO error or not. if it is
Expand Down
10 changes: 5 additions & 5 deletions db/error_handler_fs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteError) {

TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeError) {
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
new FaultInjectionTestFS(FileSystem::Default()));
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Expand Down Expand Up @@ -289,7 +289,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteError) {

TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) {
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
new FaultInjectionTestFS(FileSystem::Default()));
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Expand Down Expand Up @@ -457,7 +457,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) {

TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) {
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
new FaultInjectionTestFS(FileSystem::Default()));
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Expand Down Expand Up @@ -556,7 +556,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteError) {

TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableError) {
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
new FaultInjectionTestFS(FileSystem::Default()));
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Expand Down Expand Up @@ -778,7 +778,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) {

TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) {
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
new FaultInjectionTestFS(FileSystem::Default()));
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Expand Down
6 changes: 4 additions & 2 deletions db/version_set_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1378,7 +1378,8 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase,
TEST_F(EmptyDefaultCfNewManifest, Recover) {
PrepareManifest(nullptr, nullptr, &log_writer_);
log_writer_.reset();
Status s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
Status s =
SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
ASSERT_OK(s);
std::string manifest_path;
VerifyManifest(&manifest_path);
Expand Down Expand Up @@ -1440,7 +1441,8 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
PrepareManifest(nullptr, nullptr, &log_writer_);
log_writer_.reset();
Status s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
Status s =
SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
ASSERT_OK(s);

std::string manifest_path;
Expand Down
4 changes: 2 additions & 2 deletions file/filename.cc
Original file line number Diff line number Diff line change
Expand Up @@ -369,8 +369,8 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
}

IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
uint64_t descriptor_number,
FSDirectory* directory_to_fsync) {
uint64_t descriptor_number,
FSDirectory* directory_to_fsync) {
// Remove leading "dbname/" and add newline to manifest file name
std::string manifest = DescriptorFileName(dbname, descriptor_number);
Slice contents = manifest;
Expand Down
4 changes: 2 additions & 2 deletions file/filename.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ extern bool ParseFileName(const std::string& filename, uint64_t* number,
// Make the CURRENT file point to the descriptor file with the
// specified number.
extern IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
uint64_t descriptor_number,
FSDirectory* directory_to_fsync);
uint64_t descriptor_number,
FSDirectory* directory_to_fsync);

// Make the IDENTITY file for the db
extern Status SetIdentityFile(Env* env, const std::string& dbname,
Expand Down
7 changes: 7 additions & 0 deletions include/rocksdb/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ class InternalKeyComparator;
class WalFilter;
class FileSystem;

enum class CpuPriority {
kIdle = 0,
kLow = 1,
kNormal = 2,
kHigh = 3,
};

// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs. Each block may be compressed before
// being stored in a file. The following enum describes which
Expand Down
103 changes: 87 additions & 16 deletions include/rocksdb/utilities/backupable_db.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "rocksdb/utilities/stackable_db.h"

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"

namespace ROCKSDB_NAMESPACE {
Expand Down Expand Up @@ -142,6 +143,24 @@ struct BackupableDBOptions {
}
};

struct CreateBackupOptions {
// Flush will always trigger if 2PC is enabled.
// If write-ahead logs are disabled, set flush_before_backup=true to
// avoid losing unflushed key/value pairs from the memtable.
bool flush_before_backup = false;

// Callback for reporting progress.
std::function<void()> progress_callback = []() {};

// If false, background_thread_cpu_priority is ignored.
// Otherwise, the cpu priority can be decreased,
// if you try to increase the priority, the priority will not change.
// The initial priority of the threads is CpuPriority::kNormal,
// so you can decrease to priorities lower than kNormal.
bool decrease_background_thread_cpu_priority = false;
CpuPriority background_thread_cpu_priority = CpuPriority::kNormal;
};

struct RestoreOptions {
// If true, restore won't overwrite the existing log files in wal_dir. It will
// also move all log files from archive directory to wal_dir. Use this option
Expand Down Expand Up @@ -208,8 +227,13 @@ class BackupEngineReadOnly {
public:
virtual ~BackupEngineReadOnly() {}

static Status Open(Env* db_env, const BackupableDBOptions& options,
static Status Open(const BackupableDBOptions& options, Env* db_env,
BackupEngineReadOnly** backup_engine_ptr);
// keep for backward compatibility.
static Status Open(Env* db_env, const BackupableDBOptions& options,
BackupEngineReadOnly** backup_engine_ptr) {
return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr);
}

// Returns info about backups in backup_info
// You can GetBackupInfo safely, even with other BackupEngine performing
Expand All @@ -225,14 +249,29 @@ class BackupEngineReadOnly {
// responsibility to synchronize the operation, i.e. don't delete the backup
// when you're restoring from it
// See also the corresponding doc in BackupEngine
virtual Status RestoreDBFromBackup(const RestoreOptions& options,
BackupID backup_id,
const std::string& db_dir,
const std::string& wal_dir) = 0;

// keep for backward compatibility.
virtual Status RestoreDBFromBackup(
BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
const RestoreOptions& restore_options = RestoreOptions()) = 0;
const RestoreOptions& options = RestoreOptions()) {
return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir);
}

// See the corresponding doc in BackupEngine
virtual Status RestoreDBFromLatestBackup(const RestoreOptions& options,
const std::string& db_dir,
const std::string& wal_dir) = 0;

// keep for backward compatibility.
virtual Status RestoreDBFromLatestBackup(
const std::string& db_dir, const std::string& wal_dir,
const RestoreOptions& restore_options = RestoreOptions()) = 0;
const RestoreOptions& options = RestoreOptions()) {
return RestoreDBFromLatestBackup(options, db_dir, wal_dir);
}

// checks that each file exists and that the size of the file matches our
// expectations. it does not check file checksum.
Expand All @@ -253,27 +292,44 @@ class BackupEngine {

// BackupableDBOptions have to be the same as the ones used in previous
// BackupEngines for the same backup directory.
static Status Open(Env* db_env, const BackupableDBOptions& options,
static Status Open(const BackupableDBOptions& options, Env* db_env,
BackupEngine** backup_engine_ptr);

// same as CreateNewBackup, but stores extra application metadata
// Flush will always trigger if 2PC is enabled.
// If write-ahead logs are disabled, set flush_before_backup=true to
// avoid losing unflushed key/value pairs from the memtable.
// keep for backward compatibility.
static Status Open(Env* db_env, const BackupableDBOptions& options,
BackupEngine** backup_engine_ptr) {
return BackupEngine::Open(options, db_env, backup_engine_ptr);
}

// same as CreateNewBackup, but stores extra application metadata.
virtual Status CreateNewBackupWithMetadata(
const CreateBackupOptions& options, DB* db,
const std::string& app_metadata) = 0;

// keep here for backward compatibility.
virtual Status CreateNewBackupWithMetadata(
DB* db, const std::string& app_metadata, bool flush_before_backup = false,
std::function<void()> progress_callback = []() {}) = 0;
std::function<void()> progress_callback = []() {}) {
CreateBackupOptions options;
options.flush_before_backup = flush_before_backup;
options.progress_callback = progress_callback;
return CreateNewBackupWithMetadata(options, db, app_metadata);
}

// Captures the state of the database in the latest backup
// NOT a thread safe call
// Flush will always trigger if 2PC is enabled.
// If write-ahead logs are disabled, set flush_before_backup=true to
// avoid losing unflushed key/value pairs from the memtable.
virtual Status CreateNewBackup(const CreateBackupOptions& options, DB* db) {
return CreateNewBackupWithMetadata(options, db, "");
}

// keep here for backward compatibility.
virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false,
std::function<void()> progress_callback =
[]() {}) {
return CreateNewBackupWithMetadata(db, "", flush_before_backup,
progress_callback);
CreateBackupOptions options;
options.flush_before_backup = flush_before_backup;
options.progress_callback = progress_callback;
return CreateNewBackup(options, db);
}

// Deletes old backups, keeping latest num_backups_to_keep alive.
Expand Down Expand Up @@ -313,14 +369,29 @@ class BackupEngine {
// database will diverge from backups 4 and 5 and the new backup will fail.
// If you want to create new backup, you will first have to delete backups 4
// and 5.
virtual Status RestoreDBFromBackup(const RestoreOptions& options,
BackupID backup_id,
const std::string& db_dir,
const std::string& wal_dir) = 0;

// keep for backward compatibility.
virtual Status RestoreDBFromBackup(
BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
const RestoreOptions& restore_options = RestoreOptions()) = 0;
const RestoreOptions& options = RestoreOptions()) {
return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir);
}

// restore from the latest backup
virtual Status RestoreDBFromLatestBackup(const RestoreOptions& options,
const std::string& db_dir,
const std::string& wal_dir) = 0;

// keep for backward compatibility.
virtual Status RestoreDBFromLatestBackup(
const std::string& db_dir, const std::string& wal_dir,
const RestoreOptions& restore_options = RestoreOptions()) = 0;
const RestoreOptions& options = RestoreOptions()) {
return RestoreDBFromLatestBackup(options, db_dir, wal_dir);
}

// checks that each file exists and that the size of the file matches our
// expectations. it does not check file checksum.
Expand Down
30 changes: 30 additions & 0 deletions port/port_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <stdio.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>
#include <cstdlib>
Expand Down Expand Up @@ -230,5 +231,34 @@ static size_t GetPageSize() {

const size_t kPageSize = GetPageSize();

void SetCpuPriority(ThreadId id, CpuPriority priority) {
#ifdef OS_LINUX
sched_param param;
param.sched_priority = 0;
switch (priority) {
case CpuPriority::kHigh:
sched_setscheduler(id, SCHED_OTHER, &param);
setpriority(PRIO_PROCESS, id, -20);
break;
case CpuPriority::kNormal:
sched_setscheduler(id, SCHED_OTHER, &param);
setpriority(PRIO_PROCESS, id, 0);
break;
case CpuPriority::kLow:
sched_setscheduler(id, SCHED_OTHER, &param);
setpriority(PRIO_PROCESS, id, 19);
break;
case CpuPriority::kIdle:
sched_setscheduler(id, SCHED_IDLE, &param);
break;
default:
assert(false);
}
#else
(void)id;
(void)priority;
#endif
}

} // namespace port
} // namespace ROCKSDB_NAMESPACE
5 changes: 5 additions & 0 deletions port/port_posix.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

#include <thread>

#include "rocksdb/options.h"
#include "rocksdb/rocksdb_namespace.h"

// size_t printf formatting named in the manner of C99 standard formatting
Expand Down Expand Up @@ -214,5 +215,9 @@ extern int GetMaxOpenFiles();

extern const size_t kPageSize;

using ThreadId = pid_t;

extern void SetCpuPriority(ThreadId id, CpuPriority priority);

} // namespace port
} // namespace ROCKSDB_NAMESPACE
Loading

0 comments on commit ee50b8d

Please sign in to comment.