Skip to content

Commit

Permalink
db_stress fault injection in release mode (facebook#9957)
Browse files Browse the repository at this point in the history
Summary:
Previously all fault injection was ignored in release mode. This PR adds it back except for read fault injection (`--read_fault_one_in > 0`) since its dependency (`IGNORE_STATUS_IF_ERROR`) is unavailable in release mode.

Other notable changes include:

- Moved `EnableWriteErrorInjection()` for `--write_fault_one_in > 0` so it's after `DB::Open()` without depending on `SyncPoint`
- Made `--read_fault_one_in > 0` return an error in release mode
- Updated `db_crashtest.py` to always set `--read_fault_one_in=0` in release mode

Pull Request resolved: facebook#9957

Test Plan:
```
$ DEBUG_LEVEL=0 make -j24 db_stress
$ DEBUG_LEVEL=0 TEST_TMPDIR=/dev/shm python3 tools/db_crashtest.py blackbox
```

Reviewed By: anand1976

Differential Revision: D36193830

Pulled By: ajkr

fbshipit-source-id: 0b97946b4e3f06e3e0f6e7833c2763da08ec5321
  • Loading branch information
ajkr authored and facebook-github-bot committed May 6, 2022
1 parent b7aaa98 commit 62d84e2
Show file tree
Hide file tree
Showing 9 changed files with 18 additions and 47 deletions.
2 changes: 0 additions & 2 deletions db_stress_tool/db_stress_common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,9 @@

ROCKSDB_NAMESPACE::Env* db_stress_listener_env = nullptr;
ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr;
#ifndef NDEBUG
// If non-null, injects read error at a rate specified by the
// read_fault_one_in or write_fault_one_in flag
std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs_guard;
#endif // NDEBUG
enum ROCKSDB_NAMESPACE::CompressionType compression_type_e =
ROCKSDB_NAMESPACE::kSnappyCompression;
enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e =
Expand Down
6 changes: 1 addition & 5 deletions db_stress_tool/db_stress_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
#include "util/random.h"
#include "util/string_util.h"
#include "utilities/blob_db/blob_db.h"
#include "utilities/fault_injection_fs.h"
#include "utilities/merge_operators.h"

using GFLAGS_NAMESPACE::ParseCommandLineFlags;
Expand Down Expand Up @@ -296,12 +297,7 @@ constexpr int kValueMaxLen = 100;
// wrapped posix environment
extern ROCKSDB_NAMESPACE::Env* db_stress_env;
extern ROCKSDB_NAMESPACE::Env* db_stress_listener_env;
#ifndef NDEBUG
namespace ROCKSDB_NAMESPACE {
class FaultInjectionTestFS;
} // namespace ROCKSDB_NAMESPACE
extern std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs_guard;
#endif

extern enum ROCKSDB_NAMESPACE::CompressionType compression_type_e;
extern enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e;
Expand Down
5 changes: 3 additions & 2 deletions db_stress_tool/db_stress_driver.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,12 @@ bool RunStressTest(StressTest* stress) {
stress->InitDb(&shared);
stress->FinishInitDb(&shared);

#ifndef NDEBUG
if (FLAGS_sync_fault_injection) {
fault_fs_guard->SetFilesystemDirectWritable(false);
}
#endif
if (FLAGS_write_fault_one_in) {
fault_fs_guard->EnableWriteErrorInjection();
}

uint32_t n = FLAGS_threads;
uint64_t now = clock->NowMicros();
Expand Down
12 changes: 10 additions & 2 deletions db_stress_tool/db_stress_shared_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,13 +133,21 @@ class SharedState {
for (int i = 0; i < FLAGS_column_families; ++i) {
key_locks_[i].reset(new port::Mutex[num_locks]);
}
#ifndef NDEBUG
if (FLAGS_read_fault_one_in) {
#ifdef NDEBUG
// Unsupported in release mode because it relies on
// `IGNORE_STATUS_IF_ERROR` to distinguish faults not expected to lead to
// failure.
fprintf(stderr,
"Cannot set nonzero value for --read_fault_one_in in "
"release mode.");
exit(1);
#else // NDEBUG
SyncPoint::GetInstance()->SetCallBack("FaultInjectionIgnoreError",
IgnoreReadErrorCallback);
SyncPoint::GetInstance()->EnableProcessing();
#endif // NDEBUG
}
#endif // NDEBUG
}

~SharedState() {
Expand Down
8 changes: 2 additions & 6 deletions db_stress_tool/db_stress_test_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,7 @@ void StressTest::OperateDb(ThreadState* thread) {
fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(),
FLAGS_read_fault_one_in);
}
#endif // NDEBUG
if (FLAGS_write_fault_one_in) {
IOStatus error_msg;
if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) {
Expand All @@ -691,7 +692,6 @@ void StressTest::OperateDb(ThreadState* thread) {
thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg,
/*inject_for_all_file_types=*/false, types);
}
#endif // NDEBUG
thread->stats.Start();
for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) {
if (thread->shared->HasVerificationFailedYet() ||
Expand Down Expand Up @@ -2621,7 +2621,6 @@ void StressTest::Open(SharedState* shared) {
RegisterAdditionalListeners();
options_.create_missing_column_families = true;
if (!FLAGS_use_txn) {
#ifndef NDEBUG
// Determine whether we need to ingest file metadata write failures
// during DB reopen. If it does, enable it.
// Only ingest metadata error if it is reopening, as initial open
Expand Down Expand Up @@ -2663,7 +2662,6 @@ void StressTest::Open(SharedState* shared) {
}
}
while (true) {
#endif // NDEBUG
#ifndef ROCKSDB_LITE
// StackableDB-based BlobDB
if (FLAGS_use_blob_db) {
Expand Down Expand Up @@ -2693,7 +2691,6 @@ void StressTest::Open(SharedState* shared) {
}
}

#ifndef NDEBUG
if (ingest_meta_error || ingest_write_error || ingest_read_error) {
fault_fs_guard->SetFilesystemDirectWritable(true);
fault_fs_guard->DisableMetadataWriteErrorInjection();
Expand All @@ -2705,7 +2702,7 @@ void StressTest::Open(SharedState* shared) {
// wait for all compactions to finish to make sure DB is in
// clean state before executing queries.
s = static_cast_with_check<DBImpl>(db_->GetRootDB())
->TEST_WaitForCompact(true);
->WaitForCompact(true /* wait_unscheduled */);
if (!s.ok()) {
for (auto cf : column_families_) {
delete cf;
Expand Down Expand Up @@ -2738,7 +2735,6 @@ void StressTest::Open(SharedState* shared) {
}
break;
}
#endif // NDEBUG
} else {
#ifndef ROCKSDB_LITE
TransactionDBOptions txn_db_options;
Expand Down
12 changes: 0 additions & 12 deletions db_stress_tool/db_stress_tool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@
#include "db_stress_tool/db_stress_common.h"
#include "db_stress_tool/db_stress_driver.h"
#include "rocksdb/convenience.h"
#ifndef NDEBUG
#include "utilities/fault_injection_fs.h"
#endif

namespace ROCKSDB_NAMESPACE {
namespace {
Expand Down Expand Up @@ -82,7 +80,6 @@ int db_stress_tool(int argc, char** argv) {
dbsl_env_wrapper_guard = std::make_shared<DbStressEnvWrapper>(raw_env);
db_stress_listener_env = dbsl_env_wrapper_guard.get();

#ifndef NDEBUG
if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection ||
FLAGS_write_fault_one_in || FLAGS_open_metadata_write_fault_one_in ||
FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) {
Expand All @@ -98,18 +95,10 @@ int db_stress_tool(int argc, char** argv) {
std::make_shared<CompositeEnvWrapper>(raw_env, fault_fs_guard);
raw_env = fault_env_guard.get();
}
if (FLAGS_write_fault_one_in) {
SyncPoint::GetInstance()->SetCallBack(
"BuildTable:BeforeFinishBuildTable",
[&](void*) { fault_fs_guard->EnableWriteErrorInjection(); });
SyncPoint::GetInstance()->EnableProcessing();
}
#endif

env_wrapper_guard = std::make_shared<DbStressEnvWrapper>(raw_env);
db_stress_env = env_wrapper_guard.get();

#ifndef NDEBUG
if (FLAGS_write_fault_one_in) {
// In the write injection case, we need to use the FS interface and returns
// the IOStatus with different error and flags. Therefore,
Expand All @@ -118,7 +107,6 @@ int db_stress_tool(int argc, char** argv) {
// CompositeEnvWrapper of env and fault_fs.
db_stress_env = raw_env;
}
#endif

FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());

Expand Down
2 changes: 0 additions & 2 deletions db_stress_tool/multi_ops_txns_stress.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@

#include "rocksdb/utilities/write_batch_with_index.h"
#include "util/defer.h"
#ifndef NDEBUG
#include "utilities/fault_injection_fs.h"
#endif // NDEBUG
#include "utilities/transactions/write_prepared_txn_db.h"

namespace ROCKSDB_NAMESPACE {
Expand Down
16 changes: 0 additions & 16 deletions db_stress_tool/no_batched_ops_stress.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@

#ifdef GFLAGS
#include "db_stress_tool/db_stress_common.h"
#ifndef NDEBUG
#include "utilities/fault_injection_fs.h"
#endif // NDEBUG
#include "rocksdb/utilities/transaction_db.h"

namespace ROCKSDB_NAMESPACE {
Expand Down Expand Up @@ -234,20 +232,15 @@ class NonBatchedOpsStressTest : public StressTest {
std::string from_db;
int error_count = 0;

#ifndef NDEBUG
if (fault_fs_guard) {
fault_fs_guard->EnableErrorInjection();
SharedState::ignore_read_error = false;
}
#endif // NDEBUG
Status s = db_->Get(read_opts, cfh, key, &from_db);
#ifndef NDEBUG
if (fault_fs_guard) {
error_count = fault_fs_guard->GetAndResetErrorCount();
}
#endif // NDEBUG
if (s.ok()) {
#ifndef NDEBUG
if (fault_fs_guard) {
if (error_count && !SharedState::ignore_read_error) {
// Grab mutex so multiple thread don't try to print the
Expand All @@ -259,7 +252,6 @@ class NonBatchedOpsStressTest : public StressTest {
std::terminate();
}
}
#endif // NDEBUG
// found case
thread->stats.AddGets(1, 1);
} else if (s.IsNotFound()) {
Expand All @@ -273,11 +265,9 @@ class NonBatchedOpsStressTest : public StressTest {
thread->stats.AddVerifiedErrors(1);
}
}
#ifndef NDEBUG
if (fault_fs_guard) {
fault_fs_guard->DisableErrorInjection();
}
#endif // NDEBUG
return s;
}

Expand Down Expand Up @@ -365,27 +355,22 @@ class NonBatchedOpsStressTest : public StressTest {
}

if (!use_txn) {
#ifndef NDEBUG
if (fault_fs_guard) {
fault_fs_guard->EnableErrorInjection();
SharedState::ignore_read_error = false;
}
#endif // NDEBUG
db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
statuses.data());
#ifndef NDEBUG
if (fault_fs_guard) {
error_count = fault_fs_guard->GetAndResetErrorCount();
}
#endif // NDEBUG
} else {
#ifndef ROCKSDB_LITE
txn->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
statuses.data());
#endif
}

#ifndef NDEBUG
if (fault_fs_guard && error_count && !SharedState::ignore_read_error) {
int stat_nok = 0;
for (const auto& s : statuses) {
Expand All @@ -409,7 +394,6 @@ class NonBatchedOpsStressTest : public StressTest {
if (fault_fs_guard) {
fault_fs_guard->DisableErrorInjection();
}
#endif // NDEBUG

for (size_t i = 0; i < statuses.size(); ++i) {
Status s = statuses[i];
Expand Down
2 changes: 2 additions & 0 deletions tools/db_crashtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,8 @@ def is_direct_io_supported(dbname):
def finalize_and_sanitize(src_params):
dest_params = dict([(k, v() if callable(v) else v)
for (k, v) in src_params.items()])
if is_release_mode():
dest_params['read_fault_one_in'] = 0
if dest_params.get("compression_max_dict_bytes") == 0:
dest_params["compression_zstd_max_train_bytes"] = 0
dest_params["compression_max_dict_buffer_bytes"] = 0
Expand Down

0 comments on commit 62d84e2

Please sign in to comment.