Skip to content

Commit

Permalink
[rocksdb] allow larger write buffers, sst files and levels in rocksdb (
Browse files Browse the repository at this point in the history
…MystenLabs#10919)

## Description 

It is observed that write buffers rarely exceed 150MB for each column
family. Allowing write buffers to grow larger would help decrease write
amplification during compactions and improve read performance of
recent data.

With increased sizes of write buffers, we need to increase SST file
sizes and level size limits to avoid excessive churn. These changes
would also help reduce write amplification on their own.

Also, refactor RocksDB options configuration to use a builder-style API,
so it is easier to match different use cases with different combinations
of API calls.

## Test Plan 

Deployed to private testnet.

---
If your changes are not user-facing and not a breaking change, you can
skip the following section. Otherwise, please indicate what changed, and
then add to the Release Notes section as highlighted during the release
process.

### Type of Change (Check all that apply)

- [ ] user-visible impact
- [ ] breaking change for client SDKs
- [ ] breaking change for FNs (FN binary must upgrade)
- [ ] breaking change for validators or node operators (must upgrade
binaries)
- [ ] breaking change for on-chain data layout
- [ ] necessitate either a data wipe or data migration

### Release notes
  • Loading branch information
mwtian authored Apr 15, 2023
1 parent c15ee67 commit 3ca4d30
Show file tree
Hide file tree
Showing 9 changed files with 204 additions and 116 deletions.
6 changes: 4 additions & 2 deletions crates/sui-core/src/authority/authority_per_epoch_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ use sui_types::messages::{
use sui_types::signature::GenericSignature;
use tracing::{debug, error, info, trace, warn};
use typed_store::rocks::{
point_lookup_db_options, DBBatch, DBMap, DBOptions, MetricConf, TypedStoreError,
default_db_options, DBBatch, DBMap, DBOptions, MetricConf, TypedStoreError,
};
use typed_store::traits::{TableSummary, TypedStoreDebug};

Expand Down Expand Up @@ -1984,7 +1984,9 @@ impl AuthorityPerEpochStore {
}

fn transactions_table_default_config() -> DBOptions {
point_lookup_db_options()
default_db_options()
.optimize_for_point_lookup(128)
.optimize_for_write_throughput()
}

impl ExecutionComponents {
Expand Down
33 changes: 26 additions & 7 deletions crates/sui-core/src/authority/authority_store_pruner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,6 @@ impl AuthorityStorePruner {

#[cfg(test)]
mod tests {
use fs_extra::dir::get_size;
use more_asserts as ma;
use std::path::Path;
use std::time::Duration;
Expand Down Expand Up @@ -539,7 +538,7 @@ mod tests {
async fn test_db_size_after_compaction() -> Result<(), anyhow::Error> {
let primary_path = tempfile::tempdir()?.into_path();
let perpetual_db = Arc::new(AuthorityPerpetualTables::open(&primary_path, None));
let total_unique_object_ids = 200_000;
let total_unique_object_ids = 10_000;
let num_versions_per_object = 10;
let ids = ObjectID::in_range(ObjectID::ZERO, total_unique_object_ids)?;
let mut to_delete = vec![];
Expand All @@ -554,8 +553,29 @@ mod tests {
.insert(&ObjectKey(id, SequenceNumber::from(i)), &obj)?;
}
}

// Sums the on-disk length, in bytes, of every entry directly inside `path`
// whose file extension is exactly `sst`. Entries with no extension or with a
// different extension contribute nothing. The directory is not walked
// recursively.
//
// Panics if the directory cannot be read, if an entry cannot be read, or if
// the metadata of a matching entry cannot be fetched.
fn get_sst_size(path: &Path) -> u64 {
    std::fs::read_dir(path)
        .unwrap()
        .map(|dir_entry| dir_entry.unwrap().path())
        // Keep only paths that have an extension and whose extension is "sst".
        .filter(|file| file.extension().map_or(false, |suffix| suffix == "sst"))
        .map(|file| std::fs::metadata(file).unwrap().len())
        .sum()
}

let db_path = primary_path.clone().join("perpetual");
let start = ObjectKey(ObjectID::ZERO, SequenceNumber::MIN);
let end = ObjectKey(ObjectID::MAX, SequenceNumber::MAX);

perpetual_db.objects.rocksdb.flush()?;
let before_compaction_size = get_size(primary_path.clone()).unwrap();
perpetual_db.objects.compact_range_to_bottom(&start, &end)?;
let before_compaction_size = get_sst_size(&db_path);

let mut effects = TransactionEffects::default();
*effects.modified_at_versions_mut_for_testing() = to_delete;
Expand All @@ -571,11 +591,10 @@ mod tests {
)
.await;
info!("Total pruned keys = {:?}", total_pruned);
let start = ObjectKey(ObjectID::ZERO, SequenceNumber::MIN);
let end = ObjectKey(ObjectID::MAX, SequenceNumber::MAX);
perpetual_db.objects.compact_range(&start, &end)?;

let after_compaction_size = get_size(primary_path).unwrap();
perpetual_db.objects.rocksdb.flush()?;
perpetual_db.objects.compact_range_to_bottom(&start, &end)?;
let after_compaction_size = get_sst_size(&db_path);

info!(
"Before compaction disk size = {:?}, after compaction disk size = {:?}",
Expand Down
57 changes: 29 additions & 28 deletions crates/sui-core/src/authority/authority_store_tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ use sui_types::storage::ObjectStore;
use typed_store::metrics::SamplingInterval;
use typed_store::rocks::util::{empty_compaction_filter, reference_count_merge_operator};
use typed_store::rocks::{
optimized_for_high_throughput_options, read_size_from_env, DBBatch, DBMap, DBOptions,
MetricConf, ReadWriteOptions,
default_db_options, read_size_from_env, DBBatch, DBMap, DBOptions, MetricConf, ReadWriteOptions,
};
use typed_store::traits::{Map, TableSummary, TypedStoreDebug};

Expand All @@ -28,8 +27,8 @@ const ENV_VAR_OBJECTS_BLOCK_CACHE_SIZE: &str = "OBJECTS_BLOCK_CACHE_MB";
const ENV_VAR_LOCKS_BLOCK_CACHE_SIZE: &str = "LOCKS_BLOCK_CACHE_MB";
const ENV_VAR_TRANSACTIONS_BLOCK_CACHE_SIZE: &str = "TRANSACTIONS_BLOCK_CACHE_MB";
const ENV_VAR_EFFECTS_BLOCK_CACHE_SIZE: &str = "EFFECTS_BLOCK_CACHE_MB";
const ENV_VAR_INDIRECT_OBJECTS_BLOCK_CACHE_SIZE: &str = "INDIRECT_OBJECTS_BLOCK_CACHE_MB";
const ENV_VAR_EVENTS_BLOCK_CACHE_SIZE: &str = "EVENTS_BLOCK_CACHE_MB";
const ENV_VAR_INDIRECT_OBJECTS_BLOCK_CACHE_SIZE: &str = "INDIRECT_OBJECTS_BLOCK_CACHE_MB";

/// AuthorityPerpetualTables contains data that must be preserved from one epoch to the next.
#[derive(DBMapUtils)]
Expand Down Expand Up @@ -408,51 +407,53 @@ impl Iterator for LiveSetIter<'_> {

// These functions are used to initialize the DB tables
fn owned_object_transaction_locks_table_default_config() -> DBOptions {
optimized_for_high_throughput_options(
read_size_from_env(ENV_VAR_LOCKS_BLOCK_CACHE_SIZE).unwrap_or(1024),
false,
)
default_db_options()
.optimize_for_write_throughput()
.optimize_for_read(read_size_from_env(ENV_VAR_LOCKS_BLOCK_CACHE_SIZE).unwrap_or(1024))
}

fn objects_table_default_config() -> DBOptions {
DBOptions {
options: optimized_for_high_throughput_options(
read_size_from_env(ENV_VAR_OBJECTS_BLOCK_CACHE_SIZE).unwrap_or(5 * 1024),
false,
)
.options,
options: default_db_options()
.optimize_for_write_throughput()
.optimize_for_read(
read_size_from_env(ENV_VAR_OBJECTS_BLOCK_CACHE_SIZE).unwrap_or(5 * 1024),
)
.options,
rw_options: ReadWriteOptions {
ignore_range_deletions: true,
},
}
}

fn transactions_table_default_config() -> DBOptions {
optimized_for_high_throughput_options(
read_size_from_env(ENV_VAR_TRANSACTIONS_BLOCK_CACHE_SIZE).unwrap_or(512),
true,
)
default_db_options()
.optimize_for_write_throughput()
.optimize_for_point_lookup(
read_size_from_env(ENV_VAR_TRANSACTIONS_BLOCK_CACHE_SIZE).unwrap_or(512),
)
}

fn effects_table_default_config() -> DBOptions {
optimized_for_high_throughput_options(
read_size_from_env(ENV_VAR_EFFECTS_BLOCK_CACHE_SIZE).unwrap_or(1024),
true,
)
default_db_options()
.optimize_for_write_throughput()
.optimize_for_point_lookup(
read_size_from_env(ENV_VAR_EFFECTS_BLOCK_CACHE_SIZE).unwrap_or(1024),
)
}

fn events_table_default_config() -> DBOptions {
optimized_for_high_throughput_options(
read_size_from_env(ENV_VAR_EVENTS_BLOCK_CACHE_SIZE).unwrap_or(1024),
false,
)
default_db_options()
.optimize_for_write_throughput()
.optimize_for_read(read_size_from_env(ENV_VAR_EVENTS_BLOCK_CACHE_SIZE).unwrap_or(1024))
}

fn indirect_move_objects_table_default_config() -> DBOptions {
let mut options = optimized_for_high_throughput_options(
read_size_from_env(ENV_VAR_INDIRECT_OBJECTS_BLOCK_CACHE_SIZE).unwrap_or(512),
true,
);
let mut options = default_db_options()
.optimize_for_write_throughput()
.optimize_for_point_lookup(
read_size_from_env(ENV_VAR_INDIRECT_OBJECTS_BLOCK_CACHE_SIZE).unwrap_or(512),
);
options.options.set_merge_operator(
"refcount operator",
reference_count_merge_operator,
Expand Down
4 changes: 2 additions & 2 deletions crates/sui-core/src/epoch/committee_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use std::sync::Arc;
use sui_types::base_types::ObjectID;
use sui_types::committee::{Committee, EpochId};
use sui_types::error::{SuiError, SuiResult};
use typed_store::rocks::{point_lookup_db_options, DBMap, DBOptions, MetricConf};
use typed_store::rocks::{default_db_options, DBMap, DBOptions, MetricConf};
use typed_store::traits::{TableSummary, TypedStoreDebug};

use typed_store::Map;
Expand All @@ -31,7 +31,7 @@ pub struct CommitteeStoreTables {

// These functions are used to initialize the DB tables
fn committee_table_default_config() -> DBOptions {
point_lookup_db_options()
default_db_options().optimize_for_point_lookup(64)
}

impl CommitteeStore {
Expand Down
8 changes: 6 additions & 2 deletions crates/sui-node/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ use sui_types::quorum_driver_types::QuorumDriverEffectsQueueResult;
use sui_types::sui_system_state::epoch_start_sui_system_state::EpochStartSystemState;
use sui_types::sui_system_state::epoch_start_sui_system_state::EpochStartSystemStateTrait;
use sui_types::sui_system_state::SuiSystemStateTrait;
use typed_store::rocks::default_db_options;
use typed_store::DBMetrics;

use crate::metrics::GrpcMetrics;
Expand Down Expand Up @@ -196,9 +197,11 @@ impl SuiNode {
&genesis_committee,
None,
));

let perpetual_options = default_db_options().optimize_db_for_write_throughput(4);
let store = AuthorityStore::open(
&config.db_path().join("store"),
None,
Some(perpetual_options.options),
genesis,
&committee_store,
config.indirect_objects_threshold,
Expand All @@ -218,11 +221,12 @@ impl SuiNode {
let cache_metrics = Arc::new(ResolverMetrics::new(&prometheus_registry));
let signature_verifier_metrics = SignatureVerifierMetrics::new(&prometheus_registry);

let epoch_options = default_db_options().optimize_db_for_write_throughput(4);
let epoch_store = AuthorityPerEpochStore::new(
config.protocol_public_key(),
committee.clone(),
&config.db_path().join("store"),
None,
Some(epoch_options.options),
EpochMetrics::new(&registry_service.default_registry()),
epoch_start_configuration,
store.clone(),
Expand Down
4 changes: 2 additions & 2 deletions crates/sui-rosetta/src/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use sui_sdk::SuiClient;
use sui_types::base_types::{EpochId, SuiAddress};
use sui_types::messages_checkpoint::CheckpointSequenceNumber;
use tracing::{debug, error, info, warn};
use typed_store::rocks::{point_lookup_db_options, DBMap, DBOptions, MetricConf};
use typed_store::rocks::{default_db_options, DBMap, DBOptions, MetricConf};
use typed_store::traits::TableSummary;
use typed_store::traits::TypedStoreDebug;
use typed_store::Map;
Expand Down Expand Up @@ -349,5 +349,5 @@ impl CheckpointIndexStore {
}

fn default_config() -> DBOptions {
point_lookup_db_options()
default_db_options().optimize_for_point_lookup(64)
}
16 changes: 8 additions & 8 deletions crates/sui-storage/src/indexes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,8 @@ use sui_types::error::{SuiError, SuiResult};
use sui_types::messages::TransactionEvents;
use sui_types::object::Owner;
use sui_types::query::TransactionFilter;
use typed_store::rocks::{default_db_options, point_lookup_db_options, DBBatch, DBMap, MetricConf};
use typed_store::rocks::{
optimized_for_high_throughput_options, read_size_from_env, DBOptions, ReadWriteOptions,
default_db_options, read_size_from_env, DBBatch, DBMap, DBOptions, MetricConf, ReadWriteOptions,
};
use typed_store::traits::Map;
use typed_store::traits::{TableSummary, TypedStoreDebug};
Expand Down Expand Up @@ -156,7 +155,7 @@ fn transactions_by_move_function_table_default_config() -> DBOptions {
default_db_options()
}
fn timestamps_table_default_config() -> DBOptions {
point_lookup_db_options()
default_db_options().optimize_for_point_lookup(64)
}
fn owner_index_table_default_config() -> DBOptions {
default_db_options()
Expand All @@ -169,11 +168,12 @@ fn index_table_default_config() -> DBOptions {
}
fn coin_index_table_default_config() -> DBOptions {
DBOptions {
options: optimized_for_high_throughput_options(
read_size_from_env(ENV_VAR_COIN_INDEX_BLOCK_CACHE_SIZE_MB).unwrap_or(5 * 1024),
false,
)
.options,
options: default_db_options()
.optimize_for_write_throughput()
.optimize_for_read(
read_size_from_env(ENV_VAR_COIN_INDEX_BLOCK_CACHE_SIZE_MB).unwrap_or(5 * 1024),
)
.options,
rw_options: ReadWriteOptions {
ignore_range_deletions: true,
},
Expand Down
Loading

0 comments on commit 3ca4d30

Please sign in to comment.