Skip to content

Commit

Permalink
[rocksdb] more config updates to improve write throughput (MystenLabs…
Browse files Browse the repository at this point in the history
…#10952)

## Description 

### Blob storage for transactions and effects

Sui transactions and effects are at least 300B and 600B respectively.
The minimum Narwhal payload size is similar to that of Sui transactions.
Narwhal headers and certificates are 2KB ~ 3KB with 100 nodes. Currently
they are all stored as values in SST files, so they are read and often
rewritten each time a file is compacted, even though the values never
change. This overhead can be reduced by using RocksDB blob storage:
https://rocksdb.org/blog/2021/05/26/integrated-blob-db.html.

According to the blog post above, the additional lookup cost appears to
be minimal. Also, transactions and effects are accessed most often while
they are recent and still in memtables.

### Increase block size to 16KiB

The default block size is increased from 4KiB to 16KiB, which seems to be
the production setting at Facebook. This should help reduce metadata
size, but may increase read amplification and block cache load. For
tables optimized for point lookup, the block size is still 4KiB.

### Other changes

The default write buffer size and base level target size are restored to
their previous default values, i.e. what they were before
3ca4d30. They are now increased only for
tables optimized for write throughput.

## Test Plan 

Verified on private testnet, with shared counter and batch workloads.

---
If your changes are not user-facing and not a breaking change, you can
skip the following section. Otherwise, please indicate what changed, and
then add to the Release Notes section as highlighted during the release
process.

### Type of Change (Check all that apply)

- [ ] user-visible impact
- [ ] breaking change for client SDKs
- [ ] breaking change for FNs (FN binary must upgrade)
- [ ] breaking change for validators or node operators (must upgrade
binaries)
- [ ] breaking change for on-chain data layout
- [ ] necessitates either a data wipe or data migration

### Release notes
  • Loading branch information
mwtian authored Apr 17, 2023
1 parent 33895bb commit c0e15b1
Show file tree
Hide file tree
Showing 6 changed files with 197 additions and 110 deletions.
35 changes: 28 additions & 7 deletions crates/sui-core/src/authority/authority_per_epoch_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ pub struct AuthorityPerEpochStore {
#[derive(DBMapUtils)]
pub struct AuthorityEpochTables {
/// This is a map between the transaction digest and the transactions found in the `transaction_lock`.
#[default_options_override_fn = "transactions_table_default_config"]
#[default_options_override_fn = "signed_transactions_table_default_config"]
signed_transactions:
DBMap<TransactionDigest, TrustedEnvelope<SenderSignedData, AuthoritySignInfo>>,

Expand Down Expand Up @@ -206,6 +206,7 @@ pub struct AuthorityEpochTables {
/// progress. But it is more complex, because it would be necessary to track inflight
/// executions not ordered by indices. For now, tracking inflight certificates as a map
/// seems easier.
#[default_options_override_fn = "pending_execution_table_default_config"]
pending_execution: DBMap<TransactionDigest, TrustedExecutableTransaction>,

/// Track which transactions have been processed in handle_consensus_transaction. We must be
Expand All @@ -219,6 +220,7 @@ pub struct AuthorityEpochTables {
consensus_message_processed: DBMap<SequencedConsensusTransactionKey, bool>,

/// Map stores pending transactions that this authority submitted to consensus
#[default_options_override_fn = "pending_consensus_transactions_table_default_config"]
pending_consensus_transactions: DBMap<ConsensusTransactionKey, ConsensusTransaction>,

// todo - this table will be deleted after switch to EpochFlag::InMemoryCheckpointRoots
Expand Down Expand Up @@ -264,6 +266,7 @@ pub struct AuthorityEpochTables {
/// the sequence number of checkpoint does not match height here.
///
/// The boolean value indicates whether this is the last checkpoint of the epoch.
#[default_options_override_fn = "pending_checkpoints_table_default_config"]
pending_checkpoints: DBMap<CheckpointCommitHeight, PendingCheckpoint>,

/// Checkpoint builder maintains internal list of transactions it included in checkpoints here
Expand Down Expand Up @@ -297,6 +300,30 @@ pub struct AuthorityEpochTables {
override_protocol_upgrade_buffer_stake: DBMap<u64, u64>,
}

/// Options override for the `signed_transactions` table, referenced by that
/// field's `default_options_override_fn` attribute.
///
/// Starts from `default_db_options()` and applies the write-throughput
/// tuning first, then the large-values / no-scan tuning.
fn signed_transactions_table_default_config() -> DBOptions {
    let write_tuned = default_db_options().optimize_for_write_throughput();
    write_tuned.optimize_for_large_values_no_scan()
}

/// Options override for the `pending_execution` table, referenced by that
/// field's `default_options_override_fn` attribute.
///
/// Starts from `default_db_options()` and applies the write-throughput
/// tuning first, then the large-values / no-scan tuning.
fn pending_execution_table_default_config() -> DBOptions {
    let write_tuned = default_db_options().optimize_for_write_throughput();
    write_tuned.optimize_for_large_values_no_scan()
}

/// Options override for the `pending_consensus_transactions` table,
/// referenced by that field's `default_options_override_fn` attribute.
///
/// Starts from `default_db_options()` and applies the write-throughput
/// tuning first, then the large-values / no-scan tuning.
fn pending_consensus_transactions_table_default_config() -> DBOptions {
    let write_tuned = default_db_options().optimize_for_write_throughput();
    write_tuned.optimize_for_large_values_no_scan()
}

/// Options override for the `pending_checkpoints` table, referenced by that
/// field's `default_options_override_fn` attribute.
///
/// Starts from `default_db_options()` and applies the write-throughput
/// tuning first, then the large-values / no-scan tuning.
fn pending_checkpoints_table_default_config() -> DBOptions {
    let write_tuned = default_db_options().optimize_for_write_throughput();
    write_tuned.optimize_for_large_values_no_scan()
}

impl AuthorityEpochTables {
pub fn open(epoch: EpochId, parent_path: &Path, db_options: Option<Options>) -> Self {
Self::open_tables_transactional(
Expand Down Expand Up @@ -1983,12 +2010,6 @@ impl AuthorityPerEpochStore {
}
}

/// Options override for per-epoch transaction tables.
///
/// Starts from `default_db_options()`, applies the point-lookup tuning with
/// an argument of 128, and then the write-throughput tuning, in that order.
fn transactions_table_default_config() -> DBOptions {
    let lookup_tuned = default_db_options().optimize_for_point_lookup(128);
    lookup_tuned.optimize_for_write_throughput()
}

impl ExecutionComponents {
fn new(
protocol_config: &ProtocolConfig,
Expand Down
3 changes: 3 additions & 0 deletions crates/sui-core/src/authority/authority_store_tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,7 @@ fn objects_table_default_config() -> DBOptions {
fn transactions_table_default_config() -> DBOptions {
default_db_options()
.optimize_for_write_throughput()
.optimize_for_large_values_no_scan()
.optimize_for_point_lookup(
read_size_from_env(ENV_VAR_TRANSACTIONS_BLOCK_CACHE_SIZE).unwrap_or(512),
)
Expand All @@ -439,6 +440,7 @@ fn transactions_table_default_config() -> DBOptions {
fn effects_table_default_config() -> DBOptions {
default_db_options()
.optimize_for_write_throughput()
.optimize_for_large_values_no_scan()
.optimize_for_point_lookup(
read_size_from_env(ENV_VAR_EFFECTS_BLOCK_CACHE_SIZE).unwrap_or(1024),
)
Expand All @@ -447,6 +449,7 @@ fn effects_table_default_config() -> DBOptions {
/// Options for the events table.
///
/// Starts from `default_db_options()`, applies the write-throughput and
/// large-values / no-scan tuning, and finally the read tuning. The read
/// tuning argument is taken from `ENV_VAR_EVENTS_BLOCK_CACHE_SIZE` via
/// `read_size_from_env`, falling back to 1024 when no value is available.
fn events_table_default_config() -> DBOptions {
    // Build the write-side tuning first so the environment lookup still
    // happens after it, as in the single chained expression.
    let write_tuned = default_db_options()
        .optimize_for_write_throughput()
        .optimize_for_large_values_no_scan();
    let block_cache_size = read_size_from_env(ENV_VAR_EVENTS_BLOCK_CACHE_SIZE).unwrap_or(1024);
    write_tuned.optimize_for_read(block_cache_size)
}

Expand Down
4 changes: 2 additions & 2 deletions crates/typed-store-derive/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ pub fn derive_dbmap_utils_general(input: TokenStream) -> TokenStream {
};
// Safe to call unwrap because we will have at least one field_name entry in the struct
let rwopt_cfs: std::collections::HashMap<String, typed_store::rocks::ReadWriteOptions> = opt_cfs.iter().map(|q| (q.0.as_str().to_string(), q.1.rw_options.clone())).collect();
let opt_cfs: Vec<_> = opt_cfs.iter().map(|q| (q.0.as_str(), &q.1.options)).collect();
let opt_cfs: Vec<_> = opt_cfs.iter().map(|q| (q.0.as_str(), q.1.options.clone())).collect();
let db = match (as_secondary_with_path, is_transaction) {
(Some(p), _) => typed_store::rocks::open_cf_opts_secondary(path, Some(&p), global_db_options_override, metric_conf, &opt_cfs),
(_, true) => typed_store::rocks::open_cf_opts_transactional(path, global_db_options_override, metric_conf, &opt_cfs),
Expand Down Expand Up @@ -789,7 +789,7 @@ pub fn derive_sallydb_general(input: TokenStream) -> TokenStream {
};
// Safe to call unwrap because we will have at least one field_name entry in the struct
let rwopt_cfs: std::collections::HashMap<String, typed_store::rocks::ReadWriteOptions> = opt_cfs.iter().map(|q| (q.0.as_str().to_string(), q.1.rw_options.clone())).collect();
let opt_cfs: Vec<_> = opt_cfs.iter().map(|q| (q.0.as_str(), &q.1.options)).collect();
let opt_cfs: Vec<_> = opt_cfs.iter().map(|q| (q.0.as_str(), q.1.options.clone())).collect();
let db = match access_type {
RocksDBAccessType::Secondary(Some(p)) => typed_store::rocks::open_cf_opts_secondary(path, Some(&p), global_db_options_override, metric_conf, &opt_cfs),
_ => typed_store::rocks::open_cf_opts(path, global_db_options_override, metric_conf, &opt_cfs)
Expand Down
Loading

0 comments on commit c0e15b1

Please sign in to comment.