Skip to content

Commit

Permalink
[forge] Add three_region_simulation_with_different_node_speed (aptos-…
Browse files Browse the repository at this point in the history
…labs#5576)

Add a test where, on top of realistic network latencies, we add more realistic heterogeneous processing speed.

Among other things, this tests that chain-health backoff works correctly, and we don't experience any chain pauses.
  • Loading branch information
igor-aptos authored Nov 22, 2022
1 parent 37be68b commit dc5ef1f
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: Continuous E2E Network Latency Test With Different Node Speed

permissions:
issues: write
pull-requests: write

on:
workflow_dispatch:
schedule:
# Scheduled at minute 0 of every 8th hour
- cron: "0 */8 * * *"

jobs:
# Test under suboptimal circumstances (network delay and different node processing speed)
run-forge-three-region:
uses: ./.github/workflows/run-forge.yaml
secrets: inherit
with:
FORGE_NAMESPACE: forge-three-region-with-different-node-speed
# Run for 1 hour (3600 seconds)
FORGE_RUNNER_DURATION_SECS: 3600
# Pre release has chaos applied
# NOTE(review): the comment above looks carried over from another workflow - confirm it applies to this suite
FORGE_TEST_SUITE: three_region_simulation_with_different_node_speed
POST_TO_SLACK: true
FORGE_ENABLE_FAILPOINTS: true
1 change: 1 addition & 0 deletions aptos-move/aptos-vm/src/aptos_vm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1093,6 +1093,7 @@ impl VMAdapter for AptosVM {
(vm_status, output, Some("waypoint_write_set".to_string()))
}
PreprocessedTransaction::UserTransaction(txn) => {
fail_point!("aptos_vm::execution::user_transaction");
let sender = txn.sender().to_string();
let _timer = TXN_TOTAL_SECONDS.start_timer();
let (vm_status, output) =
Expand Down
43 changes: 40 additions & 3 deletions testsuite/forge-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use testcases::performance_with_fullnode_test::PerformanceBenchmarkWithFN;
use testcases::state_sync_performance::{
StateSyncFullnodeFastSyncPerformance, StateSyncValidatorPerformance,
};
use testcases::three_region_simulation_test::ThreeRegionSimulationTest;
use testcases::three_region_simulation_test::{ExecutionDelayConfig, ThreeRegionSimulationTest};
use testcases::twin_validator_test::TwinValidatorTest;
use testcases::validator_join_leave_test::ValidatorJoinLeaveTest;
use testcases::validator_reboot_stress_test::ValidatorRebootStressTest;
Expand Down Expand Up @@ -473,9 +473,44 @@ fn single_test_suite(test_name: &str) -> Result<ForgeConfig<'static>> {
.with_initial_validator_count(NonZeroUsize::new(12).unwrap())
.with_initial_fullnode_count(12)
.with_emit_job(EmitJobRequest::default().mode(EmitJobMode::ConstTps { tps: 5000 }))
.with_network_tests(vec![&ThreeRegionSimulationTest])
.with_network_tests(vec![&ThreeRegionSimulationTest {
add_execution_delay: None,
}])
// TODO(rustielin): tune these success criteria after we have a better idea of the test behavior
.with_success_criteria(SuccessCriteria::new(3000, 100000, true, None, None, None)),
"three_region_simulation_with_different_node_speed" => config
.with_initial_validator_count(NonZeroUsize::new(30).unwrap())
.with_initial_fullnode_count(30)
.with_emit_job(EmitJobRequest::default().mode(EmitJobMode::ConstTps { tps: 5000 }))
.with_network_tests(vec![&ThreeRegionSimulationTest {
add_execution_delay: Some(ExecutionDelayConfig {
inject_delay_node_fraction: 0.5,
inject_delay_max_transaction_percentage: 40,
inject_delay_per_transaction_ms: 2,
}),
}])
.with_node_helm_config_fn(Arc::new(move |helm_values| {
helm_values["validator"]["config"]["api"]["failpoints_enabled"] = true.into();
// helm_values["validator"]["config"]["consensus"]["max_sending_block_txns"] =
// 4000.into();
// helm_values["validator"]["config"]["consensus"]["max_sending_block_bytes"] =
// 1000000.into();
helm_values["fullnode"]["config"]["state_sync"]["state_sync_driver"]
["bootstrapping_mode"] = "ExecuteTransactionsFromGenesis".into();
helm_values["fullnode"]["config"]["state_sync"]["state_sync_driver"]
["continuous_syncing_mode"] = "ExecuteTransactions".into();
}))
.with_success_criteria(SuccessCriteria::new(
1000,
100000,
true,
None,
None,
Some(StateProgressThreshold {
max_no_progress_secs: 30.0,
max_round_gap: 10,
}),
)),
"network_bandwidth" => config
.with_initial_validator_count(NonZeroUsize::new(8).unwrap())
.with_network_tests(vec![&NetworkBandwidthTest]),
Expand Down Expand Up @@ -907,7 +942,9 @@ fn chaos_test_suite(duration: Duration) -> ForgeConfig<'static> {
.with_initial_validator_count(NonZeroUsize::new(30).unwrap())
.with_network_tests(vec![
&NetworkBandwidthTest,
&ThreeRegionSimulationTest,
&ThreeRegionSimulationTest {
add_execution_delay: None,
},
&NetworkLossTest,
])
.with_success_criteria(SuccessCriteria::new(
Expand Down
97 changes: 94 additions & 3 deletions testsuite/testcases/src/three_region_simulation_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,30 @@
use crate::{LoadDestination, NetworkLoadTest};
use aptos_logger::info;
use forge::{
GroupNetworkDelay, NetworkContext, NetworkTest, Swarm, SwarmChaos, SwarmNetworkBandwidth,
SwarmNetworkDelay, Test,
GroupNetworkDelay, NetworkContext, NetworkTest, Swarm, SwarmChaos, SwarmExt,
SwarmNetworkBandwidth, SwarmNetworkDelay, Test,
};
use rand::Rng;
use tokio::runtime::Runtime;

pub struct ThreeRegionSimulationTest;
/// Config for adding variable processing overhead/delay into
/// execution, to make different nodes have different processing speed.
pub struct ExecutionDelayConfig {
/// Fraction (0.0 - 1.0) of nodes on which any delay will be introduced.
pub inject_delay_node_fraction: f64,
/// For nodes with delay, what percentage (0-100) of transactions will be delayed.
/// (This is needed because the delay that can be introduced is an integer number of ms.)
/// Different node speeds come from this setting: each delayed node is assigned a
/// randomly chosen percentage starting at 1 and bounded by this maximum.
pub inject_delay_max_transaction_percentage: u32,
/// Fixed busy-loop delay, in milliseconds, applied to each transaction that is delayed,
/// before it is executed.
pub inject_delay_per_transaction_ms: u32,
}

/// Network load test that injects network delay and bandwidth-limit chaos into the
/// swarm, optionally combined with a per-validator execution delay.
pub struct ThreeRegionSimulationTest {
/// When `Some`, `setup` injects the configured execution delay on the validators and
/// `finish` removes it again; when `None`, no execution delay is injected.
pub add_execution_delay: Option<ExecutionDelayConfig>,
}

impl Test for ThreeRegionSimulationTest {
fn name(&self) -> &'static str {
Expand Down Expand Up @@ -99,6 +118,70 @@ fn create_bandwidth_limit() -> SwarmNetworkBandwidth {
}
}

/// Injects an artificial execution delay into a random subset of validators.
///
/// Each validator is independently selected with probability
/// `config.inject_delay_node_fraction`. A selected validator is assigned a percentage
/// drawn from `[1, config.inject_delay_max_transaction_percentage)` (the upper bound is
/// exclusive); an unselected validator is assigned 0. The
/// `aptos_vm::execution::user_transaction` failpoint is then set to
/// `"<percentage>%delay(<ms>)"` on every validator, where `<ms>` is
/// `config.inject_delay_per_transaction_ms`.
///
/// # Errors
///
/// Returns an error if the config values are out of range, if the tokio runtime cannot
/// be created, or if setting the failpoint fails on any validator (validators after the
/// failing one are left untouched).
fn add_execution_delay(swarm: &mut dyn Swarm, config: &ExecutionDelayConfig) -> anyhow::Result<()> {
    // `gen_bool` panics for probabilities outside [0, 1] and `gen_range` panics on an
    // empty range, so reject such configs with an error instead of aborting the test run.
    anyhow::ensure!(
        (0.0..=1.0).contains(&config.inject_delay_node_fraction),
        "inject_delay_node_fraction must be within [0.0, 1.0], got {}",
        config.inject_delay_node_fraction
    );
    anyhow::ensure!(
        config.inject_delay_max_transaction_percentage > 1,
        "inject_delay_max_transaction_percentage must be greater than 1, got {}",
        config.inject_delay_max_transaction_percentage
    );

    let runtime = Runtime::new()?;
    let validators = swarm.get_validator_clients_with_names();

    runtime.block_on(async {
        let mut rng = rand::thread_rng();
        for (name, validator) in validators {
            // 0 means "never delay": the failpoint is still set, but it never fires.
            let sleep_fraction = if rng.gen_bool(config.inject_delay_node_fraction) {
                rng.gen_range(1_u32, config.inject_delay_max_transaction_percentage)
            } else {
                0
            };
            info!(
                "Validator {} adding {}% of transactions with {}ms execution delay",
                name, sleep_fraction, config.inject_delay_per_transaction_ms
            );
            validator
                .set_failpoint(
                    "aptos_vm::execution::user_transaction".to_string(),
                    format!(
                        "{}%delay({})",
                        sleep_fraction, config.inject_delay_per_transaction_ms
                    ),
                )
                .await
                .map_err(|e| {
                    anyhow::anyhow!(
                        "set_failpoint to add execution delay on {} failed, {:?}",
                        name,
                        e
                    )
                })?;
        }
        Ok::<(), anyhow::Error>(())
    })
}

/// Turns off the execution-delay failpoint on every validator.
///
/// Disables `aptos_vm::execution::user_transaction`, the failpoint that
/// `add_execution_delay` configures, by setting it to `"off"`.
///
/// # Errors
///
/// Returns an error if the tokio runtime cannot be created or if resetting the
/// failpoint fails on any validator (validators after the failing one are left
/// untouched).
fn remove_execution_delay(swarm: &mut dyn Swarm) -> anyhow::Result<()> {
    let runtime = Runtime::new()?;
    let validators = swarm.get_validator_clients_with_names();

    runtime.block_on(async {
        for (name, validator) in validators {
            validator
                .set_failpoint(
                    // Must match the failpoint name used when the delay was injected;
                    // turning off any other failpoint leaves the delay active.
                    "aptos_vm::execution::user_transaction".to_string(),
                    "off".to_string(),
                )
                .await
                .map_err(|e| {
                    anyhow::anyhow!(
                        "set_failpoint to remove execution delay on {} failed, {:?}",
                        name,
                        e
                    )
                })?;
        }
        Ok::<(), anyhow::Error>(())
    })
}

impl NetworkLoadTest for ThreeRegionSimulationTest {
fn setup(&self, ctx: &mut NetworkContext) -> anyhow::Result<LoadDestination> {
// inject network delay
Expand All @@ -111,10 +194,18 @@ impl NetworkLoadTest for ThreeRegionSimulationTest {
let chaos = SwarmChaos::Bandwidth(bandwidth);
ctx.swarm().inject_chaos(chaos)?;

if let Some(config) = &self.add_execution_delay {
add_execution_delay(ctx.swarm(), config)?;
}

Ok(LoadDestination::AllNodes)
}

/// Tears down what this test injected into the swarm.
///
/// Removes the execution-delay failpoint when one was configured, then removes all
/// chaos from the swarm. Chaos removal is attempted even if failpoint removal fails,
/// so one failing cleanup step does not leave the other injection in place. If both
/// steps fail, the failpoint-removal error is returned.
fn finish(&self, swarm: &mut dyn Swarm) -> anyhow::Result<()> {
    let delay_removed = if self.add_execution_delay.is_some() {
        remove_execution_delay(swarm)
    } else {
        Ok(())
    };

    // Evaluated unconditionally: an early return above would skip chaos cleanup.
    let chaos_removed = swarm.remove_all_chaos();

    delay_removed.and(chaos_removed)
}
}
Expand Down

0 comments on commit dc5ef1f

Please sign in to comment.