From 50f9c7492ae04500e0b6b0df92c13f794505de86 Mon Sep 17 00:00:00 2001
From: Brandon Williams
Date: Sun, 5 Dec 2021 11:32:28 -0800
Subject: [PATCH] remove cluster test

---
 .../actions/land-blocking/cti-codebuild.sh | 29 -
 .../actions/land-blocking/find-lbt-images.sh | 4 +-
 .github/dependabot.yml | 8 -
 .github/workflows/ci-post-land.yml | 74 --
 .github/workflows/ci-test.yml | 19 +-
 .github/workflows/daily.yml | 36 -
 Cargo.lock | 104 --
 Cargo.toml | 1 -
 .../src/persistent_safety_storage.rs | 2 +-
 docker/build-aws.sh | 11 +-
 docker/build-common.sh | 1 -
 docker/build-push-local.sh | 2 +-
 docker/build_push.sh | 2 +-
 docker/cluster-test-util/Dockerfile | 9 -
 docker/cluster-test-util/build-and-push.sh | 11 -
 docker/cluster-test/Dockerfile | 37 -
 docker/cluster-test/build.sh | 8 -
 docker/cluster-test/buildspec.yaml | 28 -
 docker/docker_republish.sh | 2 +-
 docker/tag-and-push.sh | 2 +-
 language/benchmarks/README.md | 3 -
 scripts/cluster_test_pod_template.yaml | 30 -
 scripts/ct-k8s-status.sh | 19 -
 scripts/cti | 293 ------
 scripts/dockerhub_prune.sh | 1 -
 state-sync/state-sync-v1/src/counters.rs | 3 -
 testsuite/cluster-test/Cargo.toml | 76 --
 testsuite/cluster-test/README.md | 31 -
 testsuite/cluster-test/src/aws.rs | 127 ---
 testsuite/cluster-test/src/cluster.rs | 346 -------
 testsuite/cluster-test/src/cluster_builder.rs | 617 ------------
 .../src/cluster_swarm/cluster_swarm_kube.rs | 905 ------------------
 .../src/cluster_swarm/configs/fullnode.yaml | 73 --
 .../cluster_swarm/configs/safetyrules.yaml | 9 -
 .../src/cluster_swarm/configs/validator.yaml | 70 --
 .../cluster_swarm/fluent-bit/fluent-bit.conf | 39 -
 .../src/cluster_swarm/fluent-bit/parsers.conf | 5 -
 .../cluster-test/src/cluster_swarm/mod.rs | 24 -
 .../templates/diem_node_service_template.yaml | 30 -
 .../templates/diem_node_spec_template.yaml | 93 --
 .../cluster_swarm/templates/job_template.yaml | 44 -
 .../templates/lsr_service_template.yaml | 19 -
 .../templates/lsr_spec_template.yaml | 73 --
 .../templates/vault_service_template.yaml | 25 -
 .../templates/vault_spec_template.yaml | 134 ---
 testsuite/cluster-test/src/effects/mod.rs | 29 -
 .../cluster-test/src/effects/network_delay.rs | 112 ---
 .../cluster-test/src/effects/packet_loss.rs | 51 -
 .../src/effects/stop_validator.rs | 41 -
 .../src/experiments/accurate_measurment.rs | 114 ---
 .../src/experiments/compatibility_test.rs | 422 --------
 .../src/experiments/cpu_flamegraph.rs | 120 ---
 .../cluster-test/src/experiments/load_test.rs | 518 ----------
 testsuite/cluster-test/src/experiments/mod.rs | 167 ----
 .../packet_loss_random_validators.rs | 94 --
 .../src/experiments/performance_benchmark.rs | 312 ------
 ...mance_benchmark_three_region_simulation.rs | 100 --
 .../src/experiments/reboot_cluster.rs | 66 --
 .../experiments/reboot_random_validators.rs | 116 ---
 .../src/experiments/reconfiguration_test.rs | 246 -----
 .../src/experiments/recovery_time.rs | 119 ---
 .../src/experiments/state_sync_performance.rs | 185 ----
 .../src/experiments/twin_validator.rs | 166 ----
 .../src/experiments/versioning_test.rs | 227 -----
 testsuite/cluster-test/src/genesis_helper.rs | 396 --------
 .../cluster-test/src/health/commit_check.rs | 106 --
 .../src/health/debug_interface_log_tail.rs | 136 ---
 .../cluster-test/src/health/fullnode_check.rs | 83 --
 .../cluster-test/src/health/liveness_check.rs | 83 --
 testsuite/cluster-test/src/health/log_tail.rs | 62 --
 testsuite/cluster-test/src/health/mod.rs | 255 -----
 testsuite/cluster-test/src/instance.rs | 454 ---------
 testsuite/cluster-test/src/lib.rs | 75 --
 testsuite/cluster-test/src/main.rs | 877 -----------------
 testsuite/cluster-test/src/prometheus.rs | 212 ----
 testsuite/cluster-test/src/report.rs | 87 --
 testsuite/cluster-test/src/stats.rs | 47 -
 testsuite/cluster-test/src/suite.rs | 169 ----
 testsuite/diem-swarm/src/main.rs | 24 -
 testsuite/forge/src/github.rs | 2 +-
 testsuite/smoke-test/src/workspace_builder.rs | 13 +-
 x.toml | 1 -
 82 files changed, 11 insertions(+), 9755 deletions(-)
 delete mode 100755 .github/actions/land-blocking/cti-codebuild.sh
 delete mode 100644 docker/cluster-test-util/Dockerfile
 delete mode 100755 docker/cluster-test-util/build-and-push.sh
 delete mode 100644 docker/cluster-test/Dockerfile
 delete mode 100755 docker/cluster-test/build.sh
 delete mode 100644 docker/cluster-test/buildspec.yaml
 delete mode 100644 scripts/cluster_test_pod_template.yaml
 delete mode 100755 scripts/ct-k8s-status.sh
 delete mode 100755 scripts/cti
 delete mode 100644 testsuite/cluster-test/Cargo.toml
 delete mode 100644 testsuite/cluster-test/README.md
 delete mode 100644 testsuite/cluster-test/src/aws.rs
 delete mode 100644 testsuite/cluster-test/src/cluster.rs
 delete mode 100644 testsuite/cluster-test/src/cluster_builder.rs
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/cluster_swarm_kube.rs
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/configs/fullnode.yaml
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/configs/safetyrules.yaml
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/configs/validator.yaml
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/fluent-bit/fluent-bit.conf
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/fluent-bit/parsers.conf
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/mod.rs
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/templates/diem_node_service_template.yaml
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/templates/diem_node_spec_template.yaml
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/templates/job_template.yaml
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/templates/lsr_service_template.yaml
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/templates/lsr_spec_template.yaml
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/templates/vault_service_template.yaml
 delete mode 100644 testsuite/cluster-test/src/cluster_swarm/templates/vault_spec_template.yaml
 delete mode 100644 testsuite/cluster-test/src/effects/mod.rs
 delete mode 100644 testsuite/cluster-test/src/effects/network_delay.rs
 delete mode 100644 testsuite/cluster-test/src/effects/packet_loss.rs
 delete mode 100644 testsuite/cluster-test/src/effects/stop_validator.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/accurate_measurment.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/compatibility_test.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/cpu_flamegraph.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/load_test.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/mod.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/packet_loss_random_validators.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/performance_benchmark.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/performance_benchmark_three_region_simulation.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/reboot_cluster.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/reboot_random_validators.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/reconfiguration_test.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/recovery_time.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/state_sync_performance.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/twin_validator.rs
 delete mode 100644 testsuite/cluster-test/src/experiments/versioning_test.rs
 delete mode 100644 testsuite/cluster-test/src/genesis_helper.rs
 delete mode 100644 testsuite/cluster-test/src/health/commit_check.rs
 delete mode 100644 testsuite/cluster-test/src/health/debug_interface_log_tail.rs
 delete mode 100644 testsuite/cluster-test/src/health/fullnode_check.rs
 delete mode 100644 testsuite/cluster-test/src/health/liveness_check.rs
 delete mode 100644 testsuite/cluster-test/src/health/log_tail.rs
 delete mode 100644 testsuite/cluster-test/src/health/mod.rs
 delete mode 100644 testsuite/cluster-test/src/instance.rs
 delete mode 100644 testsuite/cluster-test/src/lib.rs
 delete mode 100644 testsuite/cluster-test/src/main.rs
 delete mode 100644 testsuite/cluster-test/src/prometheus.rs
 delete mode 100644 testsuite/cluster-test/src/report.rs
 delete mode 100644 testsuite/cluster-test/src/stats.rs
 delete mode 100644 testsuite/cluster-test/src/suite.rs

diff --git a/.github/actions/land-blocking/cti-codebuild.sh b/.github/actions/land-blocking/cti-codebuild.sh
deleted file mode 100755
index bebd52e9fbf2a..0000000000000
--- a/.github/actions/land-blocking/cti-codebuild.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Copyright (c) The Diem Core Contributors
-# SPDX-License-Identifier: Apache-2.0
-
-# Expects these environment variables
-if [ -z $VERSION ] || [ -z $ADDL_TAG ]; then
- echo "Must specify image VERSION and ADDL_TAG to build"
- exit 1
-fi
-
-set +e
-date
-RETRYABLE_EXIT_CODE=2
-for ((i = 0; i < 3; i++)); do
- echo "Build attempt $i"
- docker/build-aws.sh --build-all-cti --version $VERSION --addl_tags canary,${ADDL_TAG}
- return_code=$?
- if [[ $return_code -eq 0 ]]; then - echo "Build successful" - exit 0 - fi - if [[ $return_code -ne ${RETRYABLE_EXIT_CODE} ]]; then - echo "Build failed" - exit 1 - fi - echo "Retrying build" -done -echo "Build failed after retries" -exit 1 diff --git a/.github/actions/land-blocking/find-lbt-images.sh b/.github/actions/land-blocking/find-lbt-images.sh index 99d90d8bd5e62..1a988e244fac7 100755 --- a/.github/actions/land-blocking/find-lbt-images.sh +++ b/.github/actions/land-blocking/find-lbt-images.sh @@ -2,9 +2,7 @@ # Copyright (c) The Diem Core Contributors # SPDX-License-Identifier: Apache-2.0 -# adapted from --build-all-cti option from diem/docker/build-aws.sh, which is how -# land blocking test builds images before running cluster-test -REPOS=(diem/validator diem/cluster_test diem/init diem/validator_tcb) +REPOS=(diem/validator diem/init diem/validator_tcb) # the number of commits backwards we want to look END=50 diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4aefd5a9abe9f..770e7cc9787b9 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -58,14 +58,6 @@ updates: - "docker" - "dependencies" - - package-ecosystem: "docker" - directory: "/docker/cluster-test" - schedule: - interval: "daily" - labels: - - "docker" - - "dependencies" - - package-ecosystem: "docker" directory: "/docker/init" schedule: diff --git a/.github/workflows/ci-post-land.yml b/.github/workflows/ci-post-land.yml index 1e4689e317208..2b22bf70a1e12 100644 --- a/.github/workflows/ci-post-land.yml +++ b/.github/workflows/ci-post-land.yml @@ -150,78 +150,6 @@ jobs: run: | docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} - run-cluster-test-pre-release-suite: - needs: prepare - name: Run the pre-release suite of Cluster Test - runs-on: self-hosted - if: ${{ startsWith(needs.prepare.outputs.changes-target-branch, 'release') }} - # The pre-release suite run time varies 1~1.5 hr. - timeout-minutes: 120 - steps: - - uses: actions/checkout@v2.4.0 - with: - fetch-depth: 0 #get all the history!!! - - name: set_env - id: set_env - run: | - HEAD_GIT_REV=$(git rev-parse --short=8 HEAD) - echo "HEAD_GIT_REV=$HEAD_GIT_REV" >> $GITHUB_ENV - IMAGE_TAG=$(echo ${GITHUB_REF#refs/heads/})_$HEAD_GIT_REV - echo $IMAGE_TAG - echo "IMAGE_TAG=$IMAGE_TAG" >> $GITHUB_ENV - - name: poll_images - # Poll until images are ready - env: - AWS_REGION: us-west-2 - RETRIES: 30 - run: | - set +e - retry=0 - status=1 - while [[ $status != 0 && $retry -lt $RETRIES ]]; do - status=0 - for image in diem/validator diem/validator_tcb diem/init diem/cluster_test; do - aws ecr describe-images --region $AWS_REGION --repository-name $image --image-ids=imageTag=$IMAGE_TAG - status=$((status + $?)) - done - retry=$((retry + 1)) - if [[ $status != 0 ]] ; then - echo "CI has not pushed all images to ECR." - echo "Wait $((1*$retry)) of $((1*$RETRIES)) minutes before retry." - sleep 1m - fi - done - exit $status - - name: Run Cluster Test - run: | - date - BASE_GIT_REV=$(git rev-parse $HEAD_GIT_REV^) - ./scripts/cti --tag $IMAGE_TAG --timeout-secs 7200 \ - --env SLACK_CHANGELOG_URL=${{ secrets.WEBHOOK_CHANGELOG }} \ - --changelog $BASE_GIT_REV $HEAD_GIT_REV \ - --suite pre_release - - name: Push alert - if: ${{ failure() }} - run: | - jq -n \ - --arg msg "*${{ github.job }}* job in ${{ github.workflow }} workflow failed with $IMAGE_TAG." 
\ - --arg url "https://github.com/${{ github.repository }}/actions/runs/${{github.run_id}}" \ - '{ - "attachments": [ - { - "text": $msg, - "actions": [ - { - "type": "button", - "text": "Visit Job", - "url": $url - } - ] - } - ] - }' > /tmp/payload - curl -X POST -H 'Content-type: application/json' -d @/tmp/payload ${{ secrets.WEBHOOK_PUSH }} - run-forge-test-pre-release-suite: needs: prepare name: Run the pre-release suite of Forge Test @@ -347,7 +275,6 @@ jobs: docker/build_push.sh -u -p -b ${BRANCH} -n tools || success=$(echo "tools" >> "${tmpfile}"; echo 1) docker/build_push.sh -u -p -b ${BRANCH} -n validator || success=$(echo "validator" >> "${tmpfile}"; echo 1) docker/build_push.sh -u -p -b ${BRANCH} -n validator-tcb || success=$(echo "validator-tcb" >> "${tmpfile}"; echo 1) - docker/build_push.sh -u -p -b ${BRANCH} -n cluster-test || success=$(echo "cluster-test" >> "${tmpfile}"; echo 1) docker/build_push.sh -u -p -b ${BRANCH} -n forge || success=$(echo "forge" >> "${tmpfile}"; echo 1) if [[ "$success" == "1" ]]; then cat "${tmpfile}" @@ -369,7 +296,6 @@ jobs: docker/build_push.sh -u -b ${BRANCH} -n tools || success=$(echo "tools" >> "${tmpfile}"; echo 1) docker/build_push.sh -u -b ${BRANCH} -n validator || success=$(echo "validator" >> "${tmpfile}"; echo 1) docker/build_push.sh -u -b ${BRANCH} -n validator-tcb || success=$(echo "validator-tcb" >> "${tmpfile}"; echo 1) - docker/build_push.sh -u -b ${BRANCH} -n cluster-test || success=$(echo "cluster-test" >> "${tmpfile}"; echo 1) docker/build_push.sh -u -b ${BRANCH} -n forge || success=$(echo "forge" >> "${tmpfile}"; echo 1) if [[ "$success" == "1" ]]; then cat "${tmpfile}" diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index 2bcccfca537fe..dd99db1456034 100644 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -179,7 +179,7 @@ jobs: strategy: matrix: target_images: - [client faucet cluster-test forge, init tools validator validator-tcb] + [client faucet forge, init tools validator validator-tcb] steps: - uses: actions/checkout@v2.4.0 with: @@ -290,22 +290,6 @@ jobs: echo "PREV_TAG=$compat_prev_tag" >> $GITHUB_ENV echo "BUILD_PREV=0" >> $GITHUB_ENV fi - - name: build extra images - id: build-extra-images - run: | - res=land_$BASE_GIT_REV - if [ $BUILD_PREV -eq 1 ]; then - compat_prev_tag=$res - echo "Starting codebuild for $compat_prev_tag" - VERSION=$BASE_GIT_REV ADDL_TAG=$compat_prev_tag .github/actions/land-blocking/cti-codebuild.sh &> codebuild-prev.log & - prev_build_pid=$! - wait $prev_build_pid - echo "====== codebuild-prev.log start ======" - cat codebuild-prev.log - else - res=$PREV_TAG; - fi - echo "::set-output name=prev-tag::$(echo $res)"; - name: Early terminate workflow if: ${{ failure() }} uses: ./.github/actions/early-terminator @@ -959,7 +943,6 @@ jobs: # On `push` this value will be empty and will "do-the-right-thing" ref: ${{ github.event.pull_request.head.sha }} - name: Launch forge test - # NOTE Remember to update PR comment payload if cti cmd is updated. 
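The two removed jobs above share one idiom: a bounded retry loop that distinguishes retryable failures from fatal ones (cti-codebuild.sh retries only on its dedicated RETRYABLE_EXIT_CODE of 2; poll_images re-polls ECR once a minute, up to 30 times). A minimal Rust sketch of that shape, with hypothetical names, not code from this repository:

```rust
use std::{thread, time::Duration};

/// Outcome of a single attempt, mirroring cti-codebuild.sh's convention
/// of a dedicated retryable exit code (2) versus any other failure.
enum Attempt {
    Done,
    Retryable,
    Fatal,
}

/// Bounded retry: stop early on success or a fatal error, sleep between
/// retryable failures, give up after max_attempts.
fn retry(max_attempts: u32, delay: Duration, mut attempt: impl FnMut(u32) -> Attempt) -> bool {
    for i in 0..max_attempts {
        match attempt(i) {
            Attempt::Done => return true,
            Attempt::Fatal => return false,
            Attempt::Retryable => {
                if i + 1 < max_attempts {
                    thread::sleep(delay);
                }
            }
        }
    }
    false
}

fn main() {
    // Succeed on the third try, as the CI loop would after two retryable failures.
    let mut tries = 0;
    assert!(retry(3, Duration::from_millis(10), |_| {
        tries += 1;
        if tries < 3 { Attempt::Retryable } else { Attempt::Done }
    }));
    // A fatal (non-retryable) exit code fails immediately.
    assert!(!retry(3, Duration::from_millis(10), |_| Attempt::Fatal));
}
```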
run: | set +e date diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index deeb43d4306ff..6d964f64b9d65 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -100,42 +100,6 @@ jobs: # run: bash <(curl -s https://codecov.io/bash) -f $CODECOV_OUTPUT/lcovhtml/lcov.info -F unittest; #- uses: ./.github/actions/build-teardown - json-rpc-backward-compat-test: - # Test old client from release (prod) and pre-release (rc) branches - # against new server in the main branch through cluster-test's - # json-rpc interface. - runs-on: ubuntu-20.04-xl - container: - image: ghcr.io/diem/diem_build_environment:main - volumes: - - "${{github.workspace}}:/opt/git/diem" - env: - DEVNET_MINT_TEST_KEY: ${{ secrets.DEVNET_MINT_TEST_KEY }} - DEVNET_ENDPOINT: dev.testnet.diem.com - MESSAGE_PAYLOAD_FILE: /tmp/message - strategy: - fail-fast: false - matrix: - release-branch: [release-1.5, release-1.4] - steps: - - uses: actions/checkout@v2.4.0 - with: - ref: ${{ matrix.release-branch }} - - uses: ./.github/actions/build-setup - - name: Run cluster test diag on devnet - run: | - echo ${DEVNET_MINT_TEST_KEY} | base64 -d > /tmp/mint_test.key - RUST_BACKTRACE=full cargo run -p cluster-test -- --diag --swarm --mint-file=/tmp/mint_test.key --peers=dev.testnet.diem.com:80:80 --chain-id=DEVNET > ${MESSAGE_PAYLOAD_FILE} - - name: Run cluster test to submit random txn to devnet - run: | - RUST_BACKTRACE=full cargo run -p cluster-test -- --emit-tx --swarm --mint-file=/tmp/mint_test.key --peers=dev.testnet.diem.com:80:80 --chain-id=DEVNET --accounts-per-client=2 --workers-per-ac=2 --duration=30 >> ${MESSAGE_PAYLOAD_FILE} - - uses: ./.github/actions/slack-file - with: - webhook: ${{ secrets.WEBHOOK_BREAKING_CHANGE }} - payload-file: ${{ env.MESSAGE_PAYLOAD_FILE }} - if: ${{ failure() }} - - uses: ./.github/actions/build-teardown - prover-inconsistency-test: runs-on: ubuntu-20.04-xl container: diff --git a/Cargo.lock b/Cargo.lock index ac3e9954bf390..731685910a310 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -815,71 +815,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "cluster-test" -version = "0.1.0" -dependencies = [ - "anyhow", - "async-trait", - "bcs", - "chrono", - "consensus-types", - "debug-interface", - "diem-client", - "diem-config", - "diem-crypto", - "diem-framework-releases", - "diem-genesis-tool", - "diem-global-constants", - "diem-infallible", - "diem-logger", - "diem-management", - "diem-mempool", - "diem-network-address-encryption", - "diem-node", - "diem-operational-tool", - "diem-rest-client", - "diem-retrier", - "diem-sdk", - "diem-secure-storage", - "diem-swarm", - "diem-temppath", - "diem-time-service", - "diem-transaction-builder", - "diem-types", - "diem-workspace-hack", - "flate2", - "forge", - "futures", - "generate-key", - "hex", - "itertools 0.10.1", - "k8s-openapi", - "kube", - "language-e2e-tests", - "move-core-types", - "network", - "network-builder", - "num_cpus", - "once_cell", - "rand 0.8.4", - "regex", - "reqwest", - "rusoto_autoscaling", - "rusoto_core", - "rusoto_s3", - "rusoto_sts", - "seed-peer-generator", - "serde 1.0.130", - "serde_json", - "serde_yaml", - "state-sync-v1", - "structopt 0.3.21", - "termion", - "tokio", - "toml", -] - [[package]] name = "codespan" version = "0.11.1" @@ -3345,18 +3280,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "398ea4fabe40b9b0d885340a2a991a44c8a645624075ad966d21f88688e2b69e" -[[package]] -name = "flate2" -version = "1.0.20" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" -dependencies = [ - "cfg-if 1.0.0", - "crc32fast", - "libc", - "miniz_oxide", -] - [[package]] name = "fnv" version = "1.0.7" @@ -7077,20 +7000,6 @@ dependencies = [ "syn 1.0.74", ] -[[package]] -name = "rusoto_autoscaling" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7645ce7ec0d2a2f0e2ffdd3f6e04901f70fb396e4d1767b883ebc7361dbf310" -dependencies = [ - "async-trait", - "bytes", - "futures", - "rusoto_core", - "serde_urlencoded 0.6.1", - "xml-rs", -] - [[package]] name = "rusoto_core" version = "0.46.0" @@ -7149,19 +7058,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "rusoto_s3" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abc3f56f14ccf91f880b9a9c2d0556d8523e8c155041c54db155b384a1dd1119" -dependencies = [ - "async-trait", - "bytes", - "futures", - "rusoto_core", - "xml-rs", -] - [[package]] name = "rusoto_signature" version = "0.46.0" diff --git a/Cargo.toml b/Cargo.toml index ade643146d485..2962a60e6def9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -186,7 +186,6 @@ members = [ "storage/storage-service", "testsuite/cli", "testsuite/cli/diem-wallet", - "testsuite/cluster-test", "testsuite/diem-fuzzer", "testsuite/diem-fuzzer/fuzz", "testsuite/diem-swarm", diff --git a/consensus/safety-rules/src/persistent_safety_storage.rs b/consensus/safety-rules/src/persistent_safety_storage.rs index 17b1b78702d19..5b7ba648d12c5 100644 --- a/consensus/safety-rules/src/persistent_safety_storage.rs +++ b/consensus/safety-rules/src/persistent_safety_storage.rs @@ -78,7 +78,7 @@ impl PersistentSafetyStorage { ) -> Result<(), Error> { let result = internal_store.import_private_key(CONSENSUS_KEY, consensus_private_key); // Attempting to re-initialize existing storage. This can happen in environments like - // cluster test. Rather than be rigid here, leave it up to the developer to detect + // forge. Rather than be rigid here, leave it up to the developer to detect // inconsistencies or why they did not reset storage between rounds. Do not repeat the // checks again below, because it is just too strange to have a partially configured // storage. 
diff --git a/docker/build-aws.sh b/docker/build-aws.sh index 11a81b8abfb9a..044f585ca0614 100755 --- a/docker/build-aws.sh +++ b/docker/build-aws.sh @@ -23,16 +23,7 @@ BUILD_PROJECTS=() while [[ "$1" =~ ^- ]]; do case $1 in --build-all ) - BUILD_PROJECTS=(diem-validator diem-cluster-test diem-init diem-faucet diem-safety-rules diem-tools diem-forge) - ;; - # NOTE: This is used in land-blocking job `.github/workflows/ci-test.yml` - # If you change the list of projects to be built for `--build-all-cti`, please - # change the list in `.github/actions/land-blocking/find-lbt-images.sh` as well - --build-all-cti ) - BUILD_PROJECTS=(diem-validator diem-cluster-test diem-init diem-safety-rules) - ;; - --build-cluster-test ) - BUILD_PROJECTS=(diem-cluster-test) + BUILD_PROJECTS=(diem-validator diem-init diem-faucet diem-safety-rules diem-tools diem-forge) ;; --build-validator ) BUILD_PROJECTS=(diem-validator) diff --git a/docker/build-common.sh b/docker/build-common.sh index 538c5f1531536..1941a8258617b 100755 --- a/docker/build-common.sh +++ b/docker/build-common.sh @@ -47,7 +47,6 @@ fi if [ "$IMAGE_TARGETS" = "test" ] || [ "$IMAGE_TARGETS" = "all" ]; then # These non-release binaries are built separately to avoid feature unification issues cargo build --release \ - -p cluster-test \ -p cli \ -p diem-faucet \ -p forge-cli \ diff --git a/docker/build-push-local.sh b/docker/build-push-local.sh index 454fc1fbd3998..06d0a7bc2a68b 100755 --- a/docker/build-push-local.sh +++ b/docker/build-push-local.sh @@ -10,7 +10,7 @@ aws ecr get-login-password \ --username AWS \ --password-stdin "$REPO" -BUILD_PROJECTS=(validator cluster-test init safety-rules) +BUILD_PROJECTS=(validator init safety-rules) TAG=${TAG:-"dev_$(whoami)_$(git rev-parse --short HEAD)"} echo "[$(date)] Using tag $TAG" diff --git a/docker/build_push.sh b/docker/build_push.sh index 5c0e8195cc91f..42fda51bb6e63 100755 --- a/docker/build_push.sh +++ b/docker/build_push.sh @@ -8,7 +8,7 @@ function usage { echo "build_push.sh [-p] -g -b -n [-u]" echo "-p indicates this a prebuild, where images are built and pushed to dockerhub with an prefix of 'pre_', should be run on the 'auto' branch, trigger by bors." echo "-b the branch we're building on, or the branch we're targeting if a prebuild" - echo "-n name, one of init, faucet, validator, validator-tcb, cluster-test, forge" + echo "-n name, one of init, faucet, validator, validator-tcb, forge" echo "-u 'upload', or 'push' the docker images will be pushed to dockerhub, otherwise only locally tag" echo "-o the org to target on dockerhub. Defaults to 'diem'" echo "should be called from the root folder of the diem project, and must have it's .git history" diff --git a/docker/cluster-test-util/Dockerfile b/docker/cluster-test-util/Dockerfile deleted file mode 100644 index bcc7130a83db3..0000000000000 --- a/docker/cluster-test-util/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -# Please only make changes to this Dockerfile which are backward compatible. 
Avoid breaking changes like removing yum packages -# Since we only use the "latest" version of this image, older jobs might still be using the "latest" image -FROM amazonlinux:2.0.20200722.0@sha256:1481659e18042055e174f9f5e61998bf7ab57f8c9432f3c9412a56d116cc0c68 - -RUN yum -y update && \ - yum install -y git perf procps aws-cli iproute iproute-tc iptables iputils && \ - yum clean all && \ - rm -rf /var/cache/yum && \ - git clone --depth 1 https://github.com/brendangregg/FlameGraph /usr/local/etc/FlameGraph diff --git a/docker/cluster-test-util/build-and-push.sh b/docker/cluster-test-util/build-and-push.sh deleted file mode 100755 index 1280284f655f0..0000000000000 --- a/docker/cluster-test-util/build-and-push.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -# Copyright (c) The Diem Core Contributors -# SPDX-License-Identifier: Apache-2.0 -set -e - -DIR="$( cd "$( dirname "$0" )" && pwd )" - -cd $DIR - -docker build . --tag 853397791086.dkr.ecr.us-west-2.amazonaws.com/cluster-test-util:latest -docker push 853397791086.dkr.ecr.us-west-2.amazonaws.com/cluster-test-util:latest diff --git a/docker/cluster-test/Dockerfile b/docker/cluster-test/Dockerfile deleted file mode 100644 index 5bad386817d55..0000000000000 --- a/docker/cluster-test/Dockerfile +++ /dev/null @@ -1,37 +0,0 @@ -FROM debian:buster-20211011@sha256:f9182ead292f45165f4a851e5ff98ea0800e172ccedce7d17764ffaae5ed4d6e AS debian-base - -FROM debian-base AS toolchain - -# To use http/https proxy while building, use: -# docker build --build-arg https_proxy=http://fwdproxy:8080 --build-arg http_proxy=http://fwdproxy:8080 - -RUN apt-get update && apt-get install -y cmake curl clang git pkg-config libssl-dev - -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain none -ENV PATH "$PATH:/root/.cargo/bin" - -WORKDIR /diem -COPY rust-toolchain /diem/rust-toolchain -RUN rustup install $(cat rust-toolchain) - -FROM toolchain AS builder - -ARG ENABLE_FAILPOINTS -COPY . 
/diem - -RUN IMAGE_TARGETS="test" ./docker/build-common.sh - -FROM debian-base - -RUN apt-get update && apt-get install -y libssl1.1 openssh-client wget && apt-get clean && rm -r /var/lib/apt/lists/* -RUN cd /usr/local/bin && wget "https://storage.googleapis.com/kubernetes-release/release/v1.17.0/bin/linux/amd64/kubectl" -O kubectl && chmod +x kubectl -RUN mkdir /etc/cluster-test -WORKDIR /etc/cluster-test -COPY --from=builder /diem/target/release/cluster-test /usr/local/bin/cluster-test -ENTRYPOINT ["cluster-test"] -ARG BUILD_DATE -ARG GIT_REV -ARG GIT_UPSTREAM -LABEL org.label-schema.schema-version="1.0" -LABEL org.label-schema.build-date=$BUILD_DATE -LABEL org.label-schema.vcs-ref=$GIT_REV diff --git a/docker/cluster-test/build.sh b/docker/cluster-test/build.sh deleted file mode 100755 index b355f52336539..0000000000000 --- a/docker/cluster-test/build.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -# Copyright (c) The Diem Core Contributors -# SPDX-License-Identifier: Apache-2.0 -set -e - -DIR="$( cd "$( dirname "$0" )" && pwd )" - -$DIR/../diem-build.sh $DIR/Dockerfile diem/cluster_test:latest "$@" diff --git a/docker/cluster-test/buildspec.yaml b/docker/cluster-test/buildspec.yaml deleted file mode 100644 index 9e680971be0ed..0000000000000 --- a/docker/cluster-test/buildspec.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# This buildspec is for AWS Codebuild -version: 0.2 - -env: - secrets-manager: - DOCKERHUB_USERNAME: dockerhub_ro_username - DOCKERHUB_PASSWORD: dockerhub_ro_password - -phases: - install: - runtime-versions: - docker: 18 - pre_build: - commands: - - echo logging in to dockerhub. - - echo "$DOCKERHUB_PASSWORD" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin - build: - commands: - - echo Build started on `date` - - echo Building the Docker image... - - bash docker/cluster-test/build.sh - post_build: - commands: - - echo Build completed on `date` - # Tag and push the docker images - - echo Logging in to Amazon ECR... - - $(aws ecr get-login --no-include-email --region us-west-2) - - SOURCE=diem/cluster_test:latest TARGET_REPO=$DIEM_CLUSTER_TEST_REPO TARGET_TAGS="${TAGS}:dev_$(git rev-parse --short=8 HEAD)" docker/tag-and-push.sh diff --git a/docker/docker_republish.sh b/docker/docker_republish.sh index 2454c7917150f..b6b17352f0a8a 100755 --- a/docker/docker_republish.sh +++ b/docker/docker_republish.sh @@ -25,7 +25,7 @@ OUTPUT_TAG=; TARGET_REPO="docker.io" TARGET_ORG="diem" DISABLE_TRUST="false" -IMAGES="init faucet tools validator validator_tcb cluster_test client forge" +IMAGES="init faucet tools validator validator_tcb client forge" #parse args while getopts "t:o:r:g:i:dh" arg; do diff --git a/docker/tag-and-push.sh b/docker/tag-and-push.sh index 48ed1ea8baf2c..72022f6c1737c 100755 --- a/docker/tag-and-push.sh +++ b/docker/tag-and-push.sh @@ -4,7 +4,7 @@ # tag-and-push.sh is used tag an image with multiple tags and push them to the target repo. 
Use ":" as the separator # between multiple tags # Example: -# SOURCE=diem_validator:latest TARGET_REPO=1234567890.dkr.ecr.us-west-2.amazonaws.com/diem_cluster_test TARGET_TAGS=master:master_39cnja0 tag-and-push.sh +# SOURCE=diem_validator:latest TARGET_REPO=1234567890.dkr.ecr.us-west-2.amazonaws.com/diem_forge TARGET_TAGS=master:master_39cnja0 tag-and-push.sh set -e diff --git a/language/benchmarks/README.md b/language/benchmarks/README.md index 3f57e8e8607c7..271756946dba5 100644 --- a/language/benchmarks/README.md +++ b/language/benchmarks/README.md @@ -14,12 +14,9 @@ We currently have four local benchmark candidates: - `executor_benchmark` in `diem/executor` - `txn_bench` in `diem/language/benchmark` - `Arith` and `call` benchmark in `diem/language/benchmark` -- `diem-swarm` with transaction traffic generated by `cluster-test`. The first item is a comprehensive benchmark of diem adapter, executor and storage that generates a block of p2p transactions and tries to execute and commit it to the DiemDB in local storage. The second item is a benchmark of Diem adapter only with a fake executor and an in-memory storage that executes randomly generated p2p transactions. The third item, although it’s still invoking Diem adapter, is mostly testing on the MoveVM’s ability of handling simple arithmetic operations and call stacks. -`diem-swarm` is the most comprehensive suite which spawns a single validator node on your local machine. To start this suite, you will need to execute `scripts/cli/start_cli_swarm.sh` and start transaction generation by following the `cluster-test` instruction emitted in the instruction. - ## Step 3: Select the running process in Instrument. Open instrument and create a time profiler project. diff --git a/scripts/cluster_test_pod_template.yaml b/scripts/cluster_test_pod_template.yaml deleted file mode 100644 index 9c9718b999ca3..0000000000000 --- a/scripts/cluster_test_pod_template.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: {pod_name} - labels: - app: cluster-test -spec: - nodeSelector: - nodeType: clustertest - restartPolicy: Never - serviceAccountName: clustertest - tolerations: - - key: "clustertest" - operator: "Exists" - effect: "NoSchedule" - containers: - - name: main - image: 853397791086.dkr.ecr.us-west-2.amazonaws.com/diem/cluster_test:{cluster_test_image_tag} - imagePullPolicy: Always - env: [{env_variables}] - command: [timeout, "{timeout_secs}", cluster-test, --deploy={image_tag} {extra_args}] - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app - operator: In - values: ["cluster-test"] - topologyKey: "kubernetes.io/hostname" diff --git a/scripts/ct-k8s-status.sh b/scripts/ct-k8s-status.sh deleted file mode 100755 index 7e5cd497e37b0..0000000000000 --- a/scripts/ct-k8s-status.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# Copyright (c) The Diem Core Contributors -# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -source "${DIR}/ct.vars" - -for ((i = 0; i < ${K8S_POOL_SIZE}; i++)); do - ws="ct-${i}" - echo "Workspace: $ws" - echo "Cluster test pods:" - context="arn:aws:eks:us-west-2:853397791086:cluster/${ws}-k8s-testnet" - kubectl get pods --context="${context}" -l app=cluster-test --sort-by=.status.startTime - echo -done diff --git a/scripts/cti b/scripts/cti deleted file mode 100755 index cda5d7f09b9d0..0000000000000 --- a/scripts/cti +++ /dev/null 
@@ -1,293 +0,0 @@ -#!/bin/bash -# Copyright (c) The Diem Core Contributors -# SPDX-License-Identifier: Apache-2.0 -set -e -set -o pipefail - -TAG="" -CLUSTER_TEST_TAG="" -VALIDATOR_TAG="" -PR="" -WORKSPACE="" -ENV="" -REPORT="" -LOCAL_BUILD="" -EXIT_CODE=0 -# Default timeout is 45 mins -TIMEOUT_SECS=2700 - -K8S_CONTEXT_PATTERN='arn:aws:eks:us-west-2:853397791086:cluster/CLUSTERNAME-k8s-testnet' - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -source "${DIR}/ct.vars" - -# Colorize Output -RESTORE=$(echo -en '\001\033[0m\002') -BLUE=$(echo -en '\001\033[01;34m\002') - -# https://stackoverflow.com/a/5533586 -# This function shuffles the elements of an array named "array" in-place using the Knuth-Fisher-Yates shuffle algorithm -shuffle() { - local i tmp size max rand - - # $RANDOM % (i+1) is biased because of the limited range of $RANDOM - # Compensate by using a range which is a multiple of the array size. - size=${#array[*]} - max=$(( 32768 / size * size )) - - for ((i=size-1; i>0; i--)); do - while (( (rand=$RANDOM) >= max )); do :; done - rand=$(( rand % (i+1) )) - tmp=${array[i]} array[i]=${array[rand]} array[rand]=$tmp - done -} - -join_args() { - retval_join_args="" - for var in "$@" - do - retval_join_args="${retval_join_args}, \"${var}\"" - done -} - -join_env_vars() { - retval_join_env_vars="" - for var in $* - do - IFS='=' read -ra env_var <<< "$var" - retval_join_env_vars="{\"name\":\"${env_var[0]}\", \"value\":\"${env_var[1]}\"}, ${retval_join_env_vars}" - done -} - -kube_init_context () { - aws eks --region us-west-2 describe-cluster --name ct-0-k8s-testnet &>/dev/null || (echo "Failed to access EKS, try awsmfa?"; exit 1) - local highest_pool_index=$(($K8S_POOL_SIZE - 1)) - local context=${K8S_CONTEXT_PATTERN/CLUSTERNAME/ct-${highest_pool_index}} - if kubectl config get-contexts ${context} &> /dev/null; then - return - fi - for ((i = 0; i < ${K8S_POOL_SIZE}; i++)); do - aws eks --region us-west-2 update-kubeconfig --name ct-${i}-k8s-testnet - done -} - -kube_select_cluster () { - retval_kube_select_cluster="" - array=() - for ((i = 0; i < ${K8S_POOL_SIZE}; i++)); do - array+=("${i}") - done - # shuffle all the elements of the array to randomly pick a cluster - shuffle - for attempt in {1..360} ; do - for i in "${array[@]}"; do - local context=${K8S_CONTEXT_PATTERN/CLUSTERNAME/ct-${i}} - local running_pods=$(kubectl --context="${context}" get pods -l app=cluster-test --field-selector=status.phase==Running 2> /dev/null | grep -v ^NAME | wc -l) - local pending_pods=$(kubectl --context="${context}" get pods -l app=cluster-test --field-selector=status.phase==Pending 2> /dev/null | grep -v ^NAME | wc -l) - local prometheus_healthy_containers=$(kubectl --context="${context}" get pod/libra-testnet-prometheus-server-0 | grep 'libra-testnet-prometheus-server-0' | awk '{print $2}') - if [[ "${pending_pods}" -gt 0 ]]; then - echo "ct-${i} has ${pending_pods} pending pods. Skipping." - elif [[ ${prometheus_healthy_containers} != "2/2" ]]; then - echo "prometheus is not healthy for ct-${i}. Skipping." - elif [[ ${running_pods} -gt 0 ]]; then - echo "ct-${i} has ${running_pods} running pods. Skipping." - else - retval_kube_select_cluster="ct-${i}" - return - fi - done - echo "All clusters have jobs running on them. Retrying in 10 secs." 
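The shuffle function near the top of this deleted script is a textbook Knuth-Fisher-Yates shuffle, and its comment explains the subtle part: `$RANDOM % (i+1)` alone would be biased because $RANDOM's range (0..32767) is not a multiple of the array size, so the script rejects draws outside the largest multiple. The same shuffle in Rust, as a sketch, where rand's gen_range already performs the unbiased sampling (rand 0.8 is the version cluster-test's own Cargo.toml pinned):

```rust
use rand::Rng;

/// In-place Knuth-Fisher-Yates shuffle, mirroring the bash `shuffle`
/// above. gen_range gives an unbiased draw over [0, i] directly, which
/// is what the script's rejection loop on $RANDOM approximates.
fn shuffle<T>(array: &mut [T]) {
    let mut rng = rand::thread_rng();
    for i in (1..array.len()).rev() {
        let j = rng.gen_range(0..=i); // uniform over [0, i]
        array.swap(i, j);
    }
}

fn main() {
    // e.g. the ct-0..ct-7 cluster-pool indices that kube_select_cluster
    // shuffles before probing each cluster for free capacity.
    let mut clusters: Vec<u32> = (0..8).collect();
    shuffle(&mut clusters);
    println!("{:?}", clusters);
}
```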
- sleep 10 - done - echo "Failed to schedule job on a cluster as all are busy" - exit 1 -} - -kube_wait_pod () { - local pod_name="${1}" - local context="${2}" - for i in {1..360} ; do - local phase=$(kubectl --context="${context}" get pod "${pod_name}" -o jsonpath="{.status.phase}" || echo -n "kubectlfailed") - if [[ "${phase}" == "kubectlfailed" ]]; then - echo "kubectl get pod ${pod_name} failed. Retrying." - sleep 10 - continue - fi - if [[ $phase != "Pending" && $phase != "Unknown" ]]; then - echo "${pod_name} reached phase : ${phase}" - return - fi - if kubectl --context="${context}" get pod "${pod_name}" | grep -i -e ImagePullBackOff -e InvalidImageName -e ErrImagePull &>/dev/null; then - image_name=$(kubectl --context="${context}" get pod "${pod_name}" -o jsonpath="{.spec.containers[0].image}") - echo "${pod_name} name failed to be scheduled because there was an error pulling the image : ${image_name}" - # Delete the pod so that it doesn't block other pods from being scheduled on this - kubectl --context="${context}" delete pod "${pod_name}" - exit 1 - fi - echo "Waiting for ${pod_name} to be scheduled. Current phase : ${phase}" - sleep 10 - done - echo "Pod ${pod_name} failed to be scheduled" - exit 1 -} - -while (( "$#" )); do - case "$1" in - --perf-run) - echo "--perf-run is deprecated. Use --suite perf instead" - exit 1 - ;; - -R|--report) - REPORT=$2 - shift 2 - ;; - -p|--pr) - PR=$2 - shift 2 - ;; - -L|--local-build) - LOCAL_BUILD="yes" - shift 1 - ;; - --timeout-secs) - TIMEOUT_SECS=$2 - shift 2 - ;; - -S|--stable) - TAG=stable - shift 1 - ;; - -T|--tag) - TAG=$2 - shift 2 - ;; - --cluster-test-tag) - CLUSTER_TEST_TAG=$2 - shift 2 - ;; - --validator-tag) - VALIDATOR_TAG=$2 - shift 2 - ;; - -W|--workspace) - WORKSPACE=$2 - shift 2 - ;; - -E|--env) - ENV="$ENV $2" - shift 2 - ;; - -c|--container|-i|--image|--deploy) - echo "$1 command is not allowed in cti" - exit 1 - ;; - *) # end argument parsing - break - ;; - esac -done - -if [ -z "$PR" ] && [ -z "$TAG" ] && [ -z "$LOCAL_BUILD" ] -then - echo "No PR or tag or --local-build specified" - echo "Usage:" - echo "cti [--pr |--stable|--tag |--local-build] [--workspace ] [-E ] [--timeout-secs ] " - echo - echo "--local-build - Build image locally" - echo "--pr : Build image from pull request #" - echo "-S|--stable: Use latest image available in CI" - echo "-T|--tag : Use image with tag TAG" - echo "--timeout-secs: Timeout in seconds" - echo "--cluster-test-tag : Use this tag for cluster test" - echo "-W|--workspace : Use custom workplace " - echo "-E|--env : Set environment variable, ex. -E RUST_LOG=debug. Can be repeated, e.g. -E A=B -E C=D" - echo "-R|--report file.json: Generate json report into file.json" - echo - echo "To see cluster test runner arguments run cti --stable --help" - echo - echo "Examples:" - echo "cti --pr --run bench # Run benchmark" - echo "cti --stable --emit-tx # Submit transactions until Ctrl+C pressed" - exit 1 -fi - -if [ -z "$TAG" ]; then - if [ -z "$LOCAL_BUILD" ]; then - aws codebuild list-projects >/dev/null || (echo "Failed to access codebuild, try awsmfa?"; exit 1) - ./docker/build-aws.sh --build-all-cti --version pull/$PR - TAG=dev_${USER}_pull_${PR} - echo "**TIP Use cti -T $TAG <...> to restart this run with same tag without rebuilding it" - else - TAG="dev_$(whoami)_$(git rev-parse --short HEAD)" - TAG=$TAG ./docker/build-push-local.sh - fi -fi - -CLUSTER_TEST_TAG=${CLUSTER_TEST_TAG:-${TAG}} -VALIDATOR_TAG=${VALIDATOR_TAG:-${TAG}} - -OUTPUT_TEE=${CTI_OUTPUT_LOG:-$(mktemp)} - -if ! 
which kubectl &>/dev/null; then - echo "kubectl is not installed. Please install kubectl. On mac, you can use : brew install kubectl" - exit 1 -fi -echo "Running cluster-test on Kubernetes" -kube_init_context -pod_name="cluster-test-$(whoami)-$(date +%s)" -pod_name=${pod_name/_/-} #underscore not allowed in pod name -specfile=$(mktemp) -echo "Pod Spec : ${specfile}" -join_args "$@" -if [[ -z "${WORKSPACE}" ]]; then - kube_select_cluster - WORKSPACE=${retval_kube_select_cluster} -fi -RUN_ID="${WORKSPACE}-${pod_name}" -ENV="$ENV AWS_ROLE_SESSION_NAME=AWS_ROLE_SESSION_NAME RUN_ID=$RUN_ID" -join_env_vars $ENV -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -sed -e "s/{pod_name}/${pod_name}/g" \ - -e "s/{image_tag}/${VALIDATOR_TAG}/g" \ - -e "s/{timeout_secs}/${TIMEOUT_SECS}/g" \ - -e "s/{cluster_test_image_tag}/${CLUSTER_TEST_TAG}/g" \ - -e "s^{env_variables}^${retval_join_env_vars}^g" \ - -e "s+{extra_args}+${retval_join_args}+g" \ - ${DIR}/cluster_test_pod_template.yaml > ${specfile} - -echo "Using cluster : ${WORKSPACE}" -context=${K8S_CONTEXT_PATTERN/CLUSTERNAME/${WORKSPACE}} -kubectl --context=${context} apply -f ${specfile} || (echo "Failed to create cluster-test pod"; exit 1) -kube_wait_pod ${pod_name} ${context} -START_UTC=$(TZ=UTC date +"%Y-%m-%dT%H:%M:%SZ") -START_TS_MS=$(date +%s)000 -echo "**********" -echo "${BLUE}Auto refresh Dashboard:${RESTORE} http://grafana.${WORKSPACE}-k8s-testnet.aws.hlw3truzy4ls.com/d/performance/performance?from=${START_TS_MS}&to=now&refresh=5s" -echo "${BLUE}Tail logs:${RESTORE} http://kibana.${WORKSPACE}-k8s-testnet.aws.hlw3truzy4ls.com/app/kibana#/discover?_g=(refreshInterval:(pause:!f,value:10000),time:(from:'$START_UTC',to:now))" -echo "**********" -kubectl --context=${context} logs -f "${pod_name}" | tee $OUTPUT_TEE -pod_status=$(kubectl --context=${context} get pods "${pod_name}" -o jsonpath="{.status.phase}") -END_UTC=$(TZ=UTC date +"%Y-%m-%dT%H:%M:%SZ") -END_TS_MS=$(date +%s)000 -echo "**********" -LOGS_LINK="http://kibana.${WORKSPACE}-k8s-testnet.aws.hlw3truzy4ls.com/app/kibana#/discover?_g=(time:(from:'${START_UTC}',to:'${END_UTC}'))" -VAL_LOG_LINK="${LOGS_LINK}&_a=(columns:!(log),query:(language:kuery,query:'kubernetes.pod_name:\\\"val-1\\\"'),sort:!(!('@timestamp',desc)))" -DASHBOARD_LINK="http://grafana.${WORKSPACE}-k8s-testnet.aws.hlw3truzy4ls.com/d/performance/performance?from=${START_TS_MS}&to=${END_TS_MS}" -echo "${BLUE}Logs snapshot:${RESTORE} ${LOGS_LINK}" -echo "${BLUE}Dashboard snapshot:${RESTORE} ${DASHBOARD_LINK}" -echo "**********" -if [[ "${pod_status}" != "Succeeded" ]]; then - echo "${pod_name} status: ${pod_status}" - EXIT_CODE=1 -fi - -if [ ! -z "$REPORT" ]; then - cat $OUTPUT_TEE | awk '/====json-report-begin===/{f=1;next} /====json-report-end===/{f=0} f' > "${REPORT}" - [ ! 
-s "${REPORT}" ] && echo '{"text": "Cluster test runner terminated"}' > "${REPORT}" # If no report was generated, fill with default report - LINKS="Logs: ${LOGS_LINK}\nDashboard: ${DASHBOARD_LINK}\nValidator 1 logs: ${VAL_LOG_LINK}" - jq ".links=\"${LINKS}\"" "${REPORT}" > "${REPORT}.bak" - mv "${REPORT}.bak" "${REPORT}" -fi - -exit ${EXIT_CODE} diff --git a/scripts/dockerhub_prune.sh b/scripts/dockerhub_prune.sh index e7698e33d866c..5d1a5ea5a34ab 100755 --- a/scripts/dockerhub_prune.sh +++ b/scripts/dockerhub_prune.sh @@ -153,7 +153,6 @@ function prune_repo { } prune_repo "diem/client" -prune_repo "diem/cluster_test" prune_repo "diem/init" prune_repo "diem/faucet" prune_repo "diem/tools" diff --git a/state-sync/state-sync-v1/src/counters.rs b/state-sync/state-sync-v1/src/counters.rs index 33b5cdd2f9890..a2283f2a4e169 100644 --- a/state-sync/state-sync-v1/src/counters.rs +++ b/state-sync/state-sync-v1/src/counters.rs @@ -194,9 +194,6 @@ pub static TIMESTAMP: Lazy = Lazy::new(|| { .unwrap() }); -/// Notice: this metric is used in CT full node health check -/// ~/diem/testsuite/cluster-test/health/fullnode_check.rs -/// please make corresponding changes if this field is updated pub static VERSION: Lazy = Lazy::new(|| { register_int_gauge_vec!( "diem_state_sync_version", diff --git a/testsuite/cluster-test/Cargo.toml b/testsuite/cluster-test/Cargo.toml deleted file mode 100644 index 417a76aecba75..0000000000000 --- a/testsuite/cluster-test/Cargo.toml +++ /dev/null @@ -1,76 +0,0 @@ -[package] -name = "cluster-test" -version = "0.1.0" -authors = ["Diem Association "] -description = "Diem cluster test" -repository = "https://github.com/diem/diem" -homepage = "https://diem.com" -license = "Apache-2.0" -publish = false -edition = "2018" - -[dependencies] -anyhow = "1.0.38" -flate2 = { version = "1.0.20", features = ["rust_backend"], default-features = false } -hex = "0.4.3" -itertools = "0.10.0" -once_cell = "1.7.2" -rand = "0.8.3" -regex = { version = "1.4.3", default-features = false, features = ["std", "perf"] } -reqwest = { version = "0.11.2", features = ["blocking", "json"] } -serde_json = "1.0.64" -serde_yaml = "0.8.17" -termion = "1.5.6" -serde = { version = "1.0.124", features = ["derive"] } -structopt = "0.3.21" -rusoto_core = "0.46.0" -rusoto_autoscaling = "0.46.0" -rusoto_sts = "0.46.0" -rusoto_s3 = "0.46.0" -chrono = "0.4.19" -toml = { version = "0.5.8", default-features = false } - -debug-interface = { path = "../../crates/debug-interface"} -diem-client = { path = "../../crates/diem-client"} -diem-retrier = { path = "../../crates/diem-retrier" } -num_cpus = "1.13.0" - -consensus-types = { path = "../../consensus/consensus-types" } -generate-key = { path = "../../config/generate-key" } -bcs = "0.1.2" -diem-crypto = { path = "../../crates/diem-crypto" } -diem-config = { path = "../../config" } -diem-framework-releases = { path = "../../diem-move/diem-framework/DPN/releases" } -diem-genesis-tool = { path = "../../config/management/genesis", features = ["testing"] } -diem-global-constants = { path = "../../config/global-constants" } -diem-logger = { path = "../../crates/diem-logger" } -diem-management = { path = "../../config/management", features = ["testing"] } -diem-mempool = { path = "../../mempool" } -diem-infallible = { path = "../../crates/diem-infallible" } -diem-network-address-encryption = { path = "../../config/management/network-address-encryption" } -diem-node = { path = "../../diem-node" } -diem-operational-tool = {path = "../../config/management/operational", features = 
["testing"] } -diem-rest-client = { path = "../../crates/diem-rest-client"} -diem-secure-storage = { path = "../../secure/storage", features = ["testing"] } -diem-swarm = { path = "../diem-swarm" } -diem-temppath = { path = "../../crates/diem-temppath" } -diem-time-service = { path = "../../crates/diem-time-service" } -diem-types = { path = "../../types", features = ["fuzzing"] } -diem-workspace-hack = { version = "0.1", path = "../../crates/diem-workspace-hack" } -forge = { path = "../forge" } -language-e2e-tests = { path = "../../diem-move/e2e-tests" } -move-core-types = { path = "../../language/move-core/types" } -network = { path = "../../network" } -network-builder = { path = "../../network/builder" } -seed-peer-generator = { path = "../../config/seed-peer-generator" } -state-sync-v1 = { path = "../../state-sync/state-sync-v1" } -diem-sdk = { path = "../../sdk" } -diem-transaction-builder = { path = "../../sdk/transaction-builder" } - -futures = "0.3.12" -tokio = { version = "1.8.1", features = ["full"] } -async-trait = "0.1.42" - -kube = "0.51.0" - -k8s-openapi = { version = "0.11.0", default-features = false, features = ["v1_15"] } diff --git a/testsuite/cluster-test/README.md b/testsuite/cluster-test/README.md deleted file mode 100644 index 7d9f1f421b609..0000000000000 --- a/testsuite/cluster-test/README.md +++ /dev/null @@ -1,31 +0,0 @@ -**Cluster test** is framework that introduces different failures to system and verifies that some degree of liveliness and safety is preserved during those experiments. - -Cluster test works with real AWS cluster and uses root ssh access and AWS api to introduce failures. - - -###### Structure - -Major components of cluster test include: -* **Effect** is a single type of failure, normally affecting single specific node. For example, `Reboot{SomeNode}` -* **Experiment** is some condition we want to test our system with. Usually experiment is set of Effects, for example `RebootRandomValidators` experiment would generate some number of `Reboot` effects for subset of nodes in cluster. -* **HealthCheck** is how we verify whether experiment was successful or not. Examples are `LivenessHealthCheck` that verifies that validators produce commits and `CommitHistoryHealthCheck` that verifies safety, in terms that validators do not produce contradicting commits. - -###### Test lifecycle - -Normally experiment lifecycle consist of multiple stages. -This lifecycle is managed by test runner: -* Before experiment runner verifies that cluster is healthy -* Experiment is started in separate thread. Experiment has timeout, if it does not finish within this timeout it is considered to be failure -* When `Experiment` is running it also reports set of validators that affected by it through `Experiment::affected_validators()`. We still verify that liveness and safety is not violated for any other validators. For example, when rebooting 3 validators we make sure that all other validators still make progress. -* After experiment completes, we verify that all nodes in cluster becomes healthy again within some timeout - -###### Run and build - -Normally we run cluster_test on linux machine in AWS. In order to build linux binary on mac laptop we have cross compilation script: - -`docker/cluster_test/build.sh` - -This script requires docker for mac and starts docker container with build environment. - -Build in this container is incremental, first build takes a lot of time but second build is much faster. -As a result, build script produces binary by default. 
Running it with `--build-docker-image` will also produce docker image. diff --git a/testsuite/cluster-test/src/aws.rs b/testsuite/cluster-test/src/aws.rs deleted file mode 100644 index 8a5afd7dd40e1..0000000000000 --- a/testsuite/cluster-test/src/aws.rs +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -#![forbid(unsafe_code)] - -use anyhow::{anyhow, bail, format_err, Result}; -use diem_logger::{info, warn}; -use rusoto_autoscaling::{ - AutoScalingGroupNamesType, Autoscaling, AutoscalingClient, SetDesiredCapacityType, -}; -use rusoto_core::Region; -use rusoto_sts::WebIdentityProvider; - -/// set_asg_size sets the size of the given autoscaling group -#[allow(clippy::collapsible_if)] -pub async fn set_asg_size( - desired_capacity: i64, - buffer_percent: f64, - asg_name: &str, - wait_for_completion: bool, - scaling_down: bool, -) -> Result<()> { - let buffer = if scaling_down { - 0 - } else { - ((desired_capacity as f64 * buffer_percent) / 100_f64).ceil() as i64 - }; - info!( - "Scaling to desired_capacity : {}, buffer: {}, asg_name: {}", - desired_capacity, buffer, asg_name - ); - let set_desired_capacity_type = SetDesiredCapacityType { - auto_scaling_group_name: asg_name.to_string(), - desired_capacity: desired_capacity + buffer, - honor_cooldown: Some(false), - }; - let credentials_provider = WebIdentityProvider::from_k8s_env(); - - let dispatcher = rusoto_core::HttpClient::new() - .map_err(|e| anyhow!("Failed to create request dispatcher, met Error:{}", e))?; - let asc = AutoscalingClient::new_with(dispatcher, credentials_provider, Region::UsWest2); - diem_retrier::retry_async(diem_retrier::fixed_retry_strategy(10_000, 60), || { - let asc = asc.clone(); - let set_desired_capacity_type = set_desired_capacity_type.clone(); - Box::pin(async move { - asc.set_desired_capacity(set_desired_capacity_type) - .await - .map_err(|e| { - warn!("set_desired_capacity failed: {}, retrying", e); - format_err!("set_desired_capacity failed: {}", e) - }) - }) - }) - .await?; - if !wait_for_completion { - return Ok(()); - } - diem_retrier::retry_async(diem_retrier::fixed_retry_strategy(10_000, 60), || { - let asc_clone = asc.clone(); - Box::pin(async move { - let mut total = 0; - let mut current_token = None; - loop { - let current_token_clone = current_token.clone(); - let auto_scaling_group_names_type = AutoScalingGroupNamesType { - auto_scaling_group_names: Some(vec![asg_name.to_string()]), - // https://docs.aws.amazon.com/autoscaling/ec2/APIReference/API_DescribeAutoScalingGroups.html - // max value is 100 - max_records: Some(100), - next_token: current_token_clone, - }; - let asgs = asc_clone - .describe_auto_scaling_groups(auto_scaling_group_names_type) - .await?; - if asgs.auto_scaling_groups.is_empty() { - bail!("asgs.auto_scaling_groups.is_empty()"); - } - let asg = &asgs.auto_scaling_groups[0]; - if scaling_down { - total += asg - .instances - .clone() - .ok_or_else(|| format_err!("instances not found for auto_scaling_group"))? - .len() as i64; - } else { - total += asg - .instances - .clone() - .ok_or_else(|| format_err!("instances not found for auto_scaling_group"))? - .iter() - .filter(|instance| instance.lifecycle_state == "InService") - .count() as i64; - } - if asgs.next_token.is_none() { - break; - } - current_token = asgs.next_token; - } - info!( - "Waiting for scaling to complete. 
Current size: {}, Min Desired Size: {}", - total, desired_capacity - ); - if scaling_down { - if total > desired_capacity { - bail!( - "Waiting for scale-down to complete. Current size: {}, Min Desired Size: {}", - total, - desired_capacity - ); - } else { - info!("Scale down completed"); - Ok(()) - } - } else if total < desired_capacity { - bail!( - "Waiting for scale-up to complete. Current size: {}, Min Desired Size: {}", - total, - desired_capacity - ); - } else { - info!("Scale up completed"); - Ok(()) - } - }) - }) - .await -} diff --git a/testsuite/cluster-test/src/cluster.rs b/testsuite/cluster-test/src/cluster.rs deleted file mode 100644 index 1d9060440e49e..0000000000000 --- a/testsuite/cluster-test/src/cluster.rs +++ /dev/null @@ -1,346 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -#![forbid(unsafe_code)] - -use crate::instance::{Instance, ValidatorGroup}; -use anyhow::{format_err, Result}; -use diem_client::AccountAddress; -use diem_crypto::{ - ed25519::{Ed25519PrivateKey, Ed25519PublicKey}, - test_utils::KeyPair, - Uniform, -}; -use diem_rest_client::Client as RestClient; -use diem_sdk::types::{AccountKey, LocalAccount}; -use diem_types::{ - account_config::{ - diem_root_address, testnet_dd_account_address, treasury_compliance_account_address, - }, - chain_id::ChainId, - waypoint::Waypoint, -}; -use forge::query_sequence_numbers; -use rand::prelude::*; -use reqwest::Client; - -const DD_KEY: &str = "dd.key"; - -#[derive(Clone)] -pub struct Cluster { - // guaranteed non-empty - validator_instances: Vec, - fullnode_instances: Vec, - lsr_instances: Vec, - vault_instances: Vec, - mint_key_pair: KeyPair, - waypoint: Option, - pub chain_id: ChainId, -} - -pub fn dummy_key_pair() -> KeyPair { - Ed25519PrivateKey::generate_for_testing().into() -} - -impl Cluster { - pub fn from_host_port( - peers: Vec<(String, u32, Option)>, - mint_file: &str, - chain_id: ChainId, - vasp: bool, - ) -> Self { - let http_client = Client::new(); - let instances: Vec = peers - .into_iter() - .map(|host_port| { - Instance::new( - format!("{}:{}", &host_port.0, host_port.1), /* short_hash */ - host_port.0, - host_port.1, - host_port.2, - http_client.clone(), - ) - }) - .collect(); - - let mint_key_pair = if vasp { - dummy_key_pair() - } else { - KeyPair::from(generate_key::load_key(mint_file)) - }; - Self { - validator_instances: instances, - fullnode_instances: vec![], - lsr_instances: vec![], - vault_instances: vec![], - mint_key_pair, - waypoint: None, - chain_id, - } - } - - fn get_mint_key_pair_from_file( - mint_file: &str, - ) -> KeyPair { - let mint_key: Ed25519PrivateKey = generate_key::load_key(mint_file); - KeyPair::from(mint_key) - } - - pub fn new( - validator_instances: Vec, - fullnode_instances: Vec, - lsr_instances: Vec, - vault_instances: Vec, - waypoint: Option, - ) -> Self { - Self { - validator_instances, - fullnode_instances, - lsr_instances, - vault_instances, - mint_key_pair: Self::get_mint_key_pair_from_file("/tmp/mint.key"), - waypoint, - chain_id: ChainId::test(), - } - } - - pub fn random_validator_instance(&self) -> Instance { - let mut rnd = rand::thread_rng(); - self.validator_instances - .choose(&mut rnd) - .expect("random_validator_instance requires non-empty validator_instances") - .clone() - } - - pub fn validator_instances(&self) -> &[Instance] { - &self.validator_instances - } - - pub fn random_fullnode_instance(&self) -> Instance { - let mut rnd = rand::thread_rng(); - self.fullnode_instances - .choose(&mut rnd) - 
.expect("random_full_node_instance requires non-empty fullnode_instances") - .clone() - } - - pub fn fullnode_instances(&self) -> &[Instance] { - &self.fullnode_instances - } - - pub fn lsr_instances(&self) -> &[Instance] { - &self.lsr_instances - } - - pub fn vault_instances(&self) -> &[Instance] { - &self.vault_instances - } - - pub fn all_instances(&self) -> impl Iterator { - self.validator_instances - .iter() - .chain(self.fullnode_instances.iter()) - .chain(self.lsr_instances.iter()) - .chain(self.vault_instances.iter()) - } - - pub fn validator_and_fullnode_instances(&self) -> impl Iterator { - self.validator_instances - .iter() - .chain(self.fullnode_instances.iter()) - } - - pub fn into_validator_instances(self) -> Vec { - self.validator_instances - } - - pub fn into_fullnode_instances(self) -> Vec { - self.fullnode_instances - } - - pub fn into_lsr_instances(self) -> Vec { - self.lsr_instances - } - - pub fn into_vault_instances(self) -> Vec { - self.vault_instances - } - - pub fn mint_key_pair(&self) -> &KeyPair { - &self.mint_key_pair - } - - fn account_key(&self) -> AccountKey { - AccountKey::from_private_key(self.mint_key_pair.private_key.clone()) - } - - async fn load_account_with_mint_key( - &self, - client: &RestClient, - address: AccountAddress, - ) -> Result { - let sequence_number = query_sequence_numbers(client, &[address]) - .await - .map_err(|e| { - format_err!( - "query_sequence_numbers on {:?} for account {} failed: {}", - client, - address, - e - ) - })?[0]; - Ok(LocalAccount::new( - address, - self.account_key(), - sequence_number, - )) - } - - pub async fn load_diem_root_account(&self, client: &RestClient) -> Result { - self.load_account_with_mint_key(client, diem_root_address()) - .await - } - - pub async fn load_faucet_account(&self, client: &RestClient) -> Result { - self.load_account_with_mint_key(client, testnet_dd_account_address()) - .await - } - - pub async fn load_tc_account(&self, client: &RestClient) -> Result { - self.load_account_with_mint_key(client, treasury_compliance_account_address()) - .await - } - - pub async fn load_dd_account(&self, client: &RestClient) -> Result { - let mint_key: Ed25519PrivateKey = generate_key::load_key(DD_KEY); - let account_key = AccountKey::from_private_key(mint_key); - let address = account_key.authentication_key().derived_address(); - let sequence_number = query_sequence_numbers(client, &[address]) - .await - .map_err(|e| { - format_err!( - "query_sequence_numbers on {:?} for dd account failed: {}", - client, - e - ) - })?[0]; - Ok(LocalAccount::new(address, account_key, sequence_number)) - } - - pub fn get_validator_instance(&self, name: &str) -> Option<&Instance> { - self.validator_instances - .iter() - .find(|instance| instance.peer_name() == name) - } - - /// Splits this cluster into two - /// - /// Returns tuple of two clusters: - /// First element in tuple contains cluster with c random instances from self - /// Second element in tuple contains cluster with remaining instances from self - pub fn split_n_validators_random(&self, c: usize) -> (Self, Self) { - assert!(c <= self.validator_instances.len()); - let mut rng = ThreadRng::default(); - let mut sub = vec![]; - let mut rem = self.validator_instances.clone(); - for _ in 0..c { - let idx_remove = rng.gen_range(0..rem.len()); - let instance = rem.remove(idx_remove); - sub.push(instance); - } - ( - self.new_validator_sub_cluster(sub), - self.new_validator_sub_cluster(rem), - ) - } - - pub fn split_n_fullnodes_random(&self, c: usize) -> (Self, Self) { - 
assert!(c <= self.fullnode_instances.len()); - let mut rng = ThreadRng::default(); - let mut sub = vec![]; - let mut rem = self.fullnode_instances.clone(); - for _ in 0..c { - let idx_remove = rng.gen_range(0..rem.len()); - let instance = rem.remove(idx_remove); - sub.push(instance); - } - ( - self.new_fullnode_sub_cluster(sub), - self.new_fullnode_sub_cluster(rem), - ) - } - - fn new_validator_sub_cluster(&self, instances: Vec) -> Self { - Cluster { - validator_instances: instances, - fullnode_instances: vec![], - lsr_instances: vec![], - vault_instances: vec![], - mint_key_pair: self.mint_key_pair.clone(), - waypoint: self.waypoint, - chain_id: ChainId::test(), - } - } - - fn new_fullnode_sub_cluster(&self, instances: Vec) -> Self { - Cluster { - validator_instances: vec![], - fullnode_instances: instances, - lsr_instances: vec![], - vault_instances: vec![], - mint_key_pair: self.mint_key_pair.clone(), - waypoint: self.waypoint, - chain_id: ChainId::test(), - } - } - - pub fn validator_sub_cluster(&self, ids: Vec) -> Cluster { - let mut instances = Vec::with_capacity(ids.len()); - for id in ids { - let instance = self.get_validator_instance(&id); - match instance { - Some(instance) => instances.push(instance.clone()), - None => panic!("Can not make sub_cluster: instance {} is not found", id), - } - } - assert!(!instances.is_empty(), "No instances for subcluster"); - self.new_validator_sub_cluster(instances) - } - - pub fn find_instance_by_pod(&self, pod: &str) -> Option<&Instance> { - self.validator_and_fullnode_instances() - .find(|i| i.peer_name() == pod) - } - - pub fn instances_for_group( - &self, - validator_group: ValidatorGroup, - ) -> impl Iterator { - self.all_instances() - .filter(move |v| v.validator_group() == validator_group) - } - - pub fn lsr_instances_for_validators(&self, validators: &[Instance]) -> Vec { - validators - .iter() - .filter_map(|l| { - self.lsr_instances - .iter() - .find(|x| l.validator_group() == x.validator_group()) - .cloned() - }) - .collect() - } - - pub fn vault_instances_for_validators(&self, validators: &[Instance]) -> Vec { - validators - .iter() - .filter_map(|v| { - self.vault_instances - .iter() - .find(|x| v.validator_group() == x.validator_group()) - .cloned() - }) - .collect() - } -} diff --git a/testsuite/cluster-test/src/cluster_builder.rs b/testsuite/cluster-test/src/cluster_builder.rs deleted file mode 100644 index 383143a09ac44..0000000000000 --- a/testsuite/cluster-test/src/cluster_builder.rs +++ /dev/null @@ -1,617 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -use crate::{ - aws, - cluster::Cluster, - cluster_swarm::{ - cluster_swarm_kube::{ClusterSwarmKube, KubeNode}, - ClusterSwarm, - }, - genesis_helper::GenesisHelper, - instance::{ - fullnode_pod_name, lsr_pod_name, validator_pod_name, vault_pod_name, - ApplicationConfig::{Fullnode, Validator, Vault, LSR}, - FullnodeConfig, Instance, InstanceConfig, LSRConfig, ValidatorConfig, ValidatorGroup, - VaultConfig, - }, -}; -use anyhow::{format_err, Result}; -use diem_logger::info; -use futures::future::try_join_all; -use std::{fs::File, io::Write, path::Path}; -use structopt::StructOpt; - -use consensus_types::safety_data::SafetyData; -use diem_genesis_tool::layout::Layout; -use diem_global_constants::{ - CONSENSUS_KEY, DIEM_ROOT_KEY, EXECUTION_KEY, FULLNODE_NETWORK_KEY, GENESIS_WAYPOINT, - OPERATOR_KEY, OWNER_KEY, SAFETY_DATA, TREASURY_COMPLIANCE_KEY, VALIDATOR_NETWORK_ADDRESS_KEYS, - VALIDATOR_NETWORK_KEY, WAYPOINT, -}; -use 
diem_secure_storage::{CryptoStorage, KVStorage, Namespaced, Storage, VaultStorage}; -use diem_types::{chain_id::ChainId, network_address::NetworkAddress, waypoint::Waypoint}; -use std::str::FromStr; - -const VAULT_TOKEN: &str = "root"; -const VAULT_PORT: u32 = 8200; -const DIEM_ROOT_NS: &str = "val-0"; -const VAULT_BACKEND: &str = "vault"; -const GENESIS_PATH: &str = "/tmp/genesis.blob"; - -#[derive(Clone, StructOpt, Debug)] -pub struct ClusterBuilderParams { - #[structopt(long, default_value = "1")] - pub fullnodes_per_validator: u32, - #[structopt(long, parse(try_from_str), default_value = "30")] - pub num_validators: u32, - #[structopt(long)] - pub enable_lsr: Option, - #[structopt( - long, - help = "Backend used by lsr. Possible Values are in-memory, on-disk, vault", - default_value = "vault" - )] - pub lsr_backend: String, - #[structopt( - long, - help = "Directory containing Move module bytecodes to be published in genesis" - )] - pub move_modules_dir: Option, -} - -impl ClusterBuilderParams { - pub fn enable_lsr(&self) -> bool { - self.enable_lsr.unwrap_or(true) - } -} - -pub struct ClusterBuilder { - pub current_tag: String, - pub cluster_swarm: ClusterSwarmKube, -} - -impl ClusterBuilder { - pub fn new(current_tag: String, cluster_swarm: ClusterSwarmKube) -> Self { - Self { - current_tag, - cluster_swarm, - } - } - - pub async fn setup_cluster( - &self, - params: &ClusterBuilderParams, - clean_data: bool, - ) -> Result { - self.cluster_swarm - .cleanup() - .await - .map_err(|e| format_err!("cleanup on startup failed: {}", e))?; - let current_tag = &self.current_tag; - info!( - "Deploying with {} tag for validators and fullnodes", - current_tag - ); - let asg_name = format!( - "{}-k8s-testnet-validators", - self.cluster_swarm - .get_workspace() - .await - .expect("Failed to get workspace") - ); - let mut instance_count = - params.num_validators + (params.fullnodes_per_validator * params.num_validators); - if params.enable_lsr() { - if params.lsr_backend == "vault" { - instance_count += params.num_validators * 2; - } else { - instance_count += params.num_validators; - } - } - if clean_data { - // First scale down to zero instances and wait for it to complete so that we don't schedule pods on - // instances which are going into termination state - aws::set_asg_size(0, 0.0, &asg_name, true, true) - .await - .map_err(|err| format_err!("{} scale down failed: {}", asg_name, err))?; - // Then scale up and bring up new instances - aws::set_asg_size(instance_count as i64, 5.0, &asg_name, true, false) - .await - .map_err(|err| format_err!("{} scale up failed: {}", asg_name, err))?; - } - let modules_dir = if let Some(modules_dir) = ¶ms.move_modules_dir { - modules_dir.clone() - } else { - // No modules specified on command line. 
Create a tmpdir and populate it with the Diem genesis modules - let mut tempdir = diem_temppath::TempPath::new(); - tempdir.create_as_dir()?; - tempdir.persist(); - for b in diem_framework_releases::current_module_blobs() { - let mut temppath = - diem_temppath::TempPath::new_with_temp_dir(tempdir.path().to_path_buf()); - temppath.create_as_file()?; - temppath.persist(); // otherwise, file will disappear when temppath goes out of scope - let mut file = File::create(temppath.path())?; - file.write_all(b)?; - file.sync_all()?; - } - tempdir.path().to_str().unwrap().to_string() - }; - - let (validators, lsrs, vaults, fullnodes, waypoint) = self - .spawn_validator_and_fullnode_set( - params.num_validators, - params.fullnodes_per_validator, - params.enable_lsr(), - ¶ms.lsr_backend, - current_tag, - &modules_dir, - clean_data, - ) - .await - .map_err(|e| format_err!("Failed to spawn_validator_and_fullnode_set: {}", e))?; - let cluster = Cluster::new(validators, fullnodes, lsrs, vaults, waypoint); - - info!( - "Deployed {} validators and {} fns", - cluster.validator_instances().len(), - cluster.fullnode_instances().len(), - ); - Ok(cluster) - } - - /// Creates a set of validators and fullnodes with the given parameters - pub async fn spawn_validator_and_fullnode_set( - &self, - num_validators: u32, - num_fullnodes_per_validator: u32, - enable_lsr: bool, - lsr_backend: &str, - image_tag: &str, - move_modules_dir: &str, - clean_data: bool, - ) -> Result<( - Vec, - Vec, - Vec, - Vec, - Option, - )> { - let vault_nodes; - let mut lsr_nodes = vec![]; - let mut vaults = vec![]; - let mut lsrs = vec![]; - let mut waypoint = None; - - if enable_lsr { - if lsr_backend == "vault" { - vault_nodes = try_join_all((0..num_validators).map(|i| async move { - let pod_name = vault_pod_name(i); - self.cluster_swarm.allocate_node(&pod_name).await - })) - .await?; - let mut vault_instances: Vec<_> = vault_nodes - .iter() - .enumerate() - .map(|(i, node)| async move { - let vault_config = VaultConfig {}; - if clean_data { - self.cluster_swarm.clean_data(&node.name).await?; - } - self.cluster_swarm - .spawn_new_instance(InstanceConfig { - validator_group: ValidatorGroup::new_for_index(i as u32), - application_config: Vault(vault_config), - }) - .await - }) - .collect(); - vaults.append(&mut vault_instances); - } else { - vault_nodes = vec![]; - } - lsr_nodes = try_join_all((0..num_validators).map(|i| async move { - let pod_name = lsr_pod_name(i); - self.cluster_swarm.allocate_node(&pod_name).await - })) - .await?; - let mut lsr_instances: Vec<_> = lsr_nodes - .iter() - .enumerate() - .map(|(i, node)| { - let vault_nodes = &vault_nodes; - async move { - let vault_addr = if enable_lsr && lsr_backend == "vault" { - Some(vault_nodes[i].internal_ip.clone()) - } else { - None - }; - let vault_namespace = if enable_lsr && lsr_backend == "vault" { - Some(validator_pod_name(i as u32)) - } else { - None - }; - let lsr_config = LSRConfig { - image_tag: image_tag.to_string(), - lsr_backend: lsr_backend.to_string(), - vault_addr, - vault_namespace, - }; - if clean_data { - self.cluster_swarm.clean_data(&node.name).await?; - } - self.cluster_swarm - .spawn_new_instance(InstanceConfig { - validator_group: ValidatorGroup::new_for_index(i as u32), - application_config: LSR(lsr_config), - }) - .await - } - }) - .collect(); - lsrs.append(&mut lsr_instances); - } else { - vault_nodes = vec![]; - } - - let lsrs = try_join_all(lsrs).await?; - let vaults = try_join_all(vaults).await?; - - let validator_nodes = 
try_join_all((0..num_validators).map(|i| async move { - let pod_name = validator_pod_name(i); - self.cluster_swarm.allocate_node(&pod_name).await - })) - .await?; - - let fullnode_nodes = try_join_all((0..num_validators).flat_map(move |validator_index| { - (0..num_fullnodes_per_validator).map(move |fullnode_index| async move { - let pod_name = fullnode_pod_name(validator_index, fullnode_index); - self.cluster_swarm.allocate_node(&pod_name).await - }) - })) - .await?; - - if !vault_nodes.is_empty() { - info!("Generating genesis with management tool."); - try_join_all(vault_nodes.iter().enumerate().map(|(i, node)| async move { - diem_retrier::retry_async(diem_retrier::fixed_retry_strategy(5000, 15), || { - Box::pin(async move { self.initialize_vault(i as u32, node).await }) - }) - .await - })) - .await?; - - waypoint = Some( - self.generate_genesis( - num_validators, - &vault_nodes, - &validator_nodes, - &fullnode_nodes, - move_modules_dir, - ) - .await?, - ); - info!("Done generating genesis."); - } - - let validators = (0..num_validators).map(|i| { - let validator_nodes = &validator_nodes; - let lsr_nodes = &lsr_nodes; - let vault_nodes = &vault_nodes; - async move { - let vault_addr = if enable_lsr && lsr_backend == "vault" { - Some(vault_nodes[i as usize].internal_ip.clone()) - } else { - None - }; - let vault_namespace = if enable_lsr && lsr_backend == "vault" { - Some(validator_pod_name(i)) - } else { - None - }; - let safety_rules_addr = if enable_lsr { - Some(lsr_nodes[i as usize].internal_ip.clone()) - } else { - None - }; - let validator_config = ValidatorConfig { - enable_lsr, - image_tag: image_tag.to_string(), - safety_rules_addr, - vault_addr, - vault_namespace, - }; - if clean_data { - self.cluster_swarm - .clean_data(&validator_nodes[i as usize].name) - .await?; - } - self.cluster_swarm - .spawn_new_instance(InstanceConfig { - validator_group: ValidatorGroup::new_for_index(i), - application_config: Validator(validator_config), - }) - .await - } - }); - - let fullnodes = (0..num_validators).flat_map(|validator_index| { - let fullnode_nodes = &fullnode_nodes; - let validator_nodes = &validator_nodes; - let vault_nodes = &vault_nodes; - (0..num_fullnodes_per_validator).map(move |fullnode_index| async move { - let vault_addr = if enable_lsr && lsr_backend == "vault" { - Some(vault_nodes[validator_index as usize].internal_ip.clone()) - } else { - None - }; - let vault_namespace = if enable_lsr && lsr_backend == "vault" { - Some(validator_pod_name(validator_index)) - } else { - None - }; - let seed_peer_ip = validator_nodes[validator_index as usize] - .internal_ip - .clone(); - let fullnode_config = FullnodeConfig { - fullnode_index, - image_tag: image_tag.to_string(), - seed_peer_ip, - vault_addr, - vault_namespace, - }; - if clean_data { - self.cluster_swarm - .clean_data( - &fullnode_nodes[(validator_index * num_fullnodes_per_validator - + fullnode_index) - as usize] - .name, - ) - .await?; - } - self.cluster_swarm - .spawn_new_instance(InstanceConfig { - validator_group: ValidatorGroup::new_for_index(validator_index), - application_config: Fullnode(fullnode_config), - }) - .await - }) - }); - - let validators = try_join_all(validators).await?; - let fullnodes = try_join_all(fullnodes).await?; - Ok((validators, lsrs, vaults, fullnodes, waypoint)) - } - - async fn initialize_vault(&self, validator_index: u32, vault_node: &KubeNode) -> Result<()> { - let addr = vault_node.internal_ip.clone(); - tokio::task::spawn_blocking(move || { - let pod_name = 
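The node-allocation fan-out above leans on `futures::future::try_join_all`: one future per pod, resolving to the first error if any allocation fails. A self-contained sketch of the same pattern, with a hypothetical `alloc` stand-in for `cluster_swarm.allocate_node` (editor's illustration, not part of the original patch):

use futures::future::try_join_all;

// Hypothetical stand-in for ClusterSwarmKube::allocate_node.
async fn alloc(i: u32) -> anyhow::Result<String> {
    Ok(format!("node-{}", i))
}

async fn alloc_all(n: u32) -> anyhow::Result<Vec<String>> {
    // One future per index; fails fast on the first allocation error.
    try_join_all((0..n).map(alloc)).await
}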
validator_pod_name(validator_index); - let mut vault_storage = Storage::from(Namespaced::new( - &pod_name, - Box::new(Storage::from(VaultStorage::new( - format!("http://{}:{}", addr, VAULT_PORT), - VAULT_TOKEN.to_string(), - None, - None, - true, - None, - None, - ))), - )); - if validator_index == 0 { - vault_storage.create_key(DIEM_ROOT_KEY).map_err(|e| { - format_err!("Failed to create {}__{} : {}", pod_name, DIEM_ROOT_KEY, e) - })?; - let key = vault_storage - .export_private_key(DIEM_ROOT_KEY) - .map_err(|e| { - format_err!("Failed to export {}__{} : {}", pod_name, DIEM_ROOT_KEY, e) - })?; - vault_storage - .import_private_key(TREASURY_COMPLIANCE_KEY, key) - .map_err(|e| { - format_err!( - "Failed to import {}__{} : {}", - pod_name, - TREASURY_COMPLIANCE_KEY, - e - ) - })?; - } - let keys = vec![ - OWNER_KEY, - OPERATOR_KEY, - CONSENSUS_KEY, - EXECUTION_KEY, - VALIDATOR_NETWORK_KEY, - FULLNODE_NETWORK_KEY, - ]; - for key in keys { - vault_storage - .create_key(key) - .map_err(|e| format_err!("Failed to create {}__{} : {}", pod_name, key, e))?; - } - vault_storage - .set(SAFETY_DATA, SafetyData::new(0, 0, 0, 0, None)) - .map_err(|e| format_err!("Failed to create {}/{}: {}", pod_name, SAFETY_DATA, e))?; - vault_storage - .set(WAYPOINT, Waypoint::default()) - .map_err(|e| format_err!("Failed to create {}/{} : {}", pod_name, WAYPOINT, e))?; - vault_storage - .set(GENESIS_WAYPOINT, Waypoint::default()) - .map_err(|e| format_err!("Failed to create {}/{} : {}", pod_name, GENESIS_WAYPOINT, e))?; - diem_network_address_encryption::Encryptor::new(vault_storage) - .initialize_for_testing() - .map_err(|e| { - format_err!( - "Failed to create {}/{} : {}", - pod_name, - VALIDATOR_NETWORK_ADDRESS_KEYS, - e - ) - })?; - Ok::<(), anyhow::Error>(()) - }) - .await??; - Ok(()) - } - - async fn generate_genesis( - &self, - num_validators: u32, - vault_nodes: &[KubeNode], - validator_nodes: &[KubeNode], - fullnode_nodes: &[KubeNode], - move_modules_dir: &str, - ) -> Result<Waypoint> { - let genesis_helper = GenesisHelper::new("/tmp/genesis.json"); - let owners: Vec<_> = (0..num_validators).map(validator_pod_name).collect(); - let layout = Layout { - owners: owners.clone(), - operators: owners, - diem_root: DIEM_ROOT_NS.to_string(), - treasury_compliance: DIEM_ROOT_NS.to_string(), - }; - let layout_path = "/tmp/layout.yaml"; - write!( - File::create(layout_path).map_err(|e| format_err!( - "Failed to create {} : {}", - layout_path, - e - ))?, - "{}", - toml::to_string(&layout)?
- ) - .map_err(|e| format_err!("Failed to write {} : {}", layout_path, e))?; - let token_path = "/tmp/token"; - write!( - File::create(token_path).map_err(|e| format_err!( - "Failed to create {} : {}", - token_path, - e - ))?, - "{}", - VAULT_TOKEN - ) - .map_err(|e| format_err!("Failed to write {} : {}", token_path, e))?; - genesis_helper - .set_layout(layout_path, "common") - .await - .map_err(|e| format_err!("Failed to set_layout : {}", e))?; - genesis_helper - .set_move_modules(move_modules_dir, "common") - .await - .map_err(|e| format_err!("Failed to set_move_modules : {}", e))?; - genesis_helper - .diem_root_key( - VAULT_BACKEND, - format!("http://{}:{}", vault_nodes[0].internal_ip, VAULT_PORT).as_str(), - token_path, - DIEM_ROOT_NS, - DIEM_ROOT_NS, - ) - .await - .map_err(|e| format_err!("Failed to diem_root_key : {}", e))?; - genesis_helper - .treasury_compliance_key( - VAULT_BACKEND, - format!("http://{}:{}", vault_nodes[0].internal_ip, VAULT_PORT).as_str(), - token_path, - DIEM_ROOT_NS, - DIEM_ROOT_NS, - ) - .await - .map_err(|e| format_err!("Failed to treasury_compliance_key : {}", e))?; - - for (i, node) in vault_nodes.iter().enumerate() { - let pod_name = validator_pod_name(i as u32); - genesis_helper - .owner_key( - VAULT_BACKEND, - format!("http://{}:{}", node.internal_ip, VAULT_PORT).as_str(), - token_path, - &pod_name, - &pod_name, - ) - .await - .map_err(|e| format_err!("Failed to owner_key for {} : {}", pod_name, e))?; - genesis_helper - .operator_key( - VAULT_BACKEND, - format!("http://{}:{}", node.internal_ip, VAULT_PORT).as_str(), - token_path, - &pod_name, - &pod_name, - ) - .await - .map_err(|e| format_err!("Failed to operator_key for {} : {}", pod_name, e))?; - let fullnode_ip = if fullnode_nodes.is_empty() { - "0.0.0.0" - } else { - &fullnode_nodes[i].internal_ip - }; - genesis_helper - .validator_config( - &pod_name, - NetworkAddress::from_str( - format!("/ip4/{}/tcp/{}", validator_nodes[i].internal_ip, 6180).as_str(), - ) - .expect("Failed to parse network address"), - NetworkAddress::from_str(format!("/ip4/{}/tcp/{}", fullnode_ip, 6182).as_str()) - .expect("Failed to parse network address"), - ChainId::test(), - VAULT_BACKEND, - format!("http://{}:{}", node.internal_ip, VAULT_PORT).as_str(), - token_path, - &pod_name, - &pod_name, - ) - .await - .map_err(|e| format_err!("Failed to validator_config for {} : {}", pod_name, e))?; - genesis_helper - .set_operator(&pod_name, &pod_name) - .await - .map_err(|e| format_err!("Failed to set_operator for {} : {}", pod_name, e))?; - } - genesis_helper - .genesis(ChainId::test(), Path::new(GENESIS_PATH)) - .await?; - let waypoint = genesis_helper - .create_waypoint(ChainId::test()) - .await - .map_err(|e| format_err!("Failed to create_waypoint : {}", e))?; - for (i, node) in vault_nodes.iter().enumerate() { - let pod_name = validator_pod_name(i as u32); - genesis_helper - .create_and_insert_waypoint( - ChainId::test(), - VAULT_BACKEND, - format!("http://{}:{}", node.internal_ip, VAULT_PORT).as_str(), - token_path, - &pod_name, - ) - .await - .map_err(|e| { - format_err!( - "Failed to create_and_insert_waypoint for {} : {}", - pod_name, - e - ) - })?; - } - genesis_helper - .extract_private_key( - format!("{}__{}", DIEM_ROOT_NS, DIEM_ROOT_KEY).as_str(), - "/tmp/mint.key", - VAULT_BACKEND, - format!("http://{}:{}", vault_nodes[0].internal_ip, VAULT_PORT).as_str(), - token_path, - ) - .await - .map_err(|e| format_err!("Failed to extract_private_key : {}", e))?; - Ok(waypoint) - } -} diff --git
a/testsuite/cluster-test/src/cluster_swarm/cluster_swarm_kube.rs b/testsuite/cluster-test/src/cluster_swarm/cluster_swarm_kube.rs deleted file mode 100644 index 2c08d23472509..0000000000000 --- a/testsuite/cluster-test/src/cluster_swarm/cluster_swarm_kube.rs +++ /dev/null @@ -1,905 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -#![forbid(unsafe_code)] - -use std::{collections::HashMap, env, sync::Arc}; - -use anyhow::{bail, format_err, Result}; -use async_trait::async_trait; - -use diem_logger::*; -use futures::{future::try_join_all, join, lock::Mutex, Future, FutureExt, TryFuture}; -use k8s_openapi::api::core::v1::{ConfigMap, Node, Pod, Service}; -use kube::{ - api::{Api, DeleteParams, PostParams}, - client::Client, - Config, -}; - -use crate::{cluster_swarm::ClusterSwarm, instance::Instance}; -use rand::{distributions::Alphanumeric, thread_rng, Rng}; - -use crate::instance::{ - ApplicationConfig::{Fullnode, Validator, Vault, LSR}, - InstanceConfig, -}; -use diem_config::config::DEFAULT_JSON_RPC_PORT; -use k8s_openapi::api::batch::v1::Job; -use kube::api::ListParams; -use reqwest::Client as HttpClient; -use rusoto_core::Region; -use rusoto_s3::{PutObjectRequest, S3Client, S3}; -use rusoto_sts::WebIdentityProvider; -use serde::de::DeserializeOwned; -use std::{collections::HashSet, convert::TryFrom, process::Command, time::Duration}; -use tokio::sync::Semaphore; - -pub const CFG_SEED: &str = "1337133713371337133713371337133713371337133713371337133713371337"; -const DEFAULT_NAMESPACE: &str = "default"; -const ERROR_NOT_FOUND: u16 = 404; -const GENESIS_PATH: &str = "/tmp/genesis.blob"; -const HEALTH_CHECK_URL: &str = "http://127.0.0.1:8001"; -const KUBECTL_BIN: &str = "/usr/local/bin/kubectl"; - -// We use the macros below to get around the current limitations of the -// "include_str!" macro (which loads the file content at compile time, rather -// than at runtime). -// TODO(joshlind): Remove me once we support runtime file loading. - -// Config file names. -macro_rules! FULLNODE_CONFIG { - () => { - "configs/fullnode.yaml" - }; -} -macro_rules! SAFETY_RULES_CONFIG { - () => { - "configs/safetyrules.yaml" - }; -} -macro_rules! VALIDATOR_CONFIG { - () => { - "configs/validator.yaml" - }; -} - -// Fluent bit file names. -macro_rules! FLUENT_BIT_CONF { - () => { - "fluent-bit/fluent-bit.conf" - }; -} -macro_rules! FLUENT_BIT_PARSERS_CONF { - () => { - "fluent-bit/parsers.conf" - }; -} - -// Template file names. -macro_rules! JOB_TEMPLATE { - () => { - "templates/job_template.yaml" - }; -} -macro_rules! DIEM_NODE_SERVICE_TEMPLATE { - () => { - "templates/diem_node_service_template.yaml" - }; -} -macro_rules! DIEM_NODE_SPEC_TEMPLATE { - () => { - "templates/diem_node_spec_template.yaml" - }; -} -macro_rules! LSR_SERVICE_TEMPLATE { - () => { - "templates/lsr_service_template.yaml" - }; -} -macro_rules! LSR_SPEC_TEMPLATE { - () => { - "templates/lsr_spec_template.yaml" - }; -} -macro_rules! VAULT_SERVICE_TEMPLATE { - () => { - "templates/vault_service_template.yaml" - }; -} -macro_rules! 
VAULT_SPEC_TEMPLATE { - () => { - "templates/vault_spec_template.yaml" - }; -} - -#[derive(Clone)] -pub struct ClusterSwarmKube { - client: Client, - http_client: HttpClient, - s3_client: S3Client, - pub node_map: Arc>>, -} - -impl ClusterSwarmKube { - pub async fn new() -> Result { - let http_client = HttpClient::new(); - // This uses kubectl proxy locally to forward connections to kubernetes api server - Command::new(KUBECTL_BIN).arg("proxy").spawn()?; - diem_retrier::retry_async(k8s_retry_strategy(), || { - Box::pin(async move { - debug!("Running local kube pod healthcheck on {}", HEALTH_CHECK_URL); - reqwest::get(HEALTH_CHECK_URL).await?.text().await?; - info!("Local kube pod healthcheck passed"); - Ok::<(), reqwest::Error>(()) - }) - }) - .await?; - let config = Config::new( - reqwest::Url::parse(HEALTH_CHECK_URL).expect("Failed to parse kubernetes endpoint url"), - ); - let client = Client::try_from(config)?; - let credentials_provider = WebIdentityProvider::from_k8s_env(); - let dispatcher = - rusoto_core::HttpClient::new().expect("failed to create request dispatcher"); - let s3_client = S3Client::new_with(dispatcher, credentials_provider, Region::UsWest2); - let node_map = Arc::new(Mutex::new(HashMap::new())); - Ok(Self { - client, - http_client, - s3_client, - node_map, - }) - } - - fn service_spec(&self, peer_id: String) -> Result { - let service_yaml = format!( - include_str!(DIEM_NODE_SERVICE_TEMPLATE!()), - peer_id = &peer_id - ); - get_spec_instance_from_template(service_yaml) - } - - fn lsr_spec(&self, pod_name: &str, node_name: &str, image_tag: &str) -> Result<(Pod, Service)> { - let pod_yaml = format!( - include_str!(LSR_SPEC_TEMPLATE!()), - pod_name = pod_name, - image_tag = image_tag, - node_name = node_name, - ); - let pod_spec = get_spec_instance_from_template(pod_yaml)?; - - let service_yaml = format!(include_str!(LSR_SERVICE_TEMPLATE!()), pod_name = pod_name,); - let service_spec = get_spec_instance_from_template(service_yaml)?; - - Ok((pod_spec, service_spec)) - } - - fn vault_spec(&self, validator_index: u32, node_name: &str) -> Result<(Pod, Service)> { - let pod_yaml = format!( - include_str!(VAULT_SPEC_TEMPLATE!()), - validator_index = validator_index, - node_name = node_name, - ); - let pod_spec = get_spec_instance_from_template(pod_yaml)?; - - let service_yaml = format!( - include_str!(VAULT_SERVICE_TEMPLATE!()), - validator_index = validator_index, - ); - let service_spec = get_spec_instance_from_template(service_yaml)?; - - Ok((pod_spec, service_spec)) - } - - fn diem_node_spec( - &self, - pod_app: &str, - pod_name: &str, - node_name: &str, - image_tag: &str, - ) -> Result { - let pod_yaml = format!( - include_str!(DIEM_NODE_SPEC_TEMPLATE!()), - pod_app = pod_app, - pod_name = pod_name, - image_tag = image_tag, - node_name = node_name, - ); - get_spec_instance_from_template(pod_yaml) - } - - fn job_spec( - &self, - k8s_node: &str, - docker_image: &str, - command: &str, - job_name: &str, - back_off_limit: u32, - ) -> Result<(Job, String)> { - let suffix = thread_rng() - .sample_iter(&Alphanumeric) - .take(10) - .map(char::from) - .collect::() - .to_ascii_lowercase(); - let job_full_name = format!("{}-{}", job_name, suffix); - - let job_yaml = format!( - include_str!(JOB_TEMPLATE!()), - name = &job_full_name, - label = job_name, - image = docker_image, - node_name = k8s_node, - command = command, - back_off_limit = back_off_limit, - ); - let job_spec = get_spec_instance_from_template(job_yaml)?; - Ok((job_spec, job_full_name)) - } - - async fn 
wait_job_completion( - &self, - job_name: &str, - back_off_limit: u32, - killed: bool, - ) -> Result<bool> { - diem_retrier::retry_async(k8s_retry_strategy(), || { - let job_api: Api<Job> = Api::namespaced(self.client.clone(), DEFAULT_NAMESPACE); - let job_name = job_name.to_string(); - Box::pin(async move { - match job_api.get(&job_name).await { - Ok(job) => { - let job_status = job.status.as_ref().ok_or_else(|| { - format_err!("status not found for job {}: {:?}", job_name, &job) - })?; - if let Some(succeeded) = job_status.succeeded { - if succeeded == 1 { - return Ok(true); - } - } - if let Some(failed) = job_status.failed { - if failed as u32 == back_off_limit + 1 { - error!("job {} failed to complete", job_name); - return Ok(false); - } - } - bail!("job {} is still in progress", job_name) - } - Err(e) => { - if killed { - info!("Job {} has been killed already", job_name); - return Ok(true); - } - bail!("job_api.get failed for job {} : {:?}", job_name, e) - } - } - }) - }) - .await - } - - pub async fn kill_job(&self, job_full_name: &str) -> Result<()> { - let dp = DeleteParams::default(); - let job_api: Api<Job> = Api::namespaced(self.client.clone(), DEFAULT_NAMESPACE); - job_api - .delete(job_full_name, &dp) - .await? - .map_left(|o| debug!("Deleting Job: {:?}", o.status)) - .map_right(|s| debug!("Deleted Job: {:?}", s)); - let back_off_limit = 0; - // the job might have been deleted already, so we do not treat errors here as fatal - match self - .wait_job_completion(job_full_name, back_off_limit, true) - .await - { - Ok(_) => debug!("Killing job {} returned job_status.success", job_full_name), - Err(error) => info!( - "Killing job {} returned job_status.failed: {}", - job_full_name, error - ), - } - - Ok(()) - } - - // just ensures jobs are started, but does not wait on completion - async fn start_jobs(&self, jobs: Vec<Job>) -> Result<Vec<String>> { - let pp = PostParams::default(); - let job_api: Api<Job> = Api::namespaced(self.client.clone(), DEFAULT_NAMESPACE); - let create_jobs_futures = jobs.iter().map(|job| job_api.create(&pp, job)); - let job_names: Vec<String> = try_join_all_limit(create_jobs_futures.collect()) - .await? - .iter() - .map(|job| -> Result<String> { - Ok(job - .metadata - .name - .as_ref() - .ok_or_else(|| format_err!("name not found for job {:?}", &job))?
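Condensed, the poller above recognizes three outcomes: success when `succeeded == 1`, permanent failure once `failed` reaches `back_off_limit + 1`, and otherwise a retryable in-progress state. A small sketch of that decision with the kube status fields stubbed as plain `Option`s (editor's illustration, not part of the original patch):

// None = still running (caller retries); Some(true/false) = terminal outcome.
fn job_outcome(succeeded: Option<i32>, failed: Option<i32>, back_off_limit: u32) -> Option<bool> {
    if succeeded == Some(1) {
        return Some(true); // job completed successfully
    }
    if failed.map(|f| f as u32) == Some(back_off_limit + 1) {
        return Some(false); // retry budget exhausted
    }
    None
}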
- .clone()) - }) - .collect::>()?; - Ok(job_names) - } - - async fn run_jobs(&self, jobs: Vec, back_off_limit: u32) -> Result<()> { - let job_names: Vec = self.start_jobs(jobs).await?; - let wait_jobs_futures = job_names - .iter() - .map(|job_name| self.wait_job_completion(job_name, back_off_limit, false)); - let wait_jobs_results = try_join_all_limit(wait_jobs_futures.collect()).await?; - if wait_jobs_results.iter().any(|r| !r) { - bail!("one of the jobs failed") - } - Ok(()) - } - - async fn list_nodes(&self) -> Result> { - let node_api: Api = Api::all(self.client.clone()); - let lp = ListParams::default().labels("nodeType=validators"); - let nodes = node_api.list(&lp).await?.items; - nodes.into_iter().map(KubeNode::try_from).collect() - } - - async fn delete_resource(&self, name: &str) -> Result<()> - where - T: k8s_openapi::Resource - + Clone - + serde::de::DeserializeOwned - + kube::api::Meta - + Send - + Sync, - { - debug!("Deleting {} {}", T::KIND, name); - let resource_api: Api = Api::namespaced(self.client.clone(), DEFAULT_NAMESPACE); - diem_retrier::retry_async(k8s_retry_strategy(), || { - let resource_api = resource_api.clone(); - let name = name.to_string(); - Box::pin(async move { - match resource_api.delete(&name, &Default::default()).await { - Ok(_) => {} - Err(kube::Error::Api(ae)) => { - if ae.code == ERROR_NOT_FOUND { - debug!("{} {} deleted successfully", T::KIND, name); - return Ok(()); - } else { - error!( - "delete failed for {} {} with kube::Error::Api: {}", - T::KIND, - name, - ae - ) - } - } - Err(err) => { - error!("delete failed for {} {} with error: {}", T::KIND, name, err) - } - } - match resource_api.get(&name).await { - Ok(_) => { - bail!("Waiting for {} {} to be deleted..", T::KIND, name); - } - Err(kube::Error::Api(ae)) => { - if ae.code == ERROR_NOT_FOUND { - debug!("{} {} deleted successfully", T::KIND, name); - Ok(()) - } else { - bail!("Waiting for {} {} to be deleted..", T::KIND, name) - } - } - Err(err) => bail!( - "Waiting for {} {} to be deleted... Error: {}", - T::KIND, - name, - err - ), - } - }) - }) - .await - .map_err(|e| format_err!("Failed to delete {} {}: {:?}", T::KIND, name, e)) - } - - async fn remove_all_network_effects_helper(&self) -> Result<()> { - debug!("Trying to remove_all_network_effects"); - let back_off_limit = 2; - - let jobs: Vec = self - .list_nodes() - .await? - .iter() - .map(|node| -> Result { - let suffix = thread_rng() - .sample_iter(&Alphanumeric) - .take(10) - .map(char::from) - .collect::() - .to_ascii_lowercase(); - let job_name = format!("remove-network-effects-{}", suffix); - let job_yaml = format!( - include_str!(JOB_TEMPLATE!()), - name = &job_name, - label = "remove-network-effects", - image = "853397791086.dkr.ecr.us-west-2.amazonaws.com/cluster-test-util:latest", - node_name = node.name, - command = "tc qdisc delete dev eth0 root || true", - back_off_limit = back_off_limit, - ); - debug!("Removing network effects from node {}", node.name); - get_spec_instance_from_template(job_yaml) - }) - .collect::>()?; - self.run_jobs(jobs, back_off_limit).await - } - - pub async fn get_workspace(&self) -> Result { - let cm_api: Api = Api::namespaced(self.client.clone(), DEFAULT_NAMESPACE); - let data = cm_api - .get("workspace") - .await? 
- .data - .ok_or_else(|| format_err!("data not found for ConfigMap"))?; - let workspace = data - .get("workspace") - .ok_or_else(|| format_err!("Failed to find workspace"))?; - Ok(workspace.clone()) - } - - pub async fn spawn_job( - &self, - k8s_node: &str, - docker_image: &str, - command: &str, - job_name: &str, - ) -> Result<String> { - let back_off_limit = 0; - let (job_spec, job_full_name) = - self.job_spec(k8s_node, docker_image, command, job_name, back_off_limit)?; - debug!("Starting job {} for node {}", job_name, k8s_node); - self.start_jobs(vec![job_spec]).await?; - Ok(job_full_name) - } - - pub async fn run( - &self, - k8s_node: &str, - docker_image: &str, - command: &str, - job_name: &str, - ) -> Result<()> { - let back_off_limit = 0; - let (job_spec, _) = - self.job_spec(k8s_node, docker_image, command, job_name, back_off_limit)?; - debug!("Running job {} for node {}", job_name, k8s_node); - self.run_jobs(vec![job_spec], back_off_limit).await - } - - pub async fn allocate_node(&self, pod_name: &str) -> Result<KubeNode> { - diem_retrier::retry_async(k8s_retry_strategy(), || { - Box::pin(async move { self.allocate_node_impl(pod_name).await }) - }) - .await - } - - async fn allocate_node_impl(&self, pod_name: &str) -> Result<KubeNode> { - let nodes = self.list_nodes().await?; - let nodes_count = nodes.len(); - // Holding the lock for the whole read-verify-write to avoid race conditions on this map - let mut node_map = self.node_map.lock().await; - if let Some(existed) = node_map.get(pod_name) { - return Ok(existed.clone()); - } - let used_nodes: HashSet<_> = node_map.values().map(|node| &node.name).collect(); - for node in nodes { - if !used_nodes.contains(&node.name) { - node_map.insert(pod_name.to_string(), node.clone()); - return Ok(node); - } - } - Err(format_err!( - "Cannot find a free node, got total {} nodes", - nodes_count - )) - } - - pub async fn upsert_node(&self, instance_config: InstanceConfig) -> Result<Instance> { - let pod_name = instance_config.pod_name(); - let pod_api: Api<Pod> = Api::namespaced(self.client.clone(), DEFAULT_NAMESPACE); - if pod_api.get(&pod_name).await.is_ok() { - self.delete_resource::<Pod>(&pod_name).await?; - } - let node = self - .allocate_node(&pod_name) - .await - .map_err(|e| format_err!("Failed to allocate node: {}", e))?; - debug!( - "Configuring fluent-bit, genesis and config for pod {} on {}", - pod_name, node.name - ); - if instance_config.application_config.needs_fluentbit() { - self.config_fluentbit("events", &pod_name, &node.name) - .await?; - } - if instance_config.application_config.needs_genesis() { - self.put_genesis_file(&pod_name, &node.name).await?; - } - if instance_config.application_config.needs_config() { - self.generate_config(&instance_config, &pod_name, &node.name) - .await?; - } - debug!("Creating pod {} on {:?}", pod_name, node); - let (p, s): (Pod, Service) = match &instance_config.application_config { - Validator(validator_config) => ( - self.diem_node_spec( - "diem-validator", - pod_name.as_str(), - &node.name, - &validator_config.image_tag, - )?, - self.service_spec(pod_name.clone())?, - ), - Fullnode(fullnode_config) => ( - self.diem_node_spec( - "diem-fullnode", - pod_name.as_str(), - &node.name, - &fullnode_config.image_tag, - )?, - self.service_spec(pod_name.clone())?, - ), - Vault(_vault_config) => { - self.vault_spec(instance_config.validator_group.index_only(), &node.name)? - } - LSR(lsr_config) => { - self.lsr_spec(pod_name.as_str(), &node.name, &lsr_config.image_tag)?
- } - }; - match pod_api.create(&PostParams::default(), &p).await { - Ok(o) => { - debug!( - "Created pod {}", - o.metadata - .name - .as_ref() - .ok_or_else(|| { format_err!("name not found for pod {}", pod_name) })? - ); - } - Err(e) => bail!("Failed to create pod {} : {}", pod_name, e), - } - let service_api: Api = Api::namespaced(self.client.clone(), DEFAULT_NAMESPACE); - match service_api.create(&PostParams::default(), &s).await { - Ok(o) => { - debug!( - "Created service {}", - o.metadata - .name - .as_ref() - .ok_or_else(|| { format_err!("name not found for service") })? - ); - } - Err(kube::Error::Api(ae)) => { - if ae.code == 409 { - // 409 == service already exists - debug!("Service already exists. Skipping") - } else { - bail!("Failed to create service : {}", ae) - } - } - Err(e) => bail!("Failed to create service : {}", e), - } - let ac_port = DEFAULT_JSON_RPC_PORT as u32; - let instance = Instance::new_k8s( - pod_name.clone(), - node.internal_ip, - ac_port, - node.name.clone(), - instance_config.clone(), - self.http_client.clone(), - self.clone(), - ); - Ok(instance) - } - - pub async fn delete_node(&self, instance_config: &InstanceConfig) -> Result<()> { - let pod_name = instance_config.pod_name(); - let service_name = pod_name.clone(); - self.delete_resource::(&pod_name).await?; - self.delete_resource::(&service_name).await - } - - async fn remove_all_network_effects(&self) -> Result<()> { - diem_retrier::retry_async(k8s_retry_strategy(), || { - Box::pin(async move { self.remove_all_network_effects_helper().await }) - }) - .await - } - - pub async fn cleanup(&self) -> Result<()> { - self.delete_all() - .await - .map_err(|e| format_err!("delete_all failed: {}", e))?; - self.remove_all_network_effects() - .await - .map_err(|e| format_err!("remove_all_network_effects: {}", e)) - } - - pub async fn delete(&self) -> Result<()> - where - T: k8s_openapi::Resource - + Clone - + serde::de::DeserializeOwned - + kube::api::Meta - + Send - + Sync, - { - let api: Api = Api::namespaced(self.client.clone(), DEFAULT_NAMESPACE); - let resource_names: Vec = api - .list(&ListParams { - label_selector: Some("diem-node=true".to_string()), - ..Default::default() - }) - .await? - .iter() - .map(|res| -> Result { - Ok(res - .meta() - .name - .as_ref() - .ok_or_else(|| format_err!("name not found"))? 
- .clone()) - .collect::<Result<Vec<String>>>()?; - let delete_futures = resource_names - .iter() - .map(|resource_name| self.delete_resource::<T>(resource_name)); - try_join_all_limit(delete_futures.collect()).await?; - Ok(()) - } - - pub async fn delete_all(&self) -> Result<()> { - let del_pod = self.delete::<Pod>().boxed(); - let del_service = self.delete::<Service>().boxed(); - let del_job = self.delete::<Job>().boxed(); - let _ = join!(del_pod, del_service, del_job); - Ok(()) - } - - /// Runs a command on the provided host in a separate utility container based on the cluster-test-util image - pub async fn util_cmd<S: AsRef<str>>( - &self, - command: S, - k8s_node: &str, - job_name: &str, - ) -> Result<()> { - self.run( - k8s_node, - "853397791086.dkr.ecr.us-west-2.amazonaws.com/cluster-test-util:latest", - command.as_ref(), - job_name, - ) - .await - } - - async fn config_fluentbit(&self, input_tag: &str, pod_name: &str, node: &str) -> Result<()> { - let parsers_config = include_str!(FLUENT_BIT_PARSERS_CONF!()).to_string(); - let fluentbit_config = format!( - include_str!(FLUENT_BIT_CONF!()), - input_tag = input_tag, - pod_name = pod_name - ); - let dir = "/opt/diem/data/fluent-bit/"; - self.put_file( - node, - pod_name, - format!("{}parsers.conf", dir).as_str(), - parsers_config.as_bytes(), - ) - .await?; - self.put_file( - node, - pod_name, - format!("{}fluent-bit.conf", dir).as_str(), - fluentbit_config.as_bytes(), - ) - .await?; - Ok(()) - } - - async fn put_genesis_file(&self, pod_name: &str, node: &str) -> Result<()> { - let genesis = std::fs::read(GENESIS_PATH) - .map_err(|e| format_err!("Failed to read {} : {}", GENESIS_PATH, e))?; - self.put_file( - node, - pod_name, - "/opt/diem/etc/genesis.blob", - genesis.as_slice(), - ) - .await?; - Ok(()) - } - - async fn generate_config( - &self, - instance_config: &InstanceConfig, - pod_name: &str, - node: &str, - ) -> Result<()> { - let node_config = match &instance_config.application_config { - Validator(validator_config) => Some(format!( - include_str!(VALIDATOR_CONFIG!()), - vault_addr = validator_config - .vault_addr - .as_ref() - .unwrap_or(&"".to_string()), - vault_ns = validator_config - .vault_namespace - .as_ref() - .unwrap_or(&"".to_string()), - safety_rules_addr = validator_config - .safety_rules_addr - .as_ref() - .unwrap_or(&"".to_string()), - )), - Fullnode(fullnode_config) => Some(format!( - include_str!(FULLNODE_CONFIG!()), - vault_addr = fullnode_config - .vault_addr - .as_ref() - .unwrap_or(&"".to_string()), - vault_ns = fullnode_config - .vault_namespace - .as_ref() - .unwrap_or(&"".to_string()), - seed_peer_ip = fullnode_config.seed_peer_ip, - )), - LSR(lsr_config) => Some(format!( - include_str!(SAFETY_RULES_CONFIG!()), - vault_addr = lsr_config.vault_addr.as_ref().unwrap_or(&"".to_string()), - vault_ns = lsr_config - .vault_namespace - .as_ref() - .unwrap_or(&"".to_string()), - )), - _ => None, - }; - - if let Some(node_config) = node_config { - self.put_file( - node, - pod_name, - "/opt/diem/etc/node.yaml", - node_config.as_bytes(), - ) - .await?; - } - - Ok(()) - } -} - -/// Retrieves a spec instance of type T from a T template file.
-fn get_spec_instance_from_template<T: DeserializeOwned>(template_yaml: String) -> Result<T> { - let spec: serde_yaml::Value = serde_yaml::from_str(&template_yaml)?; - let spec = serde_json::value::to_value(spec)?; - serde_json::from_value(spec).map_err(|e| format_err!("serde_json::from_value failed: {}", e)) -} - -#[async_trait] -impl ClusterSwarm for ClusterSwarmKube { - async fn spawn_new_instance(&self, instance_config: InstanceConfig) -> Result<Instance> { - self.upsert_node(instance_config).await - } - - async fn clean_data(&self, node: &str) -> Result<()> { - self.util_cmd("rm -rf /opt/diem/data/*", node, "clean-data") - .await - } - - async fn get_node_name(&self, pod_name: &str) -> Result<String> { - let node = self.allocate_node(pod_name).await?; - Ok(node.name) - } - - async fn get_grafana_baseurl(&self) -> Result<String> { - let workspace = self.get_workspace().await?; - Ok(format!( - "http://grafana.{}-k8s-testnet.aws.hlw3truzy4ls.com", - workspace - )) - } - - async fn put_file(&self, node: &str, pod_name: &str, path: &str, content: &[u8]) -> Result<()> { - let bucket = "toro-cluster-test-flamegraphs"; - let run_id = env::var("RUN_ID").expect("RUN_ID is not set."); - diem_retrier::retry_async(k8s_retry_strategy(), || { - let run_id = &run_id; - let content = content.to_vec(); - Box::pin(async move { - self.s3_client - .put_object(PutObjectRequest { - bucket: bucket.to_string(), - key: format!("data/{}/{}/{}", run_id, pod_name, path), - body: Some(content.into()), - ..Default::default() - }) - .await - .map_err(|e| format_err!("put_object failed : {}", e)) - }) - }) - .await?; - self.util_cmd( - format!( - "aws s3 cp s3://{}/data/{}/{}/{path} {path}", - bucket, - run_id, - pod_name, - path = path - ), - node, - "put-file", - ) - .await - .map_err(|e| format_err!("aws s3 cp failed : {}", e))?; - Ok(()) - } -} - -#[derive(Clone, Debug)] -pub struct KubeNode { - pub name: String, - pub provider_id: String, - pub internal_ip: String, -} - -impl TryFrom<Node> for KubeNode { - type Error = anyhow::Error; - - fn try_from(node: Node) -> Result<Self> { - let metadata = node.metadata; - let spec = node - .spec - .ok_or_else(|| format_err!("spec not found for node"))?; - let provider_id = spec - .provider_id - .ok_or_else(|| format_err!("provider_id not found for node"))?; - let name = metadata - .name - .ok_or_else(|| format_err!("node name not found"))?; - let status = node - .status - .ok_or_else(|| format_err!("status not found for node"))?; - let addresses = status - .addresses - .ok_or_else(|| format_err!("addresses not found for node"))?; - let internal_address = addresses - .iter() - .find(|a| a.type_ == "InternalIP") - .ok_or_else(|| format_err!("internal address not found"))?; - let internal_ip = internal_address.address.clone(); - Ok(Self { - name, - provider_id, - internal_ip, - }) - } -} - -async fn try_join_all_limit<T, E, F: Future<Output = std::result::Result<T, E>>>( - futures: Vec<F>, -) -> std::result::Result<Vec<T>, E> { - let semaphore = Semaphore::new(32); - let futures = futures - .into_iter() - .map(|f| acquire_and_execute(&semaphore, f)); - try_join_all(futures).await -} - -async fn acquire_and_execute<F: Future>(semaphore: &Semaphore, f: F) -> F::Output { - let _permit = semaphore.acquire().await; - f.await -} - -fn k8s_retry_strategy() -> impl Iterator<Item = Duration> { - diem_retrier::exp_retry_strategy(1000, 5000, 30) -} diff --git a/testsuite/cluster-test/src/cluster_swarm/configs/fullnode.yaml b/testsuite/cluster-test/src/cluster_swarm/configs/fullnode.yaml deleted file mode 100644 index 86318e43ca8a3..0000000000000 --- a/testsuite/cluster-test/src/cluster_swarm/configs/fullnode.yaml +++ /dev/null @@ -1,73
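`try_join_all_limit` above bounds fan-out to 32 concurrent futures by routing each one through a semaphore permit. A self-contained sketch of that bounding pattern (editor's illustration using current tokio APIs; names are hypothetical and not part of the original patch):

use futures::future::try_join_all;
use std::future::Future;
use tokio::sync::Semaphore;

async fn bounded<T, E, F>(futures: Vec<F>, limit: usize) -> Result<Vec<T>, E>
where
    F: Future<Output = Result<T, E>>,
{
    let semaphore = Semaphore::new(limit);
    // Each wrapped future runs only while it holds one of `limit` permits.
    try_join_all(futures.into_iter().map(|f| async {
        let _permit = semaphore.acquire().await.expect("semaphore never closed");
        f.await
    }))
    .await
}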
+0,0 @@ -base: - role: "full_node" - waypoint: - from_storage: - type: "vault" - server: "http://{vault_addr}:8200" - namespace: "{vault_ns}" - token: - from_config: root - -execution: - genesis_file_location: "genesis.blob" -full_node_networks: -- network_id: - private: "vfn" - listen_address: "/ip4/0.0.0.0/tcp/6181" - seeds: - d58bc7bb154b38039bc9096ce04e1237: - addresses: - - "/ip4/{seed_peer_ip}/tcp/6181/ln-noise-ik/f0274c2774519281a8332d0bb9d8101bd58bc7bb154b38039bc9096ce04e1237/ln-handshake/0" - role: "Validator" -- network_id: "public" - discovery_method: "onchain" - listen_address: "/ip4/0.0.0.0/tcp/6182" - identity: - type: "from_storage" - key_name: "fullnode_network" - peer_id_name: "owner_account" - backend: - type: "vault" - server: "http://{vault_addr}:8200" - namespace: "{vault_ns}" - token: - from_config: root - -mempool: - default_failovers: 0 - -storage: - prune_window: 50000 - -json_rpc: - address: 0.0.0.0:8080 - -# this is only enabled when the binary is compiled with failpoints feature, otherwise no-op -failpoints: - api::endpoint_index: 1%return - api::endpoint_get_account: 1%return - api::endpoint_get_account_resources: 1%return - api::endpoint_get_account_modules: 1%return - api::endpoint_get_transaction: 1%return - api::endpoint_get_transactions: 1%return - api::endpoint_get_account_transactions: 1%return - api::endpoint_submit_json_transactions: 1%return - api::endpoint_submit_bcs_transactions: 1%return - api::endpoint_create_signing_message: 1%return - api::endpoint_get_events_by_event_key: 1%return - api::endpoint_get_events_by_event_handle: 1%return - jsonrpc::get_latest_ledger_info: 1%return - jsonrpc::method::submit::mempool_sender: 1%return - jsonrpc::method::submit: 1%return - jsonrpc::method::get_metadata: 1%return - jsonrpc::method::get_account: 1%return - jsonrpc::method::get_transactions: 1%return - jsonrpc::method::get_account_transaction: 1%return - jsonrpc::method::get_events: 1%return - jsonrpc::method::get_currencies: 1%return - jsonrpc::method::get_state_proof: 1%return - jsonrpc::method::get_account_state_with_proof: 1%return - jsonrpc::method::get_network_status: 1%return - state_sync::apply_chunk: 0.1%return - state_sync::process_chunk_request: 0.1%return - mempool::send_to: 1%return diff --git a/testsuite/cluster-test/src/cluster_swarm/configs/safetyrules.yaml b/testsuite/cluster-test/src/cluster_swarm/configs/safetyrules.yaml deleted file mode 100644 index f8f869113a906..0000000000000 --- a/testsuite/cluster-test/src/cluster_swarm/configs/safetyrules.yaml +++ /dev/null @@ -1,9 +0,0 @@ -service: - type: "process" - server_address: "/ip4/0.0.0.0/tcp/6185" -backend: - type: "vault" - server: "http://{vault_addr}:8200" - namespace: "{vault_ns}" - token: - from_config: root diff --git a/testsuite/cluster-test/src/cluster_swarm/configs/validator.yaml b/testsuite/cluster-test/src/cluster_swarm/configs/validator.yaml deleted file mode 100644 index 690e0b7a2f44b..0000000000000 --- a/testsuite/cluster-test/src/cluster_swarm/configs/validator.yaml +++ /dev/null @@ -1,70 +0,0 @@ -base: - waypoint: - from_storage: - type: "vault" - server: "http://{vault_addr}:8200" - namespace: "{vault_ns}" - token: - from_config: root - -consensus: - safety_rules: - service: - type: "process" - server_address: "/ip4/{safety_rules_addr}/tcp/6185" - -execution: - genesis_file_location: "/opt/diem/etc/genesis.blob" - backend: - type: "vault" - server: "http://{vault_addr}:8200" - namespace: "{vault_ns}" - token: - from_config: root - -validator_network: - discovery_method: 
"onchain" - mutual_authentication: true - identity: - type: "from_storage" - key_name: "validator_network" - peer_id_name: "owner_account" - backend: - type: "vault" - server: "http://{vault_addr}:8200" - namespace: "{vault_ns}" - token: - from_config: root - -full_node_networks: - - network_id: - private: "vfn" - listen_address: "/ip4/0.0.0.0/tcp/6181" - identity: - type: "from_config" - key: "b0f405a3e75516763c43a2ae1d70423699f34cd68fa9f8c6bb2d67aa87d0af69" - peer_id: "d58bc7bb154b38039bc9096ce04e1237" - -storage: - prune_window: 50000 - -json_rpc: - address: 0.0.0.0:8080 - -# this is only enabled when the binary is compiled with failpoints feature, otherwise no-op -failpoints: - consensus::process_proposal_msg: 5%return - consensus::process_sync_info_msg: 5%return - consensus::process_vote_msg: 5%return - consensus::process_block_retrieval: 5%return - consensus::sync_to: 1%return - consensus::compute: 1%return - consensus::pull_txn: 1%return - executor::vm_execute_chunk: 1%return - executor::commit_chunk: 1%return - executor::vm_execute_block: 1%return - #executor::commit_blocks: 0.01%return - state_sync::request_sync: 1%return - state_sync::apply_chunk: 1%return - state_sync::process_chunk_request: 0.1%return - mempool::send_to: 1%return diff --git a/testsuite/cluster-test/src/cluster_swarm/fluent-bit/fluent-bit.conf b/testsuite/cluster-test/src/cluster_swarm/fluent-bit/fluent-bit.conf deleted file mode 100644 index a87eea69560bb..0000000000000 --- a/testsuite/cluster-test/src/cluster_swarm/fluent-bit/fluent-bit.conf +++ /dev/null @@ -1,39 +0,0 @@ -[SERVICE] - Flush 5 - Log_Level info - Daemon off - Parsers_File /opt/diem/data/fluent-bit/parsers.conf - -[INPUT] - Name tcp - Tag validator - Listen 0.0.0.0 - Port 5044 - Chunk_Size 32 - Buffer_Size 64 - Format json - -[INPUT] - Name tail - Tag {input_tag} - Path /opt/diem/data/events.log - Mem_Buf_Limit 200MB - Refresh_Interval 10 - Skip_Long_Lines On - Parser events - -[FILTER] - Name record_modifier - Match * - Record kubernetes.pod_name {pod_name} - -[OUTPUT] - Name es - Match * - Host elasticsearch-master - Port 9200 - Logstash_Format On - Replace_Dots Off - Retry_Limit False - Logstash_Prefix kubernetes_cluster - Generate_ID On diff --git a/testsuite/cluster-test/src/cluster_swarm/fluent-bit/parsers.conf b/testsuite/cluster-test/src/cluster_swarm/fluent-bit/parsers.conf deleted file mode 100644 index fc9d7525815bf..0000000000000 --- a/testsuite/cluster-test/src/cluster_swarm/fluent-bit/parsers.conf +++ /dev/null @@ -1,5 +0,0 @@ -[PARSER] - Name events - Format json - Time_Key timestamp - Time_Format %Y-%m-%d %H:%M:%S diff --git a/testsuite/cluster-test/src/cluster_swarm/mod.rs b/testsuite/cluster-test/src/cluster_swarm/mod.rs deleted file mode 100644 index b32bd7c2c75ee..0000000000000 --- a/testsuite/cluster-test/src/cluster_swarm/mod.rs +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -pub mod cluster_swarm_kube; - -use crate::instance::{Instance, InstanceConfig}; -use anyhow::Result; -use async_trait::async_trait; - -#[async_trait] -pub trait ClusterSwarm: Send + Sync { - /// Spawns a new instance. - async fn spawn_new_instance(&self, instance_config: InstanceConfig) -> Result; - - /// If deleting /opt/diem/data/* is required, call clean_date before calling - /// spawn_new_instance. 
-    async fn clean_data(&self, node: &str) -> Result<()>;
-
-    async fn get_node_name(&self, pod_name: &str) -> Result<String>;
-
-    async fn get_grafana_baseurl(&self) -> Result<String>;
-
-    async fn put_file(&self, node: &str, pod_name: &str, path: &str, content: &[u8]) -> Result<()>;
-}
diff --git a/testsuite/cluster-test/src/cluster_swarm/templates/diem_node_service_template.yaml b/testsuite/cluster-test/src/cluster_swarm/templates/diem_node_service_template.yaml
deleted file mode 100644
index 187bf39841a0f..0000000000000
--- a/testsuite/cluster-test/src/cluster_swarm/templates/diem_node_service_template.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: {peer_id}
-  labels:
-    app: diem-validator
-    diem-node: "true"
-    peer_id: {peer_id}
-spec:
-  type: ClusterIP
-  selector:
-    app: diem-validator
-    diem-node: "true"
-    peer_id: {peer_id}
-  ports:
-  - name: "port6180"
-    protocol: TCP
-    port: 6180
-  - name: "port6181"
-    protocol: TCP
-    port: 6181
-  - name: "port8000"
-    protocol: TCP
-    port: 8000
-  - name: "port9101"
-    protocol: TCP
-    port: 9101
-  - name: "port6191"
-    protocol: TCP
-    port: 6191
diff --git a/testsuite/cluster-test/src/cluster_swarm/templates/diem_node_spec_template.yaml b/testsuite/cluster-test/src/cluster_swarm/templates/diem_node_spec_template.yaml
deleted file mode 100644
index 1bbb2b7417f78..0000000000000
--- a/testsuite/cluster-test/src/cluster_swarm/templates/diem_node_spec_template.yaml
+++ /dev/null
@@ -1,93 +0,0 @@
-apiVersion: v1
-kind: Pod
-metadata:
-  name: {pod_name}
-  labels:
-    app: {pod_app}
-    diem-node: "true"
-    peer_id: {pod_name}
-  annotations:
-    prometheus.io/should_be_scraped: "true"
-spec:
-  hostNetwork: true
-  dnsPolicy: ClusterFirstWithHostNet
-  serviceAccountName: clustertest
-  nodeSelector:
-    nodeType: validators
-  nodeName: "{node_name}"
-  containers:
-  - name: fluent-bit
-    image: 853397791086.dkr.ecr.us-west-2.amazonaws.com/fluent-bit:1.3.9
-    imagePullPolicy: IfNotPresent
-    command: ["/fluent-bit/bin/fluent-bit", "-c", "/opt/diem/data/fluent-bit/fluent-bit.conf"]
-    volumeMounts:
-    - mountPath: /opt/diem/data
-      name: data
-  - name: debug
-    image: 853397791086.dkr.ecr.us-west-2.amazonaws.com/diem/validator:{image_tag}
-    imagePullPolicy: Always
-    volumeMounts:
-    - mountPath: /opt/diem/data
-      name: data
-    - mountPath: /opt/diem/etc
-      name: config
-    command:
-    - "bash"
-    - "-c"
-    - |
-      set -x;
-      while true; do sleep 10; done
-  - name: main
-    image: 853397791086.dkr.ecr.us-west-2.amazonaws.com/diem/validator:{image_tag}
-    imagePullPolicy: Always
-    resources:
-      requests:
-        cpu: 7800m
-    command: ["/opt/diem/bin/diem-node", "-f", "/opt/diem/etc/node.yaml"]
-    ports:
-    - containerPort: 6180
-    - containerPort: 6181
-    - containerPort: 8000
-    - containerPort: 9101
-    - containerPort: 6191
-    volumeMounts:
-    - mountPath: /opt/diem/data
-      name: data
-    - mountPath: /opt/diem/etc
-      name: config
-    env:
-    - name: RUST_LOG
-      value: "debug"
-    - name: STRUCT_LOG_TCP_ADDR
-      value: "127.0.0.1:5044"
-    - name: RUST_BACKTRACE
-      value: "1"
-    - name: MY_POD_IP
-      valueFrom:
-        fieldRef:
-          fieldPath: status.podIP
-  volumes:
-  - name: data
-    hostPath:
-      path: /data
-      type: Directory
-  - name: config
-    hostPath:
-      path: /config
-      type: Directory
-  affinity:
-    podAntiAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-      - labelSelector:
-          matchExpressions:
-          - key: diem-node
-            operator: Exists
-        topologyKey: "kubernetes.io/hostname"
-  terminationGracePeriodSeconds: 5
-  tolerations:
-  - key: "validators"
-    operator: "Exists"
-    effect: "NoSchedule"
-  - key: "node.kubernetes.io/not-ready"
-    operator: "Exists"
-    effect: "NoSchedule"
diff --git a/testsuite/cluster-test/src/cluster_swarm/templates/job_template.yaml b/testsuite/cluster-test/src/cluster_swarm/templates/job_template.yaml
deleted file mode 100644
index c6d6934e88555..0000000000000
--- a/testsuite/cluster-test/src/cluster_swarm/templates/job_template.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: {name}
-  labels:
-    app: {label}
-    diem-node: "true"
-spec:
-  template:
-    metadata:
-      labels:
-        app: {label}
-    spec:
-      serviceAccountName: clustertest
-      nodeName: {node_name}
-      hostNetwork: true
-      hostPID: true
-      dnsPolicy: ClusterFirstWithHostNet
-      containers:
-      - name: main
-        image: {image}
-        volumeMounts:
-        - mountPath: /opt/diem/data
-          name: data
-        - mountPath: /opt/diem/etc
-          name: config
-        imagePullPolicy: Always
-        command: ["sh", "-c", "{command}"]
-        securityContext:
-          runAsUser: 0 # To get permissions to write to /opt/diem/data
-          privileged: true
-          capabilities:
-            add: ["SYS_ADMIN"]
-      restartPolicy: Never
-      volumes:
-      - name: data
-        hostPath:
-          path: /data
-          type: DirectoryOrCreate
-      - name: config
-        hostPath:
-          path: /config
-          type: DirectoryOrCreate
-  backoffLimit: {back_off_limit}
diff --git a/testsuite/cluster-test/src/cluster_swarm/templates/lsr_service_template.yaml b/testsuite/cluster-test/src/cluster_swarm/templates/lsr_service_template.yaml
deleted file mode 100644
index b1edde15cbf92..0000000000000
--- a/testsuite/cluster-test/src/cluster_swarm/templates/lsr_service_template.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: {pod_name}
-  labels:
-    app: diem-lsr
-    diem-node: "true"
-    peer_id: {pod_name}
-spec:
-  type: ClusterIP
-  publishNotReadyAddresses: true
-  selector:
-    app: diem-lsr
-    diem-node: "true"
-    peer_id: {pod_name}
-  ports:
-  - name: safety-rules
-    protocol: TCP
-    port: 6185
diff --git a/testsuite/cluster-test/src/cluster_swarm/templates/lsr_spec_template.yaml b/testsuite/cluster-test/src/cluster_swarm/templates/lsr_spec_template.yaml
deleted file mode 100644
index 0ab42fe20ab7e..0000000000000
--- a/testsuite/cluster-test/src/cluster_swarm/templates/lsr_spec_template.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-apiVersion: v1
-kind: Pod
-metadata:
-  name: {pod_name}
-  labels:
-    app: diem-lsr
-    diem-node: "true"
-    peer_id: {pod_name}
-spec:
-  hostNetwork: true
-  dnsPolicy: ClusterFirstWithHostNet
-  serviceAccountName: clustertest
-  nodeSelector:
-    nodeType: validators
-  nodeName: "{node_name}"
-  containers:
-  - name: fluent-bit
-    image: 853397791086.dkr.ecr.us-west-2.amazonaws.com/fluent-bit:1.3.9
-    imagePullPolicy: IfNotPresent
-    command: ["/fluent-bit/bin/fluent-bit", "-c", "/opt/diem/data/fluent-bit/fluent-bit.conf"]
-    volumeMounts:
-    - mountPath: /opt/diem/data
-      name: diem-data
-  - name: main
-    image: 853397791086.dkr.ecr.us-west-2.amazonaws.com/diem/validator_tcb:{image_tag}
-    imagePullPolicy: Always
-    command: ["/opt/diem/bin/safety-rules", "/opt/diem/etc/node.yaml"]
-    ports:
-    - containerPort: 6185
-    volumeMounts:
-    - mountPath: /opt/diem/etc
-      name: config-built
-    - mountPath: /opt/diem/data
-      name: diem-data
-    - mountPath: /opt/vault
-      name: vault-token
-    env:
-    - name: RUST_LOG
-      value: debug
-    - name: RUST_BACKTRACE
-      value: "1"
-    - name: PUSH_METRICS_ENDPOINT
-      value: "http://diem-testnet-prometheus-pushgateway:9091/metrics/job/safety_rules/instance/{pod_name}"
-  volumes:
-  - name: config-built
-    hostPath:
-      path: /config
-      type: DirectoryOrCreate
-  - name: diem-data
-    hostPath:
-      path: /data
-      type: DirectoryOrCreate
-  - name: tmp
-    emptyDir: {{}}
-  - name: vault-token
-    emptyDir:
-      medium: Memory
-  affinity:
-    podAntiAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-      - labelSelector:
-          matchExpressions:
-          - key: diem-node
-            operator: Exists
-        topologyKey: "kubernetes.io/hostname"
-  terminationGracePeriodSeconds: 5
-  tolerations:
-  - key: "validators"
-    operator: "Exists"
-    effect: "NoSchedule"
-  - key: "node.kubernetes.io/not-ready"
-    operator: "Exists"
-    effect: "NoSchedule"
diff --git a/testsuite/cluster-test/src/cluster_swarm/templates/vault_service_template.yaml b/testsuite/cluster-test/src/cluster_swarm/templates/vault_service_template.yaml
deleted file mode 100644
index 9ba038f19a6a8..0000000000000
--- a/testsuite/cluster-test/src/cluster_swarm/templates/vault_service_template.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-# Source: helm chart for vault
-apiVersion: v1
-kind: Service
-metadata:
-  name: vault-{validator_index}
-  labels:
-    app: diem-vault
-    diem-node: "true"
-    peer_id: vault-{validator_index}
-  annotations:
-    service.alpha.kubernetes.io/tolerate-unready-endpoints: "true"
-spec:
-  type: ClusterIP
-  publishNotReadyAddresses: true
-  ports:
-  - name: "http"
-    port: 8200
-    targetPort: 8200
-  - name: internal
-    port: 8201
-    targetPort: 8201
-  selector:
-    app: diem-vault
-    diem-node: "true"
-    peer_id: vault-{validator_index}
diff --git a/testsuite/cluster-test/src/cluster_swarm/templates/vault_spec_template.yaml b/testsuite/cluster-test/src/cluster_swarm/templates/vault_spec_template.yaml
deleted file mode 100644
index e0f040aac1977..0000000000000
--- a/testsuite/cluster-test/src/cluster_swarm/templates/vault_spec_template.yaml
+++ /dev/null
@@ -1,134 +0,0 @@
-# Source: helm chart for vault
-apiVersion: v1
-kind: Pod
-metadata:
-  name: vault-{validator_index}
-  labels:
-    app: diem-vault
-    diem-node: "true"
-    peer_id: vault-{validator_index}
-spec:
-  hostNetwork: true
-  dnsPolicy: ClusterFirstWithHostNet
-  serviceAccountName: clustertest
-  nodeSelector:
-    nodeType: validators
-  nodeName: "{node_name}"
-  securityContext:
-    runAsNonRoot: true
-    runAsGroup: 1000
-    runAsUser: 100
-    fsGroup: 1000
-  volumes:
-  containers:
-  - name: vault
-    securityContext:
-      capabilities:
-        add: ["IPC_LOCK"]
-    image: 853397791086.dkr.ecr.us-west-2.amazonaws.com/vault:1.4.0
-    imagePullPolicy: IfNotPresent
-    command:
-    args:
-    env:
-    - name: HOST_IP
-      valueFrom:
-        fieldRef:
-          fieldPath: status.hostIP
-    - name: POD_IP
-      valueFrom:
-        fieldRef:
-          fieldPath: status.podIP
-    - name: VAULT_K8S_POD_NAME
-      valueFrom:
-        fieldRef:
-          fieldPath: metadata.name
-    - name: VAULT_K8S_NAMESPACE
-      valueFrom:
-        fieldRef:
-          fieldPath: metadata.namespace
-    - name: VAULT_ADDR
-      value: "http://127.0.0.1:8200"
-    - name: VAULT_API_ADDR
-      value: "http://$(POD_IP):8200"
-    - name: SKIP_CHOWN
-      value: "true"
-    - name: SKIP_SETCAP
-      value: "true"
-    - name: HOSTNAME
-      valueFrom:
-        fieldRef:
-          fieldPath: metadata.name
-    - name: VAULT_CLUSTER_ADDR
-      value: "https://$(HOSTNAME).vault:8201"
-    - name: VAULT_DEV_ROOT_TOKEN_ID
-      value: "root"
-    volumeMounts:
-    ports:
-    - containerPort: 8200
-      name: http
-    - containerPort: 8201
-      name: internal
-    - containerPort: 8202
-      name: replication
-    readinessProbe:
-      exec:
-        command: ["/bin/sh", "-ec", "vault status -tls-skip-verify"]
-      failureThreshold: 2
-      initialDelaySeconds: 5
-      periodSeconds: 3
-      successThreshold: 1
-      timeoutSeconds: 5
-    lifecycle:
-      # Vault container doesn't receive SIGTERM from Kubernetes
-      # and after the grace period ends, Kube sends SIGKILL. This
-      # causes issues with graceful shutdowns such as deregistering itself
-      # from Consul (zombie services).
-      preStop:
-        exec:
-          command: [
-            "/bin/sh", "-c",
-            # Adding a sleep here to give the pod eviction a
-            # chance to propagate, so requests will not be made
-            # to this pod while it's terminating
-            "sleep 5 && kill -SIGTERM $(pidof vault)",
-          ]
-  - name: vault-initialize
-    securityContext:
-      capabilities:
-        add: ["IPC_LOCK"]
-    image: 853397791086.dkr.ecr.us-west-2.amazonaws.com/vault:1.4.0
-    imagePullPolicy: IfNotPresent
-    command:
-    - "sh"
-    - "-c"
-    - |
-      vault secrets enable transit
-      while true; do
-        sleep 10000
-      done
-    args:
-    env:
-    - name: VAULT_ADDR
-      value: "http://127.0.0.1:8200"
-    - name: SKIP_CHOWN
-      value: "true"
-    - name: SKIP_SETCAP
-      value: "true"
-    - name: VAULT_TOKEN
-      value: "root"
-  affinity:
-    podAntiAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-      - labelSelector:
-          matchExpressions:
-          - key: diem-node
-            operator: Exists
-        topologyKey: "kubernetes.io/hostname"
-  terminationGracePeriodSeconds: 5
-  tolerations:
-  - key: "validators"
-    operator: "Exists"
-    effect: "NoSchedule"
-  - key: "node.kubernetes.io/not-ready"
-    operator: "Exists"
-    effect: "NoSchedule"
diff --git a/testsuite/cluster-test/src/effects/mod.rs b/testsuite/cluster-test/src/effects/mod.rs
deleted file mode 100644
index 9f453161ce1bf..0000000000000
--- a/testsuite/cluster-test/src/effects/mod.rs
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use anyhow::Result;
-use async_trait::async_trait;
-use futures::future::try_join_all;
-use std::fmt::Display;
-
-pub mod network_delay;
-pub mod packet_loss;
-pub mod stop_validator;
-
-#[async_trait]
-pub trait Effect: Display {
-    async fn activate(&mut self) -> Result<()>;
-    async fn deactivate(&mut self) -> Result<()>;
-}
-
-pub async fn activate_all<T: Effect>(effects: &mut Vec<T>) -> Result<()> {
-    try_join_all(effects.iter_mut().map(Effect::activate)).await?;
-    Ok(())
-}
-
-pub async fn deactivate_all<T: Effect>(effects: &mut Vec<T>) -> Result<()> {
-    try_join_all(effects.iter_mut().map(Effect::deactivate)).await?;
-    Ok(())
-}
diff --git a/testsuite/cluster-test/src/effects/network_delay.rs b/testsuite/cluster-test/src/effects/network_delay.rs
deleted file mode 100644
index 449f83a37cab3..0000000000000
--- a/testsuite/cluster-test/src/effects/network_delay.rs
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use crate::effects::Effect;
-/// NetworkDelay introduces network delay from a given instance to a provided list of instances
-/// If no instances are provided, network delay is introduced on all outgoing packets
-use crate::instance::Instance;
-use anyhow::Result;
-
-use async_trait::async_trait;
-use diem_logger::debug;
-use std::{fmt, time::Duration};
-
-pub struct NetworkDelay {
-    instance: Instance,
-    // A vector of pairs of (instance list, delay)
-    // Applies the delay to each instance in the corresponding list
-    configuration: Vec<(Vec<Instance>, Duration)>,
-}
-
-impl NetworkDelay {
-    pub fn new(instance: Instance, configuration: Vec<(Vec<Instance>, Duration)>) -> Self {
-        Self {
-            instance,
-            configuration,
-        }
-    }
-}
-
-#[async_trait]
-impl Effect for NetworkDelay {
-    async fn activate(&mut self) -> Result<()> {
-        debug!("Injecting NetworkDelays for {}", self.instance);
-        let mut command = "".to_string();
-        // Create a HTB https://linux.die.net/man/8/tc-htb
-        command += "tc qdisc add dev eth0 root handle 1: htb; ";
-        for i in 0..self.configuration.len() {
-            // Create a class within the HTB https://linux.die.net/man/8/tc
-            command += format!(
-                "tc class add dev eth0 parent 1: classid 1:{} htb rate 1tbit; ",
-                i + 1
-            )
-            .as_str();
-        }
-        for i in 0..self.configuration.len() {
-            // Create u32 filters so that all the target instances are classified as class 1:(i+1)
-            // http://man7.org/linux/man-pages/man8/tc-u32.8.html
-            for target_instance in &self.configuration[i].0 {
-                command += format!("tc filter add dev eth0 parent 1: protocol ip prio 1 u32 flowid 1:{} match ip dst {}; ", i+1, target_instance.ip()).as_str();
-            }
-        }
-        for i in 0..self.configuration.len() {
-            // Use netem to delay packets to this class
-            command += format!(
-                "tc qdisc add dev eth0 parent 1:{} handle {}0: netem delay {}ms; ",
-                i + 1,
-                i + 1,
-                self.configuration[i].1.as_millis(),
-            )
-            .as_str();
-        }
-        self.instance.util_cmd(command, "ac-net-delay").await
-    }
-
-    async fn deactivate(&mut self) -> Result<()> {
-        self.instance
-            .util_cmd("tc qdisc delete dev eth0 root; true", "de-net-delay")
-            .await
-    }
-}
-
-impl fmt::Display for NetworkDelay {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "NetworkDelay from {}", self.instance)
-    }
-}
-
-/// three_region_simulation_effects returns the list of NetworkDelays which need to be applied to
-/// all the instances in the cluster.
-/// `regions` is a 3-tuple consisting of the list of instances in each region
-/// `delays_bw_regions` is a 3-tuple consisting of the one-way delays between pairs of regions
-/// delays_bw_regions.0 is the delay b/w regions 1 & 2, delays_bw_regions.1 is the delay b/w regions 0 & 2, etc
-pub fn three_region_simulation_effects(
-    regions: (Vec<Instance>, Vec<Instance>, Vec<Instance>),
-    delays_bw_regions: (Duration, Duration, Duration),
-) -> Vec<NetworkDelay> {
-    let mut result = vec![];
-    for instance in &regions.0 {
-        let configuration = vec![
-            (regions.1.clone(), delays_bw_regions.2),
-            (regions.2.clone(), delays_bw_regions.1),
-        ];
-        result.push(NetworkDelay::new(instance.clone(), configuration));
-    }
-    for instance in &regions.1 {
-        let configuration = vec![
-            (regions.0.clone(), delays_bw_regions.2),
-            (regions.2.clone(), delays_bw_regions.0),
-        ];
-        result.push(NetworkDelay::new(instance.clone(), configuration));
-    }
-    for instance in &regions.2 {
-        let configuration = vec![
-            (regions.1.clone(), delays_bw_regions.0),
-            (regions.0.clone(), delays_bw_regions.1),
-        ];
-        result.push(NetworkDelay::new(instance.clone(), configuration));
-    }
-    result
-}
diff --git a/testsuite/cluster-test/src/effects/packet_loss.rs b/testsuite/cluster-test/src/effects/packet_loss.rs
deleted file mode 100644
index e0ab9f0566deb..0000000000000
--- a/testsuite/cluster-test/src/effects/packet_loss.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-/// PacketLoss introduces a given percentage of PacketLoss for a given instance
-use crate::{effects::Effect, instance::Instance};
-use anyhow::Result;
-
-use async_trait::async_trait;
-use diem_logger::info;
-use std::fmt;
-
-pub struct PacketLoss {
-    instance: Instance,
-    percent: f32,
-}
-
-impl PacketLoss {
-    pub fn new(instance: Instance, percent: f32) -> Self {
-        Self { instance, percent }
-    }
-}
-
-#[async_trait]
-impl Effect for PacketLoss {
-    async fn activate(&mut self) -> Result<()> {
-        info!("PacketLoss {:.*}% for {}", 2, self.percent, self.instance);
-        let cmd = format!(
-            "tc qdisc add dev eth0 root netem loss {:.*}%",
-            2, self.percent
-        );
-        self.instance.util_cmd(cmd, "ac-packet-loss").await
-    }
-
-    async fn deactivate(&mut self) -> Result<()> {
-        info!("PacketLoss {:.*}% for {}", 2, self.percent, self.instance);
-        let cmd = "tc qdisc delete dev eth0 root; true".to_string();
-        self.instance.util_cmd(cmd, "de-packet-loss").await
-    }
-}
-
-impl fmt::Display for PacketLoss {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(
-            f,
-            "PacketLoss {:.*}% for {}",
-            2, self.percent, self.instance
-        )
-    }
-}
diff --git a/testsuite/cluster-test/src/effects/stop_validator.rs b/testsuite/cluster-test/src/effects/stop_validator.rs
deleted file mode 100644
index 46a7721c7ddbb..0000000000000
--- a/testsuite/cluster-test/src/effects/stop_validator.rs
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-/// StopValidator introduces a rebooting for a given instance
-use crate::{effects::Effect, instance::Instance};
-use anyhow::Result;
-
-use async_trait::async_trait;
-use diem_logger::debug;
-use std::fmt;
-
-pub struct StopValidator {
-    instance: Instance,
-}
-
-impl StopValidator {
-    pub fn new(instance: Instance) -> Self {
-        Self { instance }
-    }
-}
-
-#[async_trait]
-impl Effect for StopValidator {
-    async fn activate(&mut self) -> Result<()> {
-        debug!("Stopping validator {}", self.instance);
-        self.instance.stop().await
-    }
-
-    async fn deactivate(&mut self) -> Result<()> {
-        debug!("Starting validator {}", self.instance);
-        self.instance.start().await
-    }
-}
-
-impl fmt::Display for StopValidator {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "Stop validator {}", self.instance)
-    }
-}
diff --git a/testsuite/cluster-test/src/experiments/accurate_measurment.rs b/testsuite/cluster-test/src/experiments/accurate_measurment.rs
deleted file mode 100644
index c4502084d02e3..0000000000000
--- a/testsuite/cluster-test/src/experiments/accurate_measurment.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-use crate::{
-    cluster::Cluster,
-    experiments::{Context, Experiment, ExperimentParam},
-    instance::Instance,
-};
-use anyhow::Result;
-use async_trait::async_trait;
-use core::fmt;
-use diem_sdk::transaction_builder::TransactionFactory;
-use forge::{EmitJobRequest, TxnEmitter};
-use futures::FutureExt;
-use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng};
-use std::{collections::HashSet, convert::TryInto, time::Duration};
-use structopt::StructOpt;
-
-#[derive(StructOpt, Debug)]
-pub struct AccurateMeasurementParams {
-    #[structopt(
-        long,
-        default_value = Box::leak(format!("{}", DEFAULT_BENCH_DURATION).into_boxed_str()),
-        help = "Duration of an experiment in seconds"
-    )]
-    pub duration: u64,
-    #[structopt(long, help = "Set fixed tps as the base tps number of experiment")]
-    pub base_tps: u64,
-    #[structopt(long, help = "Step numbers to change tps")]
-    pub step_num: u64,
-    #[structopt(long, help = "How may tps change for each step")]
-    pub step_length: u64,
-}
-
-pub struct AccurateMeasurement {
-    validators: Vec<Instance>,
-    fullnodes: Vec<Instance>,
-    duration: Duration,
-    base_tps: u64,
-    step_num: u64,
-    step_length: u64,
-}
-
-pub const DEFAULT_BENCH_DURATION: u64 = 600;
-
-impl ExperimentParam for AccurateMeasurementParams {
-    type E = AccurateMeasurement;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        let validators = cluster.validator_instances().to_vec();
-        let fullnodes = cluster.fullnode_instances().to_vec();
-        Self::E {
-            validators,
-            fullnodes,
-            duration: Duration::from_secs(self.duration),
-            base_tps: self.base_tps,
-            step_num: self.step_num,
-            step_length: self.step_length,
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for AccurateMeasurement {
-    fn affected_validators(&self) -> HashSet<String> {
-        HashSet::new()
-    }
-
-    async fn run(&mut self, context: &mut Context<'_>) -> Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-        let instances = if context.emit_to_validator {
-            self.validators.clone()
-        } else {
-            self.fullnodes.clone()
-        };
-        for i in 0..self.step_num {
-            let tps = self.base_tps + self.step_length * i;
-            let window = self.duration / self.step_num as u32;
-            let emit_job_request =
-                EmitJobRequest::new(instances.iter().map(Instance::rest_client).collect())
-                    .fixed_tps(tps.try_into().unwrap());
-            let emit_txn = txn_emitter.emit_txn_for(window, emit_job_request).boxed();
-            let stats = emit_txn.await?;
-            // Report
-            let test_step = format!("Step {}", i);
-            context
-                .report
-                .report_txn_stats(test_step, stats, window, "");
-        }
-
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        let buffer = Duration::from_secs(60);
-        self.duration + buffer
-    }
-}
-
-impl fmt::Display for AccurateMeasurement {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(
-            f,
-            "Perf Measurement: start tps {}, steps number {}, step length = {}",
-            self.base_tps, self.step_num, self.step_length
-        )?;
-        Ok(())
-    }
-}
diff --git a/testsuite/cluster-test/src/experiments/compatibility_test.rs b/testsuite/cluster-test/src/experiments/compatibility_test.rs
deleted file mode 100644
index 138d1bdd299a9..0000000000000
--- a/testsuite/cluster-test/src/experiments/compatibility_test.rs
+++ /dev/null
@@ -1,422 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use crate::{
-    cluster::Cluster,
-    cluster_swarm::ClusterSwarm,
-    experiments::{Context, Experiment, ExperimentParam},
-    instance,
-    instance::Instance,
-};
-use async_trait::async_trait;
-use diem_logger::prelude::*;
-use diem_sdk::transaction_builder::TransactionFactory;
-use forge::TxnEmitter;
-use futures::future::try_join_all;
-use rand::{
-    rngs::{OsRng, StdRng},
-    Rng, SeedableRng,
-};
-use std::{
-    collections::HashSet,
-    env, fmt,
-    iter::{once, Iterator},
-    time::{Duration, Instant},
-};
-use structopt::StructOpt;
-use tokio::time;
-
-/// Reboot `updated_instance` with newer image tag
-pub async fn update_batch_instance(
-    cluster_swarm: &dyn ClusterSwarm,
-    updated_instance: &[Instance],
-    updated_lsr: &[Instance],
-    updated_tag: String,
-) -> anyhow::Result<()> {
-    info!("Stop Existing instances.");
-    let futures: Vec<_> = updated_instance.iter().map(Instance::stop).collect();
-    try_join_all(futures).await?;
-
-    if !updated_lsr.is_empty() {
-        info!("Stop associated lsr instances.");
-        let futures: Vec<_> = updated_lsr.iter().map(Instance::stop).collect();
-        try_join_all(futures).await?;
-        info!("Reinstantiate a set of new lsr.");
-        let futures: Vec<_> = updated_lsr
-            .iter()
-            .map(|instance| {
-                let mut newer_config = instance.instance_config().clone();
-                newer_config.replace_tag(updated_tag.clone()).unwrap();
-                cluster_swarm.spawn_new_instance(newer_config)
-            })
-            .collect();
-        try_join_all(futures).await?;
-        info!("Wait for the instance to sync up with peers");
-        time::sleep(Duration::from_secs(20)).await;
-    }
-
-    info!("Reinstantiate a set of new nodes.");
-    let futures: Vec<_> = updated_instance
-        .iter()
-        .map(|instance| {
-            let mut newer_config = instance.instance_config().clone();
-            info!("newer config: {:?}", newer_config);
-            newer_config.replace_tag(updated_tag.clone()).unwrap();
-            cluster_swarm.spawn_new_instance(newer_config)
-        })
-        .collect();
-    let instances = try_join_all(futures).await?;
-
-    info!("Wait for the instances to recover.");
-    let deadline = Instant::now() + Duration::from_secs(5 * 60);
-    let futures: Vec<_> = instances
-        .iter()
-        .map(|instance| instance.wait_server_ready(deadline))
-        .collect();
-    try_join_all(futures).await?;
-
-    // Add a timeout to have wait for validators back to healthy mode.
-    // TODO: Replace this with a blocking health check.
-    info!("Wait for the instance to sync up with peers");
-    time::sleep(Duration::from_secs(20)).await;
-    Ok(())
-}
-
-pub fn get_corresponding_full_nodes<'a>(
-    validators: impl Iterator<Item = &'a Instance>,
-    cluster: &Cluster,
-) -> Vec<Instance> {
-    let validator_groups = validators
-        .map(|instance| instance.validator_group())
-        .collect::<HashSet<_>>();
-    cluster
-        .fullnode_instances()
-        .iter()
-        .filter(|full_node| validator_groups.contains(&full_node.validator_group()))
-        .cloned()
-        .collect()
-}
-
-pub fn get_instance_list_str(batch: &[Instance]) -> String {
-    let mut nodes_list = String::from("");
-    for instance in batch.iter() {
-        nodes_list.push_str(&instance.to_string());
-        nodes_list.push_str(", ")
-    }
-    nodes_list
-}
-
-#[derive(StructOpt, Debug)]
-pub struct CompatiblityTestParams {
-    #[structopt(
-        long,
-        default_value = "15",
-        help = "Number of nodes to update in the first batch"
-    )]
-    pub count: usize,
-    #[structopt(long, help = "Image tag of newer validator software")]
-    pub updated_image_tag: String,
-}
-
-pub struct CompatibilityTest {
-    first_node: Instance,
-    first_lsr: Vec<Instance>,
-    first_full_node: Vec<Instance>,
-    first_batch: Vec<Instance>,
-    first_batch_lsr: Vec<Instance>,
-    first_full_nodes_batch: Vec<Instance>,
-    second_batch: Vec<Instance>,
-    second_batch_lsr: Vec<Instance>,
-    second_full_nodes_batch: Vec<Instance>,
-    updated_image_tag: String,
-}
-
-impl ExperimentParam for CompatiblityTestParams {
-    type E = CompatibilityTest;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        if self.count > cluster.validator_instances().len() || self.count == 0 {
-            panic!(
-                "Can not reboot {} validators in cluster with {} instances",
-                self.count,
-                cluster.validator_instances().len()
-            );
-        }
-        let (first_batch, second_batch) = cluster.split_n_validators_random(self.count);
-        let mut first_batch = first_batch.into_validator_instances();
-        let second_batch = second_batch.into_validator_instances();
-        let first_node = first_batch
-            .pop()
-            .expect("Requires at least one validator in the first batch");
-        let first_full_node = get_corresponding_full_nodes([first_node.clone()].iter(), cluster);
-
-        let mut first_lsr = vec![];
-        let mut first_batch_lsr = vec![];
-        let mut second_batch_lsr = vec![];
-        if !cluster.lsr_instances().is_empty() {
-            first_batch_lsr = cluster.lsr_instances_for_validators(&first_batch);
-            second_batch_lsr = cluster.lsr_instances_for_validators(&second_batch);
-            first_lsr = cluster.lsr_instances_for_validators(&[first_node.clone()]);
-        }
-
-        let first_full_nodes_batch = get_corresponding_full_nodes(first_batch.iter(), cluster);
-        let second_full_nodes_batch = get_corresponding_full_nodes(second_batch.iter(), cluster);
-
-        Self::E {
-            first_node,
-            first_lsr,
-            first_full_node,
-            first_batch,
-            first_batch_lsr,
-            first_full_nodes_batch,
-            second_batch,
-            second_batch_lsr,
-            second_full_nodes_batch,
-            updated_image_tag: self.updated_image_tag,
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for CompatibilityTest {
-    fn affected_validators(&self) -> HashSet<String> {
-        instance::instancelist_to_set(&self.first_batch)
-            .union(&instance::instancelist_to_set(&self.second_batch))
-            .cloned()
-            .chain(once(self.first_node.peer_name().clone()))
-            .collect()
-    }
-
-    async fn run(&mut self, context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-
-        let job_duration = Duration::from_secs(3);
-        context.report.report_text(format!(
-            "Compatibility test results for {} ==> {} (PR)",
-            context.current_tag, self.updated_image_tag
-        ));
-
-        // Generate some traffic
-        let msg = format!(
-            "1. All instances running {}, generating some traffic on network",
-            context.current_tag
-        );
-        info!("{}", msg);
-        context.report.report_text(msg);
-        let all_full_nodes_request = crate::util::emit_job_request_for_instances(
-            context.cluster.fullnode_instances().to_vec(),
-            context.global_emit_job_request,
-            0,
-            0,
-        );
-        txn_emitter
-            .emit_txn_for(job_duration, all_full_nodes_request)
-            .await
-            .map_err(|e| anyhow::format_err!("Failed to generate traffic: {}", e))?;
-
-        let msg = format!(
-            "2. First full node {} ==> {}, to validate new full node to old validator node traffic",
-            context.current_tag, self.updated_image_tag
-        );
-        info!("{}", msg);
-        context.report.report_text(msg);
-        info!(
-            "Upgrade Full Node: {}",
-            get_instance_list_str(&self.first_full_node)
-        );
-        update_batch_instance(
-            context.cluster_swarm,
-            &self.first_full_node,
-            &[],
-            self.updated_image_tag.clone(),
-        )
-        .await?;
-
-        // Full node running at n+1, validator running n
-        txn_emitter
-            .emit_txn_for(
-                job_duration,
-                crate::util::emit_job_request_for_instances(
-                    self.first_full_node.clone(),
-                    context.global_emit_job_request,
-                    0,
-                    0,
-                ),
-            )
-            .await
-            .map_err(|e| anyhow::format_err!("Storage backwards compat broken: {}", e))?;
-
-        let msg = format!(
-            "3. First Validator node {} ==> {}, to validate storage compatibility",
-            context.current_tag, self.updated_image_tag
-        );
-        info!("{}", msg);
-        context.report.report_text(msg);
-        info!("Upgrading validator: {}", self.first_node);
-        let first_node = vec![self.first_node.clone()];
-        update_batch_instance(
-            context.cluster_swarm,
-            &first_node,
-            &self.first_lsr,
-            self.updated_image_tag.clone(),
-        )
-        .await?;
-        txn_emitter
-            .emit_txn_for(
-                job_duration,
-                crate::util::emit_job_request_for_instances(
-                    self.first_full_node.clone(),
-                    context.global_emit_job_request,
-                    0,
-                    0,
-                ),
-            )
-            .await
-            .map_err(|e| anyhow::format_err!("Storage backwards compat broken: {}", e))?;
-
-        let msg = format!(
-            "4. First batch validators ({}) {} ==> {}, to test consensus and traffic between old full nodes and new validator node",
-            self.first_batch.len(),
-            context.current_tag,
-            self.updated_image_tag
-        );
-        info!("{}", msg);
-        info!(
-            "Upgrading validators: {}",
-            get_instance_list_str(&self.first_batch)
-        );
-        context.report.report_text(msg);
-        update_batch_instance(
-            context.cluster_swarm,
-            &self.first_batch,
-            &self.first_batch_lsr,
-            self.updated_image_tag.clone(),
-        )
-        .await?;
-
-        // Full node running at n, validator running n+1
-        txn_emitter
-            .emit_txn_for(
-                job_duration,
-                crate::util::emit_job_request_for_instances(
-                    self.first_full_nodes_batch.clone(),
-                    context.global_emit_job_request,
-                    0,
-                    0,
-                ),
-            )
-            .await
-            .map_err(|e| anyhow::format_err!("Consensus backwards compat broken: {}", e))?;
-
-        let msg = format!(
-            "5. First batch full nodes ({}) {} ==> {}",
-            self.first_full_nodes_batch.len(),
-            context.current_tag,
-            self.updated_image_tag
-        );
-        info!("{}", msg);
-        context.report.report_text(msg);
-        info!(
-            "Upgrading full nodes: {}",
-            get_instance_list_str(&self.first_full_nodes_batch)
-        );
-        update_batch_instance(
-            context.cluster_swarm,
-            &self.first_full_nodes_batch,
-            &[],
-            self.updated_image_tag.clone(),
-        )
-        .await?;
-
-        let msg = format!(
-            "6. Second batch validators ({}) {} ==> {}, to upgrade rest of the validators",
-            self.second_batch.len(),
-            context.current_tag,
-            self.updated_image_tag
-        );
-        info!("{}", msg);
-        context.report.report_text(msg);
-        info!(
-            "Upgrading validators: {}",
-            get_instance_list_str(&self.second_batch)
-        );
-        update_batch_instance(
-            context.cluster_swarm,
-            &self.second_batch,
-            &self.second_batch_lsr,
-            self.updated_image_tag.clone(),
-        )
-        .await?;
-        txn_emitter
-            .emit_txn_for(
-                job_duration,
-                crate::util::emit_job_request_for_instances(
-                    self.second_batch.clone(),
-                    context.global_emit_job_request,
-                    0,
-                    0,
-                ),
-            )
-            .await
-            .map_err(|e| {
-                anyhow::format_err!("Failed to upgrade rest of validator images: {}", e)
-            })?;
-
-        let msg = format!(
-            "7. Second batch of full nodes ({}) {} ==> {}, to finish the network upgrade",
-            self.second_full_nodes_batch.len(),
-            context.current_tag,
-            self.updated_image_tag
-        );
-        info!("{}", msg);
-        info!(
-            "Upgrading full nodes: {}",
-            get_instance_list_str(&self.second_full_nodes_batch)
-        );
-        context.report.report_text(msg);
-        update_batch_instance(
-            context.cluster_swarm,
-            &self.second_full_nodes_batch,
-            &[],
-            self.updated_image_tag.clone(),
-        )
-        .await?;
-        txn_emitter
-            .emit_txn_for(
-                job_duration,
-                crate::util::emit_job_request_for_instances(
-                    self.second_full_nodes_batch.clone(),
-                    context.global_emit_job_request,
-                    0,
-                    0,
-                ),
-            )
-            .await
-            .map_err(|e| anyhow::format_err!("Failed to upgrade full node images: {}", e))?;
-
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(16 * 60)
-    }
-}
-
-impl fmt::Display for CompatibilityTest {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(
-            f,
-            "Compatibility test, phased upgrade to {} in batches of 1, {}, {}",
-            self.updated_image_tag,
-            self.first_batch.len(),
-            self.second_batch.len()
-        )
-    }
-}
diff --git a/testsuite/cluster-test/src/experiments/cpu_flamegraph.rs b/testsuite/cluster-test/src/experiments/cpu_flamegraph.rs
deleted file mode 100644
index ccf7c9e976730..0000000000000
--- a/testsuite/cluster-test/src/experiments/cpu_flamegraph.rs
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-use crate::{
-    cluster::Cluster,
-    experiments::{Context, Experiment, ExperimentParam},
-    instance,
-    instance::Instance,
-};
-use anyhow::{anyhow, format_err, Result};
-use async_trait::async_trait;
-use diem_sdk::transaction_builder::TransactionFactory;
-use forge::TxnEmitter;
-use futures::{future::FutureExt, join};
-use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng};
-use std::{
-    collections::HashSet,
-    env,
-    fmt::{Display, Error, Formatter},
-    time::Duration,
-};
-use structopt::StructOpt;
-
-#[derive(StructOpt, Debug)]
-pub struct CpuFlamegraphParams {
-    #[structopt(
-        long,
-        default_value = "60",
-        help = "Number of seconds for which perf should be run"
-    )]
-    pub duration_secs: usize,
-}
-
-pub struct CpuFlamegraph {
-    duration_secs: usize,
-    perf_instance: Instance,
-}
-
-impl ExperimentParam for CpuFlamegraphParams {
-    type E = CpuFlamegraph;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        let perf_instance = cluster.random_validator_instance();
-        Self::E {
-            duration_secs: self.duration_secs,
-            perf_instance,
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for CpuFlamegraph {
-    fn affected_validators(&self) -> HashSet<String> {
-        instance::instancelist_to_set(&[self.perf_instance.clone()])
-    }
-
-    async fn run(&mut self, context: &mut Context<'_>) -> Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-        let buffer = Duration::from_secs(60);
-        let tx_emitter_duration = 2 * buffer + Duration::from_secs(self.duration_secs as u64);
-        let emit_job_request = crate::util::emit_job_request_for_instances(
-            context.cluster.validator_instances().to_vec(),
-            context.global_emit_job_request,
-            0,
-            0,
-        );
-        let emit_future = txn_emitter
-            .emit_txn_for(tx_emitter_duration, emit_job_request)
-            .boxed();
-        let run_id = env::var("RUN_ID")
-            .map_err(|e| anyhow!("RUN_ID could not be read from the environment, Error:{}", e))?;
-        let filename = "diem-node-perf.svg";
-        let command = generate_perf_flamegraph_command(filename, &run_id, self.duration_secs);
-        let flame_graph = self.perf_instance.util_cmd(command, "generate-flamegraph");
-        let flame_graph_future = tokio::time::sleep(buffer)
-            .then(|_| async move { flame_graph.await })
-            .boxed();
-        let (emit_result, flame_graph_result) = join!(emit_future, flame_graph_future);
-        emit_result.map_err(|e| format_err!("Emiting tx failed: {:?}", e))?;
-        flame_graph_result.map_err(|e| format_err!("Failed to generate flamegraph: {:?}", e))?;
-        context.report.report_text(format!(
-            "perf flamegraph : https://toro-cluster-test-flamegraphs.s3-us-west-2.amazonaws.com/flamegraphs/{}/{}",
-            run_id,
-            filename
-        ));
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(480)
-    }
-}
-
-impl Display for CpuFlamegraph {
-    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
-        write!(f, "Generating CpuFlamegraph on {}", self.perf_instance)
-    }
-}
-
-fn generate_perf_flamegraph_command(filename: &str, run_id: &str, duration_secs: usize) -> String {
-    format!(
-        r#"
-        set -xe;
-        rm -rf /tmp/perf-data;
-        mkdir /tmp/perf-data;
-        cd /tmp/perf-data;
-        perf record -F 99 -p $(ps aux | grep diem-node | grep -v grep | awk '{{print $2}}') --output=perf.data --call-graph dwarf -- sleep {duration_secs};
-        perf script --input=perf.data | /usr/local/etc/FlameGraph/stackcollapse-perf.pl > out.perf-folded;
-        /usr/local/etc/FlameGraph/flamegraph.pl out.perf-folded > {filename};
-        aws s3 cp {filename} s3://toro-cluster-test-flamegraphs/flamegraphs/{run_id}/{filename};"#,
-        duration_secs = duration_secs,
-        filename = filename,
-        run_id = run_id,
-    )
-}
diff --git a/testsuite/cluster-test/src/experiments/load_test.rs b/testsuite/cluster-test/src/experiments/load_test.rs
deleted file mode 100644
index 494e742c75d30..0000000000000
--- a/testsuite/cluster-test/src/experiments/load_test.rs
+++ /dev/null
@@ -1,518 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use crate::{
-    cluster::Cluster,
-    experiments::{Context, Experiment, ExperimentParam},
-};
-use anyhow::Result;
-use async_trait::async_trait;
-use diem_config::{config::NodeConfig, network_id::NetworkId};
-use diem_logger::*;
-use diem_mempool::network::{MempoolNetworkEvents, MempoolNetworkSender};
-use diem_sdk::transaction_builder::TransactionFactory;
-use diem_time_service::TimeService;
-use diem_types::{account_config::diem_root_address, chain_id::ChainId};
-use forge::{gen_transfer_txn_request, TxnEmitter};
-use futures::{sink::SinkExt, StreamExt};
-use network::{
-    application::storage::PeerMetadataStorage,
-    connectivity_manager::DiscoverySource,
-    protocols::network::{ApplicationNetworkSender, Event},
-    ConnectivityRequest,
-};
-use network_builder::builder::NetworkBuilder;
-use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng};
-use state_sync_v1::network::{StateSyncEvents, StateSyncSender};
-use std::{
-    collections::HashSet,
-    fmt,
-    ops::Add,
-    time::{Duration, Instant},
-};
-use structopt::StructOpt;
-use tokio::runtime::{Builder, Handle};
-
-const EXPERIMENT_BUFFER_SECS: u64 = 900;
-
-#[derive(StructOpt, Debug)]
-pub struct LoadTestParams {
-    #[structopt(long, help = "run load test on mempool")]
-    pub mempool: bool,
-    #[structopt(long, help = "run load test on state sync")]
-    pub state_sync: bool,
-    #[structopt(long, help = "emit p2p transfer txns during experiment")]
-    pub emit_txn: bool,
-    #[structopt(
-        long,
-        help = "duration (in seconds) to run load test for. All specified components (mempool, state sync) will be load tested simultaneously"
-    )]
-    pub duration: u64,
-    #[structopt(long, default_value = "1", help = "Number of stubbed nodes")]
-    pub num_stubbed: usize,
-}
-
-pub struct LoadTest {
-    mempool: bool,
-    state_sync: bool,
-    emit_txn: bool,
-    duration: u64,
-    num_stubbed: usize,
-}
-
-impl ExperimentParam for LoadTestParams {
-    type E = LoadTest;
-    fn build(self, _cluster: &Cluster) -> Self::E {
-        LoadTest {
-            mempool: self.mempool,
-            state_sync: self.state_sync,
-            emit_txn: self.emit_txn,
-            duration: self.duration,
-            num_stubbed: self.num_stubbed,
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for LoadTest {
-    fn affected_validators(&self) -> HashSet<String> {
-        HashSet::new()
-    }
-
-    async fn run(&mut self, context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-
-        // spin up StubbedNode
-        let vfn = context.cluster.random_fullnode_instance();
-        info!("Node {:?} is selected", vfn.peer_name());
-        let vfn_endpoint = format!("http://{}:{}/v1", vfn.ip(), vfn.ac_port());
-        let network_runtime = Builder::new_multi_thread()
-            .thread_name("stubbed-node-network")
-            .enable_all()
-            .build()
-            .expect("Failed to start runtime. Won't be able to start networking.");
-        let mut stubbed_node = get_stubbed_nodes(
-            vfn_endpoint,
-            network_runtime.handle().clone(),
-            self.num_stubbed,
-        )
-        .await;
-        let mut emit_job = None;
-        let mut mempool_handlers: Vec<_> = vec![];
-        let mut state_sync_handlers: Vec<_> = vec![];
-        let mut mempool_task = vec![];
-        let mut state_sync_task = vec![];
-        let duration = Duration::from_secs(self.duration);
-
-        for node in &mut stubbed_node {
-            mempool_handlers.push(
-                node.mempool_handle
-                    .take()
-                    .expect("missing mempool network handles"),
-            );
-            state_sync_handlers.push(
-                node.state_sync_handle
-                    .take()
-                    .expect("missing state sync network handles"),
-            );
-        }
-
-        if self.emit_txn {
-            // emit txns to JSON RPC
-            // spawn future
-            emit_job = Some(
-                txn_emitter
-                    .start_job(crate::util::emit_job_request_for_instances(
-                        context.cluster.fullnode_instances().to_vec(),
-                        context.global_emit_job_request,
-                        0,
-                        0,
-                    ))
-                    .await?,
-            );
-        }
-
-        if self.mempool {
-            // spawn mempool load test
-            for (mempool_sender, mempool_events) in mempool_handlers {
-                mempool_task.push(tokio::task::spawn(mempool_load_test(
-                    duration,
-                    mempool_sender,
-                    mempool_events,
-                )));
-            }
-        }
-
-        if self.state_sync {
-            // spawn state sync load test
-            for (state_sync_sender, state_sync_events) in state_sync_handlers {
-                state_sync_task.push(tokio::task::spawn(state_sync_load_test(
-                    duration,
-                    state_sync_sender,
-                    state_sync_events,
-                )));
-            }
-        }
-
-        // await on all spawned tasks
-        tokio::time::sleep(Duration::from_secs(self.duration)).await;
-        if let Some(j) = emit_job {
-            let stats = txn_emitter.stop_job(j).await;
-            let mut sender = &mut context.root_account;
-            let receiver = diem_root_address();
-            let tx_factory = TransactionFactory::new(ChainId::test());
-            let dummy_tx = gen_transfer_txn_request(&mut sender, &receiver, 0, &tx_factory, 0);
-            let total_byte = dummy_tx.raw_txn_bytes_len() as u64 * stats.submitted;
-            info!("Total tx emitter stats: {}, bytes: {}", stats, total_byte);
-            info!(
-                "Average rate: {}, {} bytes/s",
-                stats.rate(Duration::from_secs(self.duration)),
-                total_byte / Duration::from_secs(self.duration).as_secs()
-            );
-        }
-
-        let mut mempool_stats = MempoolStats::default();
-        for task in mempool_task {
-            let stats = task.await?.expect("failed mempool load test task");
-            mempool_stats = mempool_stats + stats;
-        }
-        if self.mempool {
-            info!("Total mempool stats: {}", mempool_stats);
-            info!(
-                "Average rate: {}",
-                mempool_stats.rate(Duration::from_secs(self.duration))
-            );
-        }
-
-        let mut state_sync_stats = StateSyncStats::default();
-        for task in state_sync_task {
-            let stats = task.await?.expect("failed state sync load test task");
-            state_sync_stats = state_sync_stats + stats;
-        }
-        if self.state_sync {
-            info!("Total state sync stats: {}", state_sync_stats);
-            info!(
-                "Average rate: {}",
-                state_sync_stats.rate(Duration::from_secs(self.duration))
-            );
-        }
-
-        // create blocking context to drop stubbed node's runtime in
-        // We cannot drop a runtime in an async context where blocking is not allowed - otherwise,
-        // this thread will panic.
-        tokio::task::spawn_blocking(move || {
-            drop(network_runtime);
-        })
-        .await?;
-
-        Ok(())
-    }
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(self.duration + EXPERIMENT_BUFFER_SECS)
-    }
-}
-
-impl fmt::Display for LoadTest {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(
-            f,
-            "Load test components: mempool: {}, state sync: {}, emit txns: {}",
-            self.mempool, self.state_sync, self.emit_txn,
-        )
-    }
-}
-
-async fn get_stubbed_nodes(
-    endpoint: String,
-    runtime_handle: Handle,
-    num_of_nodes: usize,
-) -> Vec<StubbedNode> {
-    let mut nodes = vec![];
-    for i in 0..num_of_nodes {
-        nodes.push(StubbedNode::launch(endpoint.clone(), runtime_handle.clone(), i).await);
-    }
-    nodes
-}
-
-// An actor that can participate in DiemNet
-// Connects to VFN via on-chain discovery and interact with it via mempool and state sync protocol
-// It is 'stubbed' in the sense that it has no real node components running and only network stubs
-// that interact with the remote VFN via DiemNet mempool and state sync protocol
-struct StubbedNode {
-    pub mempool_handle: Option<(MempoolNetworkSender, MempoolNetworkEvents)>,
-    pub state_sync_handle: Option<(StateSyncSender, StateSyncEvents)>,
-}
-
-impl StubbedNode {
-    async fn launch(node_endpoint: String, runtime_handle: Handle, index: usize) -> Self {
-        // generate seed peers config from querying node endpoint
-        let seed_peers =
-            seed_peer_generator::utils::gen_validator_full_node_seed_peer_config(node_endpoint)
-                .unwrap();
-
-        // build sparse network runner
-
-        let mut pfn_config = NodeConfig::default_for_public_full_node();
-
-        // some sanity checks on the network the stubbed node will be running in
-        assert_eq!(
-            pfn_config.full_node_networks.len(),
-            1,
-            "expected only one fn network for PFN"
-        );
-        let network_config = &mut pfn_config.full_node_networks[0];
-        // this dummy listen address is not used
-        network_config.listen_address = format!("/ip4/127.0.0.1/tcp/{}", 6180 + index)
-            .parse()
-            .unwrap();
-        assert_eq!(network_config.network_id, NetworkId::Public);
-
-        let mut network_builder = NetworkBuilder::create(
-            ChainId::test(),
-            pfn_config.base.role,
-            network_config,
-            TimeService::real(),
-            None,
-            PeerMetadataStorage::new(&[network_config.network_id]),
-        );
-
-        let state_sync_handle = Some(
-            network_builder.add_p2p_service(&state_sync_v1::network::network_endpoint_config()),
-        );
-
-        let mempool_handle = Some(network_builder.add_p2p_service(
-            &diem_mempool::network::network_endpoint_config(
-                pfn_config.mempool.max_broadcasts_per_peer,
-            ),
-        ));
-
-        network_builder.build(runtime_handle);
-        network_builder.start();
-
-        // feed the network builder the seed peer config
-        let mut conn_req_tx = network_builder
-            .conn_mgr_reqs_tx()
-            .expect("expecting connectivity mgr to exist after adding protocol handler");
-
-        conn_req_tx
-            .send(ConnectivityRequest::UpdateDiscoveredPeers(
-                DiscoverySource::OnChainValidatorSet,
-                seed_peers,
-            ))
-            .await
-            .expect("failed to send conn req");
-
-        Self {
-            mempool_handle,
-            state_sync_handle,
-        }
-    }
-}
-
-async fn mempool_load_test(
-    duration: Duration,
-    sender: MempoolNetworkSender,
-    mut events: MempoolNetworkEvents,
-) -> Result<MempoolStats> {
-    let new_peer_event = events.select_next_some().await;
-    let vfn = if let Event::NewPeer(metadata) = new_peer_event {
-        metadata.remote_peer_id
-    } else {
-        return Err(anyhow::format_err!(
-            "received unexpected network event for mempool load test"
-        ));
-    };
-
-    let mut bytes = 0_u64;
-    let mut msg_num = 0_u64;
-    let task_start = Instant::now();
-    while Instant::now().duration_since(task_start) < duration {
-        let msg = diem_mempool::network::MempoolSyncMsg::BroadcastTransactionsRequest {
-            request_id: bcs::to_bytes("request_id")?,
-            transactions: vec![], // TODO submit actual txns
-        };
-        // TODO log stats for bandwidth sent to remote peer to MempoolResult
-        bytes += bcs::to_bytes(&msg)?.len() as u64;
-        msg_num += 1;
-        sender.send_to(vfn, msg)?;
-
-        // await ACK from remote peer
-        let _response = events.select_next_some().await;
-    }
-
-    Ok(MempoolStats {
-        bytes,
-        tx_num: 0,
-        msg_num,
-    })
-}
-
-#[derive(Debug, Default)]
-struct MempoolStats {
-    bytes: u64,
-    tx_num: u64,
-    msg_num: u64,
-}
-
-#[derive(Debug, Default)]
-pub struct MempoolStatsRate {
-    pub bytes: u64,
-    pub tx_num: u64,
-    pub msg_num: u64,
-}
-
-impl MempoolStats {
-    pub fn rate(&self, window: Duration) -> MempoolStatsRate {
-        MempoolStatsRate {
-            bytes: self.bytes / window.as_secs(),
-            tx_num: self.tx_num / window.as_secs(),
-            msg_num: self.msg_num / window.as_secs(),
-        }
-    }
-}
-
-impl Add for MempoolStats {
-    type Output = MempoolStats;
-
-    fn add(self, other: MempoolStats) -> MempoolStats {
-        MempoolStats {
-            bytes: self.bytes + other.bytes,
-            tx_num: self.tx_num + other.tx_num,
-            msg_num: self.msg_num + other.msg_num,
-        }
-    }
-}
-
-impl fmt::Display for MempoolStats {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "exchanged {} messages, {} bytes",
-            self.msg_num, self.bytes,
-        )
-    }
-}
-
-impl fmt::Display for MempoolStatsRate {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "exchanged {} messages/s, {} bytes/s",
-            self.msg_num, self.bytes,
-        )
-    }
-}
-
-async fn state_sync_load_test(
-    duration: Duration,
-    sender: StateSyncSender,
-    mut events: StateSyncEvents,
-) -> Result<StateSyncStats> {
-    let new_peer_event = events.select_next_some().await;
-    let vfn = if let Event::NewPeer(metadata) = new_peer_event {
-        metadata.remote_peer_id
-    } else {
-        return Err(anyhow::format_err!(
-            "received unexpected network event for state sync load test"
-        ));
-    };
-
-    let chunk_request = state_sync_v1::chunk_request::GetChunkRequest::new(
-        1,
-        1,
-        1000,
-        state_sync_v1::chunk_request::TargetType::HighestAvailable {
-            target_li: None,
-            timeout_ms: 10_000,
-        },
-    );
-
-    let task_start = Instant::now();
-    let mut served_txns = 0_u64;
-    let mut bytes = 0_u64;
-    let mut msg_num = 0_u64;
-    while Instant::now().duration_since(task_start) < duration {
-        use state_sync_v1::network::StateSyncMessage::*;
-
-        let msg = GetChunkRequest(Box::new(chunk_request.clone()));
-        bytes += bcs::to_bytes(&msg)?.len() as u64;
-        msg_num += 1;
-        sender.send_to(vfn, msg)?;
-
-        // await response from remote peer
-        let response = events.select_next_some().await;
-        if let Event::Message(_remote_peer, GetChunkResponse(chunk_response)) = response {
-            // TODO analyze response and update StateSyncResult with stats accordingly
-            served_txns += chunk_response.txn_list_with_proof.transactions.len() as u64;
-        }
-    }
-    Ok(StateSyncStats {
-        served_txns,
-        bytes,
-        msg_num,
-    })
-}
-
-#[derive(Debug, Default)]
-struct StateSyncStats {
-    served_txns: u64,
-    bytes: u64,
-    msg_num: u64,
-}
-
-#[derive(Debug, Default)]
-pub struct StateSyncStatsRate {
-    pub served_txns: u64,
-    pub bytes: u64,
-    pub msg_num: u64,
-}
-
-impl Add for StateSyncStats {
-    type Output = StateSyncStats;
-
-    fn add(self, other: StateSyncStats) -> StateSyncStats {
-        StateSyncStats {
-            served_txns: self.served_txns + other.served_txns,
-            bytes: self.bytes + other.bytes,
-            msg_num: self.msg_num + other.msg_num,
-        }
-    }
-}
-
-impl StateSyncStats {
-    pub fn rate(&self, window: Duration) -> StateSyncStatsRate {
-        StateSyncStatsRate {
-            served_txns: self.served_txns / window.as_secs(),
-            bytes: self.bytes / window.as_secs(),
-            msg_num: self.msg_num / window.as_secs(),
-        }
-    }
-}
-
-impl fmt::Display for StateSyncStats {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "received {} txs, exchanged {} messages, {} bytes, ",
-            self.served_txns, self.msg_num, self.bytes
-        )
-    }
-}
-
-impl fmt::Display for StateSyncStatsRate {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "received {} txs/s, exchanged {} msg/s, {} bytes/s, ",
-            self.served_txns, self.msg_num, self.bytes,
-        )
-    }
-}
diff --git a/testsuite/cluster-test/src/experiments/mod.rs b/testsuite/cluster-test/src/experiments/mod.rs
deleted file mode 100644
index 89c4b62b86f26..0000000000000
--- a/testsuite/cluster-test/src/experiments/mod.rs
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-mod accurate_measurment;
-mod compatibility_test;
-mod cpu_flamegraph;
-mod load_test;
-mod packet_loss_random_validators;
-mod performance_benchmark;
-mod performance_benchmark_three_region_simulation;
-mod reboot_cluster;
-mod reboot_random_validators;
-mod reconfiguration_test;
-mod recovery_time;
-mod state_sync_performance;
-mod twin_validator;
-mod versioning_test;
-
-use crate::{
-    cluster::Cluster,
-    cluster_builder::{ClusterBuilder, ClusterBuilderParams},
-    cluster_swarm::{cluster_swarm_kube::ClusterSwarmKube, ClusterSwarm},
-    experiments::accurate_measurment::AccurateMeasurementParams,
-    prometheus::Prometheus,
-    report::SuiteReport,
-};
-use async_trait::async_trait;
-use diem_sdk::types::LocalAccount;
-use forge::EmitJobRequest;
-use std::{
-    collections::{HashMap, HashSet},
-    fmt::Display,
-    time::Duration,
-};
-use structopt::{clap::AppSettings, StructOpt};
-
-pub use compatibility_test::{CompatibilityTest, CompatiblityTestParams};
-pub use cpu_flamegraph::{CpuFlamegraph, CpuFlamegraphParams};
-pub use load_test::LoadTestParams;
-pub use packet_loss_random_validators::{
-    PacketLossRandomValidators, PacketLossRandomValidatorsParams,
-};
-pub use performance_benchmark::{PerformanceBenchmark, PerformanceBenchmarkParams};
-pub use performance_benchmark_three_region_simulation::{
-    PerformanceBenchmarkThreeRegionSimulation, PerformanceBenchmarkThreeRegionSimulationParams,
-};
-pub use reboot_cluster::{RebootCluster, RebootClusterParams};
-pub use reboot_random_validators::{RebootRandomValidators, RebootRandomValidatorsParams};
-pub use reconfiguration_test::{Reconfiguration, ReconfigurationParams};
-pub use recovery_time::{RecoveryTime, RecoveryTimeParams};
-pub use state_sync_performance::{StateSyncPerformance, StateSyncPerformanceParams};
-pub use twin_validator::{TwinValidators, TwinValidatorsParams};
-pub use versioning_test::{ValidatorVersioning, ValidatorVersioningParams};
-
-#[async_trait]
-pub trait Experiment: Display + Send {
-    fn affected_validators(&self) -> HashSet<String> {
-        HashSet::new()
-    }
-    async fn run(&mut self, context: &mut Context<'_>) -> anyhow::Result<()>;
-    fn deadline(&self) -> Duration;
-}
-
-pub trait ExperimentParam {
-    type E: Experiment;
-    fn build(self, cluster: &Cluster) -> Self::E;
-}
-
-pub struct Context<'a> {
-    pub root_account: &'a mut LocalAccount,
-    pub treasury_compliance_account: &'a mut LocalAccount,
-    pub designated_dealer_account: &'a mut LocalAccount,
-    pub prometheus: &'a Prometheus,
-    pub cluster_builder: &'a mut ClusterBuilder,
-    pub cluster_builder_params: &'a ClusterBuilderParams,
-    pub cluster: &'a Cluster,
-    pub report: &'a mut SuiteReport,
-    pub global_emit_job_request: &'a mut Option<EmitJobRequest>,
-    pub emit_to_validator: bool,
-    pub cluster_swarm: &'a dyn ClusterSwarm,
-    /// Current docker image tag used by this run
-    pub current_tag: &'a str,
-}
-
-impl<'a> Context<'a> {
-    pub fn new(
-        root_account: &'a mut LocalAccount,
-        treasury_compliance_account: &'a mut LocalAccount,
-        designated_dealer_account: &'a mut LocalAccount,
-        prometheus: &'a Prometheus,
-        cluster_builder: &'a mut ClusterBuilder,
-        cluster_builder_params: &'a ClusterBuilderParams,
-        cluster: &'a Cluster,
-        report: &'a mut SuiteReport,
-        emit_job_request: &'a mut Option<EmitJobRequest>,
-        emit_to_validator: bool,
-        cluster_swarm: &'a ClusterSwarmKube,
-        current_tag: &'a str,
-    ) -> Self {
-        Context {
-            root_account,
-            treasury_compliance_account,
-            designated_dealer_account,
-            prometheus,
-            cluster_builder,
-            cluster_builder_params,
-            cluster,
-            report,
-            global_emit_job_request: emit_job_request,
-            emit_to_validator,
-            cluster_swarm,
-            current_tag,
-        }
-    }
-}
-
-fn from_args<P: ExperimentParam>(args: &[String], cluster: &Cluster) -> Box<dyn Experiment>
-where
-    P: StructOpt + 'static,
-{
-    let params = P::from_clap(
-        &P::clap()
-            .global_setting(AppSettings::NoBinaryName)
-            .get_matches_from(args),
-    );
-    Box::new(params.build(cluster))
-}
-
-/// Given an experiment name and its flags, it constructs an instance of that experiment
-/// and returns it as a `Box<dyn Experiment>`
-pub fn get_experiment(name: &str, args: &[String], cluster: &Cluster) -> Box<dyn Experiment> {
-    fn f<P: ExperimentParam + StructOpt + 'static>(
-    ) -> Box<dyn Fn(&[String], &Cluster) -> Box<dyn Experiment>> {
-        Box::new(from_args::<P>)
-    }
-
-    let mut known_experiments = HashMap::new();
-
-    known_experiments.insert("recovery_time", f::<RecoveryTimeParams>());
-    known_experiments.insert(
-        "packet_loss_random_validators",
-        f::<PacketLossRandomValidatorsParams>(),
-    );
-    known_experiments.insert("bench", f::<PerformanceBenchmarkParams>());
-    known_experiments.insert(
-        "bench_three_region",
-        f::<PerformanceBenchmarkThreeRegionSimulationParams>(),
-    );
-    known_experiments.insert(
-        "reboot_random_validators",
-        f::<RebootRandomValidatorsParams>(),
-    );
-    known_experiments.insert("twin", f::<TwinValidatorsParams>());
-    known_experiments.insert("generate_cpu_flamegraph", f::<CpuFlamegraphParams>());
-    known_experiments.insert("versioning_testing", f::<ValidatorVersioningParams>());
-    known_experiments.insert("compatibility_test", f::<CompatiblityTestParams>());
-    known_experiments.insert("reboot_cluster", f::<RebootClusterParams>());
-    known_experiments.insert("reconfiguration", f::<ReconfigurationParams>());
-    known_experiments.insert("load_test", f::<LoadTestParams>());
-    known_experiments.insert("state_sync_performance", f::<StateSyncPerformanceParams>());
-    known_experiments.insert("measure", f::<AccurateMeasurementParams>());
-
-    let builder = known_experiments.get(name).expect("Experiment not found");
-    builder(args, cluster)
-}
diff --git a/testsuite/cluster-test/src/experiments/packet_loss_random_validators.rs b/testsuite/cluster-test/src/experiments/packet_loss_random_validators.rs
deleted file mode 100644
index a8bc07ef16487..0000000000000
--- a/testsuite/cluster-test/src/experiments/packet_loss_random_validators.rs
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-/// This module provides an experiment which introduces packet loss for
-/// a given number of instances in the cluster. It undoes the packet loss
-/// in the cluster after the given duration
-use crate::{
-    cluster::Cluster,
-    effects::{self, packet_loss::PacketLoss},
-    experiments::{Context, Experiment, ExperimentParam},
-    instance::Instance,
-};
-
-use async_trait::async_trait;
-use std::{fmt, time::Duration};
-use structopt::StructOpt;
-
-pub struct PacketLossRandomValidators {
-    instances: Vec<Instance>,
-    percent: f32,
-    duration: Duration,
-}
-use tokio::time;
-
-#[derive(StructOpt, Debug)]
-pub struct PacketLossRandomValidatorsParams {
-    #[structopt(
-        long,
-        default_value = "10",
-        help = "Percent of instances in which packet loss should be introduced"
-    )]
-    percent_instances: f32,
-    #[structopt(
-        long,
-        default_value = "10",
-        help = "Percent of packet loss for each instance"
-    )]
-    packet_loss_percent: f32,
-    #[structopt(
-        long,
-        default_value = "60",
-        help = "Duration in secs for which packet loss happens"
-    )]
-    duration_secs: u64,
-}
-
-impl ExperimentParam for PacketLossRandomValidatorsParams {
-    type E = PacketLossRandomValidators;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        let total_instances = cluster.validator_instances().len();
-        let packet_loss_num_instances: usize = std::cmp::min(
-            ((self.percent_instances / 100.0) * total_instances as f32).ceil() as usize,
-            total_instances,
-        );
-        let (test_cluster, _) = cluster.split_n_validators_random(packet_loss_num_instances);
-        Self::E {
-            instances: test_cluster.into_validator_instances(),
-            percent: self.packet_loss_percent,
-            duration: Duration::from_secs(self.duration_secs),
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for PacketLossRandomValidators {
-    async fn run(&mut self, _context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut effects: Vec<_> = self
-            .instances
-            .clone()
-            .into_iter()
-            .map(|instance| PacketLoss::new(instance, self.percent))
-            .collect();
-        effects::activate_all(&mut effects).await?;
-        time::sleep(self.duration).await;
-        effects::deactivate_all(&mut effects).await?;
-        Ok(())
-    }
-
- Duration::from_secs(20 * 60) - } -} - -impl fmt::Display for PacketLossRandomValidators { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Packet Loss {:.*}% [", 2, self.percent)?; - for instance in self.instances.iter() { - write!(f, "{}, ", instance)?; - } - write!(f, "]") - } -} diff --git a/testsuite/cluster-test/src/experiments/performance_benchmark.rs b/testsuite/cluster-test/src/experiments/performance_benchmark.rs deleted file mode 100644 index 7f83a817af983..0000000000000 --- a/testsuite/cluster-test/src/experiments/performance_benchmark.rs +++ /dev/null @@ -1,312 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -use crate::{ - cluster::Cluster, - experiments::{Context, Experiment, ExperimentParam}, - instance, - instance::Instance, - stats::PrometheusRangeView, - util::human_readable_bytes_per_sec, -}; -use anyhow::{anyhow, Result}; -use async_trait::async_trait; -use diem_infallible::duration_since_epoch; -use diem_logger::{info, warn}; -use diem_sdk::transaction_builder::TransactionFactory; -use forge::{EmitJobRequest, TxnEmitter, TxnStats}; -use futures::{future::try_join_all, FutureExt}; -use rand::{ - prelude::StdRng, - rngs::{OsRng, ThreadRng}, - seq::SliceRandom, - Rng, SeedableRng, -}; -use std::{ - collections::HashSet, - convert::TryInto, - fmt::{Display, Error, Formatter}, - time::Duration, -}; -use structopt::StructOpt; -use tokio::task::JoinHandle; - -#[derive(StructOpt, Debug)] -pub struct PerformanceBenchmarkParams { - #[structopt( - long, - default_value = "0", - help = "Percent of nodes which should be down" - )] - pub percent_nodes_down: usize, - #[structopt( - long, - default_value = Box::leak(format!("{}", DEFAULT_BENCH_DURATION).into_boxed_str()), - help = "Duration of an experiment in seconds" - )] - pub duration: u64, - #[structopt(long, help = "Set fixed tps during perf experiment")] - pub tps: Option, - #[structopt( - long, - help = "Whether benchmark should pick one node to run DB backup." 
- )] - pub backup: bool, - #[structopt(long, default_value = "0", help = "Set gas price in tx")] - pub gas_price: u64, - #[structopt(long, help = "Set periodic stat aggregator step")] - pub periodic_stats: Option, - #[structopt(long, default_value = "0", help = "Set percentage of invalid tx")] - pub invalid_tx: u64, -} - -pub struct PerformanceBenchmark { - down_validators: Vec, - up_validators: Vec, - up_fullnodes: Vec, - percent_nodes_down: usize, - duration: Duration, - tps: Option, - backup: bool, - gas_price: u64, - periodic_stats: Option, - invalid_tx: u64, -} - -pub const DEFAULT_BENCH_DURATION: u64 = 120; - -impl PerformanceBenchmarkParams { - pub fn new_nodes_down(percent_nodes_down: usize) -> Self { - Self { - percent_nodes_down, - duration: DEFAULT_BENCH_DURATION, - tps: None, - backup: false, - gas_price: 0, - periodic_stats: None, - invalid_tx: 0, - } - } - - pub fn new_fixed_tps(percent_nodes_down: usize, fixed_tps: u64) -> Self { - Self { - percent_nodes_down, - duration: DEFAULT_BENCH_DURATION, - tps: Some(fixed_tps), - backup: false, - gas_price: 0, - periodic_stats: None, - invalid_tx: 0, - } - } - - pub fn non_zero_gas_price(percent_nodes_down: usize, gas_price: u64) -> Self { - Self { - percent_nodes_down, - duration: DEFAULT_BENCH_DURATION, - tps: None, - backup: false, - gas_price, - periodic_stats: None, - invalid_tx: 0, - } - } - - pub fn mix_invalid_tx(percent_nodes_down: usize, invalid_tx: u64) -> Self { - Self { - percent_nodes_down, - duration: DEFAULT_BENCH_DURATION, - tps: None, - backup: false, - gas_price: 0, - periodic_stats: None, - invalid_tx, - } - } - - pub fn enable_db_backup(mut self) -> Self { - self.backup = true; - self - } -} - -impl ExperimentParam for PerformanceBenchmarkParams { - type E = PerformanceBenchmark; - fn build(self, cluster: &Cluster) -> Self::E { - let all_fullnode_instances = cluster.fullnode_instances(); - let num_nodes = cluster.validator_instances().len(); - let nodes_down = (num_nodes * self.percent_nodes_down) / 100; - let (down, up) = cluster.split_n_validators_random(nodes_down); - let up_validators = up.into_validator_instances(); - let up_fullnodes: Vec<_> = up_validators - .iter() - .filter_map(|val| { - all_fullnode_instances - .iter() - .find(|x| val.validator_group() == x.validator_group()) - .cloned() - }) - .collect(); - Self::E { - down_validators: down.into_validator_instances(), - up_validators, - up_fullnodes, - percent_nodes_down: self.percent_nodes_down, - duration: Duration::from_secs(self.duration), - tps: self.tps, - backup: self.backup, - gas_price: self.gas_price, - periodic_stats: self.periodic_stats, - invalid_tx: self.invalid_tx, - } - } -} - -#[async_trait] -impl Experiment for PerformanceBenchmark { - fn affected_validators(&self) -> HashSet { - instance::instancelist_to_set(&self.down_validators) - } - - async fn run(&mut self, context: &mut Context<'_>) -> Result<()> { - let mut txn_emitter = TxnEmitter::new( - &mut context.treasury_compliance_account, - &mut context.designated_dealer_account, - context.cluster.random_validator_instance().rest_client(), - TransactionFactory::new(context.cluster.chain_id), - StdRng::from_seed(OsRng.gen()), - ); - let futures: Vec<_> = self.down_validators.iter().map(Instance::stop).collect(); - try_join_all(futures).await?; - - let backup = self.maybe_start_backup()?; - let buffer = Duration::from_secs(60); - let window = self.duration + buffer * 2; - let instances = if context.emit_to_validator { - self.up_validators.clone() - } else { - self.up_fullnodes.clone() - 
diff --git a/testsuite/cluster-test/src/experiments/performance_benchmark_three_region_simulation.rs b/testsuite/cluster-test/src/experiments/performance_benchmark_three_region_simulation.rs
deleted file mode 100644
index 00cd11cd915a8..0000000000000
--- a/testsuite/cluster-test/src/experiments/performance_benchmark_three_region_simulation.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-use crate::{
-    cluster::Cluster,
-    effects::{self, network_delay},
-    experiments::{Context, Experiment, ExperimentParam},
-};
-use anyhow::Result;
-use async_trait::async_trait;
-use diem_sdk::transaction_builder::TransactionFactory;
-use forge::TxnEmitter;
-use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng};
-use std::{
-    fmt::{Display, Error, Formatter},
-    time::Duration,
-};
-use structopt::StructOpt;
-
-pub struct PerformanceBenchmarkThreeRegionSimulation {
-    cluster: Cluster,
-}
-
-#[derive(StructOpt, Debug)]
-pub struct PerformanceBenchmarkThreeRegionSimulationParams {}
-
-impl ExperimentParam for PerformanceBenchmarkThreeRegionSimulationParams {
-    type E = PerformanceBenchmarkThreeRegionSimulation;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        Self::E {
-            cluster: cluster.clone(),
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for PerformanceBenchmarkThreeRegionSimulation {
-    async fn run(&mut self, context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-
-        let num_nodes = self.cluster.validator_instances().len();
-        let split_country_num = ((num_nodes as f64) * 0.8) as usize;
-        let split_region_num = split_country_num / 2;
-        let (us, euro) = self.cluster.split_n_validators_random(split_country_num);
-        let (us_west, us_east) = us.split_n_validators_random(split_region_num);
-        let mut effects = network_delay::three_region_simulation_effects(
-            (
-                us_west.validator_instances().to_vec(),
-                us_east.validator_instances().to_vec(),
-                euro.validator_instances().to_vec(),
-            ),
-            (
-                Duration::from_millis(60), // us_east<->eu one way delay
-                Duration::from_millis(95), // us_west<->eu one way delay
-                Duration::from_millis(40), // us_west<->us_east one way delay
-            ),
-        );
-
-        effects::activate_all(&mut effects).await?;
-
-        let window = Duration::from_secs(240);
-        let emit_job_request = if context.emit_to_validator {
-            crate::util::emit_job_request_for_instances(
-                context.cluster.validator_instances().to_vec(),
-                context.global_emit_job_request,
-                0,
-                0,
-            )
-        } else {
-            crate::util::emit_job_request_for_instances(
-                context.cluster.fullnode_instances().to_vec(),
-                context.global_emit_job_request,
-                0,
-                0,
-            )
-        };
-        let stats = txn_emitter.emit_txn_for(window, emit_job_request).await?;
-        effects::deactivate_all(&mut effects).await?;
-        context
-            .report
-            .report_txn_stats(self.to_string(), stats, window, "");
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(600)
-    }
-}
-
-impl Display for PerformanceBenchmarkThreeRegionSimulation {
-    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
-        write!(f, "3 Region Simulation")
-    }
-}
diff --git a/testsuite/cluster-test/src/experiments/reboot_cluster.rs b/testsuite/cluster-test/src/experiments/reboot_cluster.rs
deleted file mode 100644
index d2c5f7c1f7f59..0000000000000
--- a/testsuite/cluster-test/src/experiments/reboot_cluster.rs
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use std::{fmt, time::Duration};
-
-use crate::{
-    cluster::Cluster,
-    experiments::{Context, Experiment, ExperimentParam},
-    instance,
-    instance::Instance,
-};
-use async_trait::async_trait;
-use diem_logger::info;
-use futures::future::try_join_all;
-use std::{
-    collections::HashSet,
-    fmt::{Error, Formatter},
-};
-use structopt::StructOpt;
-use tokio::time;
-
-#[derive(StructOpt, Debug)]
-pub struct RebootClusterParams {}
-
-pub struct RebootCluster {
-    instances: Vec<Instance>,
-}
-
-impl ExperimentParam for RebootClusterParams {
-    type E = RebootCluster;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        Self::E {
-            instances: <&[instance::Instance]>::clone(&cluster.validator_instances()).to_vec(),
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for RebootCluster {
-    fn affected_validators(&self) -> HashSet<String> {
-        instance::instancelist_to_set(&self.instances)
-    }
-
-    async fn run(&mut self, _context: &mut Context<'_>) -> anyhow::Result<()> {
-        let futures: Vec<_> = self.instances.iter().map(Instance::stop).collect();
-        try_join_all(futures).await?;
-        for inst in &self.instances {
-            info!("Starting node {}", inst.peer_name());
-            inst.start().await?;
-            time::sleep(Duration::from_secs(10)).await;
-        }
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(20 * 60)
-    }
-}
-
-impl fmt::Display for RebootCluster {
-    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
-        write!(f, "Reboot cluster")
-    }
-}
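RebootCluster is the simplest restart experiment: stop every validator in parallel, then bring them back one at a time with a fixed settle delay. A reduced sketch of that stop-all/start-sequentially shape, with print stubs standing in for the async instance API; tokio and futures are assumed dependencies, and the node names are made up:

use std::time::Duration;

// Stand-ins for Instance::stop / Instance::start, which are async RPCs upstream.
async fn stop(node: &str) { println!("stopping {}", node); }
async fn start(node: &str) { println!("starting {}", node); }

#[tokio::main]
async fn main() {
    let nodes = ["val-0", "val-1", "val-2"];
    // Stop everything concurrently, as try_join_all does above.
    futures::future::join_all(nodes.iter().map(|n| stop(n))).await;
    // Restart one node at a time, giving each ten seconds to settle.
    for n in &nodes {
        start(n).await;
        tokio::time::sleep(Duration::from_secs(10)).await;
    }
}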
diff --git a/testsuite/cluster-test/src/experiments/reboot_random_validators.rs b/testsuite/cluster-test/src/experiments/reboot_random_validators.rs
deleted file mode 100644
index 8671090d0ffb6..0000000000000
--- a/testsuite/cluster-test/src/experiments/reboot_random_validators.rs
+++ /dev/null
@@ -1,116 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use std::{collections::HashSet, fmt, time::Duration};
-
-use rand::seq::SliceRandom;
-
-use crate::{
-    cluster::Cluster,
-    effects::{self, stop_validator::StopValidator},
-    experiments::{Context, Experiment, ExperimentParam},
-    instance,
-    instance::Instance,
-};
-use async_trait::async_trait;
-use structopt::StructOpt;
-
-#[derive(StructOpt, Debug)]
-pub struct RebootRandomValidatorsParams {
-    #[structopt(
-        long,
-        default_value = "10",
-        help = "Number of validator nodes to reboot"
-    )]
-    count: usize,
-    #[structopt(long, default_value = "0", help = "Number of lsr nodes to reboot")]
-    lsr_count: usize,
-}
-
-impl RebootRandomValidatorsParams {
-    pub fn new(validator_count: usize, lsr_count: usize) -> Self {
-        Self {
-            count: validator_count,
-            lsr_count,
-        }
-    }
-}
-
-pub struct RebootRandomValidators {
-    instances: Vec<Instance>,
-}
-
-impl ExperimentParam for RebootRandomValidatorsParams {
-    type E = RebootRandomValidators;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        if self.count > cluster.validator_instances().len() {
-            panic!(
-                "Cannot reboot {} validators in a cluster with {} instances",
-                self.count,
-                cluster.validator_instances().len()
-            );
-        }
-
-        if self.lsr_count > cluster.lsr_instances().len() {
-            panic!(
-                "Cannot reboot {} lsrs in a cluster with {} instances",
-                self.lsr_count,
-                cluster.lsr_instances().len()
-            );
-        }
-
-        let mut rnd = rand::thread_rng();
-        let mut instances = Vec::with_capacity(self.count + self.lsr_count);
-        instances.append(
-            &mut cluster
-                .validator_instances()
-                .choose_multiple(&mut rnd, self.count)
-                .cloned()
-                .collect(),
-        );
-        instances.append(
-            &mut cluster
-                .lsr_instances()
-                .choose_multiple(&mut rnd, self.lsr_count)
-                .cloned()
-                .collect(),
-        );
-
-        Self::E { instances }
-    }
-}
-
-#[async_trait]
-impl Experiment for RebootRandomValidators {
-    fn affected_validators(&self) -> HashSet<String> {
-        instance::instancelist_to_set(&self.instances)
-    }
-
-    async fn run(&mut self, _context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut effects: Vec<_> = self
-            .instances
-            .clone()
-            .into_iter()
-            .map(StopValidator::new)
-            .collect();
-        effects::activate_all(&mut effects).await?;
-        effects::deactivate_all(&mut effects).await?;
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(20 * 60)
-    }
-}
-
-impl fmt::Display for RebootRandomValidators {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "Reboot [")?;
-        for instance in self.instances.iter() {
-            write!(f, "{}, ", instance)?;
-        }
-        write!(f, "]")
-    }
-}
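RebootRandomValidators leans on the crate's Effect pattern: build a list of effects, activate them all, then deactivate them all, so the disruption is always unwound. A generic sketch of that contract; the trait and StopNode type here are simplifications of the crate's own Effect machinery, not copies of it, and anyhow, async-trait, and tokio are assumed dependencies:

use async_trait::async_trait;

#[async_trait]
trait Effect {
    async fn activate(&mut self) -> anyhow::Result<()>;
    async fn deactivate(&mut self) -> anyhow::Result<()>;
}

struct StopNode(String);

#[async_trait]
impl Effect for StopNode {
    async fn activate(&mut self) -> anyhow::Result<()> {
        println!("stopping {}", self.0); // the real effect stops a validator pod
        Ok(())
    }
    async fn deactivate(&mut self) -> anyhow::Result<()> {
        println!("restarting {}", self.0);
        Ok(())
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut effects: Vec<StopNode> = (0..3).map(|i| StopNode(format!("val-{}", i))).collect();
    // Activate everything, then deactivate everything, as the experiment does.
    for e in effects.iter_mut() { e.activate().await?; }
    for e in effects.iter_mut() { e.deactivate().await?; }
    Ok(())
}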
diff --git a/testsuite/cluster-test/src/experiments/reconfiguration_test.rs b/testsuite/cluster-test/src/experiments/reconfiguration_test.rs
deleted file mode 100644
index d489689770fb2..0000000000000
--- a/testsuite/cluster-test/src/experiments/reconfiguration_test.rs
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use crate::{
-    cluster::Cluster,
-    experiments::{Context, Experiment, ExperimentParam},
-    instance::Instance,
-};
-use anyhow::ensure;
-use async_trait::async_trait;
-use diem_client::Client;
-use diem_logger::prelude::*;
-use diem_operational_tool::json_rpc::JsonRpcClientWrapper;
-use diem_sdk::transaction_builder::TransactionFactory;
-use diem_types::{
-    account_address::AccountAddress,
-    chain_id::ChainId,
-    ledger_info::LedgerInfoWithSignatures,
-    on_chain_config::{ConsensusConfigV2, OnChainConsensusConfig},
-};
-use forge::{execute_and_wait_transactions, TxnEmitter};
-use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng};
-use std::{
-    collections::HashSet,
-    fmt,
-    time::{Duration, Instant},
-};
-use structopt::StructOpt;
-
-#[derive(StructOpt, Debug)]
-pub struct ReconfigurationParams {
-    #[structopt(long, default_value = "101", help = "Number of epochs to trigger")]
-    pub count: u64,
-    #[structopt(long, help = "Emit p2p transfer transactions during experiment")]
-    pub emit_txn: bool,
-}
-
-pub struct Reconfiguration {
-    affected_peer_id: AccountAddress,
-    affected_pod_name: String,
-    count: u64,
-    emit_txn: bool,
-}
-
-impl ExperimentParam for ReconfigurationParams {
-    type E = Reconfiguration;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        let full_node = cluster.random_fullnode_instance();
-        let client = JsonRpcClientWrapper::new(full_node.json_rpc_url().into());
-        let validator_info = client
-            .validator_set(None)
-            .expect("Unable to fetch validator set");
-        let affected_peer_id = *validator_info[0].account_address();
-        let validator_config = client
-            .validator_config(affected_peer_id)
-            .expect("Unable to fetch validator config");
-        let affected_pod_name = std::str::from_utf8(&validator_config.human_name)
-            .unwrap()
-            .to_string();
-        Self::E {
-            affected_peer_id,
-            affected_pod_name,
-            count: self.count,
-            emit_txn: self.emit_txn,
-        }
-    }
-}
-
-async fn expect_epoch(
-    client: &Client,
-    known_version: u64,
-    expected_epoch: u64,
-) -> anyhow::Result<u64> {
-    let state_proof = client.get_state_proof(known_version).await?.into_inner();
-    let li: LedgerInfoWithSignatures = bcs::from_bytes(&state_proof.ledger_info_with_signatures)?;
-    let epoch = li.ledger_info().next_block_epoch();
-    ensure!(
-        epoch == expected_epoch,
-        "Expect epoch {}, actual {}",
-        expected_epoch,
-        epoch
-    );
-    info!("Epoch {} is committed", epoch);
-    Ok(li.ledger_info().version())
-}
-
-#[async_trait]
-impl Experiment for Reconfiguration {
-    fn affected_validators(&self) -> HashSet<String> {
-        let mut nodes = HashSet::new();
-        nodes.insert(self.affected_pod_name.clone());
-        nodes
-    }
-
-    async fn run(&mut self, context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-        let full_node = context.cluster.random_fullnode_instance();
-        let tx_factory = TransactionFactory::new(ChainId::test());
-        let full_node_client = full_node.rest_client();
-        let full_node_jsonrpc_client = full_node.json_rpc_client();
-        let mut diem_root_account = &mut context.root_account;
-        let allowed_nonce = 0;
-        let emit_job = if self.emit_txn {
-            info!("Start emitting txn");
-            let instances: Vec<Instance> = context
-                .cluster
-                .validator_instances()
-                .iter()
-                .filter(|i| *i.peer_name() != self.affected_pod_name)
-                .cloned()
-                .collect();
-            Some(
-                txn_emitter
-                    .start_job(crate::util::emit_job_request_for_instances(
-                        instances,
-                        context.global_emit_job_request,
-                        0,
-                        0,
-                    ))
-                    .await?,
-            )
-        } else {
-            None
-        };
-
-        let timer = Instant::now();
-        let mut version = expect_epoch(&full_node_jsonrpc_client, 0, 1).await?;
-        {
-            info!("Remove and add back {}.", self.affected_pod_name);
-            let validator_name = self.affected_pod_name.as_bytes().to_vec();
-            let remove_txn = diem_root_account.sign_with_transaction_builder(
-                tx_factory.remove_validator_and_reconfigure(
-                    allowed_nonce,
-                    validator_name.clone(),
-                    self.affected_peer_id,
-                ),
-            );
-            execute_and_wait_transactions(
-                &full_node_client,
-                &mut diem_root_account,
-                vec![remove_txn],
-            )
-            .await?;
-            version = expect_epoch(&full_node_jsonrpc_client, version, 2).await?;
-            let add_txn = diem_root_account.sign_with_transaction_builder(
-                tx_factory.add_validator_and_reconfigure(
-                    allowed_nonce,
-                    validator_name.clone(),
-                    self.affected_peer_id,
-                ),
-            );
-            execute_and_wait_transactions(&full_node_client, &mut diem_root_account, vec![add_txn])
-                .await?;
-            version = expect_epoch(&full_node_jsonrpc_client, version, 3).await?;
-        }
-
-        {
-            info!("Switch decoupled-execution on and off repetitively.");
-            let upgrade_config = OnChainConsensusConfig::V2(ConsensusConfigV2 {
-                two_chain: true,
-                decoupled_execution: true,
-                back_pressure_limit: 10,
-                exclude_round: 20,
-            });
-            let downgrade_config = OnChainConsensusConfig::default();
-            for i in 1..self.count / 2 {
-                let upgrade_txn = diem_root_account.sign_with_transaction_builder(
-                    tx_factory.update_diem_consensus_config(
-                        allowed_nonce,
-                        bcs::to_bytes(&upgrade_config).unwrap(),
-                    ),
-                );
-                execute_and_wait_transactions(
-                    &full_node_client,
-                    &mut diem_root_account,
-                    vec![upgrade_txn],
-                )
-                .await?;
-                version = expect_epoch(&full_node_jsonrpc_client, version, (i + 1) * 2).await?;
-                let downgrade_txn = diem_root_account.sign_with_transaction_builder(
-                    tx_factory.update_diem_consensus_config(
-                        allowed_nonce,
-                        bcs::to_bytes(&downgrade_config).unwrap(),
-                    ),
-                );
-                execute_and_wait_transactions(
-                    &full_node_client,
-                    &mut diem_root_account,
-                    vec![downgrade_txn],
-                )
-                .await?;
-                version = expect_epoch(&full_node_jsonrpc_client, version, (i + 1) * 2 + 1).await?;
-            }
-        }
-
-        if self.count % 2 == 1 {
-            let magic_number = 42;
-            info!("Bump DiemVersion to {}", magic_number);
-            let update_txn = diem_root_account.sign_with_transaction_builder(
-                TransactionFactory::new(ChainId::test())
-                    .update_diem_version(allowed_nonce, magic_number),
-            );
-            execute_and_wait_transactions(
-                &full_node_client,
-                &mut diem_root_account,
-                vec![update_txn],
-            )
-            .await?;
-            expect_epoch(&full_node_jsonrpc_client, version, self.count + 1).await?;
-        }
-        let elapsed = timer.elapsed();
-        if let Some(job) = emit_job {
-            let stats = txn_emitter.stop_job(job).await;
-            context
-                .report
-                .report_txn_stats(self.to_string(), stats, elapsed, "");
-        } else {
-            context.report.report_text(format!(
-                "{} finished in {} seconds",
-                self.to_string(),
-                elapsed.as_secs()
-            ));
-        }
-
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        // allow each epoch to take 10 secs
-        Duration::from_secs(self.count as u64 * 10)
-    }
-}
-
-impl fmt::Display for Reconfiguration {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "Reconfiguration: total epoch: {}", self.count)
-    }
-}
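The epoch bookkeeping in the loop above is easy to get wrong, so it is worth spelling out: the remove/add pair consumes epochs 2 and 3, and each loop iteration then emits an upgrade and a downgrade transaction, landing on epochs (i + 1) * 2 and (i + 1) * 2 + 1. A small standalone check of that arithmetic:

// Every consensus-config update forces a reconfiguration, i.e. one epoch bump.
fn expected_epochs(count: u64) -> Vec<u64> {
    let mut epochs = vec![2, 3]; // remove_validator, then add_validator
    for i in 1..count / 2 {
        epochs.push((i + 1) * 2); // after the upgrade txn
        epochs.push((i + 1) * 2 + 1); // after the downgrade txn
    }
    epochs
}

fn main() {
    // With count = 7 the experiment expects epochs 2..=7 before the final
    // DiemVersion bump takes it to count + 1 = 8.
    assert_eq!(expected_epochs(7), vec![2, 3, 4, 5, 6, 7]);
}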
diff --git a/testsuite/cluster-test/src/experiments/recovery_time.rs b/testsuite/cluster-test/src/experiments/recovery_time.rs
deleted file mode 100644
index b8a0eeaa83581..0000000000000
--- a/testsuite/cluster-test/src/experiments/recovery_time.rs
+++ /dev/null
@@ -1,119 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use std::{collections::HashSet, fmt, time::Duration};
-
-use diem_sdk::transaction_builder::TransactionFactory;
-use forge::TxnEmitter;
-use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng};
-use structopt::StructOpt;
-use tokio::time;
-
-use crate::{
-    cluster::Cluster,
-    experiments::{Context, Experiment, ExperimentParam},
-    instance::Instance,
-};
-use async_trait::async_trait;
-use diem_logger::info;
-use std::time::Instant;
-
-#[derive(StructOpt, Debug)]
-pub struct RecoveryTimeParams {
-    #[structopt(
-        long,
-        default_value = "100",
-        help = "Number of accounts to mint before starting the experiment"
-    )]
-    pub num_accounts_to_mint: u64,
-}
-
-pub struct RecoveryTime {
-    params: RecoveryTimeParams,
-    instance: Instance,
-}
-
-impl ExperimentParam for RecoveryTimeParams {
-    type E = RecoveryTime;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        let instance = cluster.random_validator_instance();
-        Self::E {
-            params: self,
-            instance,
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for RecoveryTime {
-    fn affected_validators(&self) -> HashSet<String> {
-        let mut result = HashSet::new();
-        result.insert(self.instance.peer_name().clone());
-        result
-    }
-
-    async fn run(&mut self, context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-
-        txn_emitter
-            .mint_accounts(
-                &crate::util::emit_job_request_for_instances(
-                    context.cluster.validator_instances().to_vec(),
-                    context.global_emit_job_request,
-                    0,
-                    0,
-                ),
-                self.params.num_accounts_to_mint as usize,
-            )
-            .await?;
-        info!("Stopping {}", self.instance);
-        self.instance.stop().await?;
-        info!("Deleting db and restarting node for {}", self.instance);
-        self.instance.clean_data().await?;
-        self.instance.start().await?;
-        info!("Waiting for instance to be up: {}", self.instance);
-        self.instance
-            .wait_server_ready(Instant::now() + Duration::from_secs(120))
-            .await?;
-        let start_instant = Instant::now();
-        info!(
-            "Instance {} is up. Waiting for it to start committing.",
-            self.instance
-        );
-        while self
-            .instance
-            .counter("diem_consensus_last_committed_round")
-            .is_err()
-        {
-            time::sleep(Duration::from_secs(1)).await;
-        }
-        let time_to_recover = start_instant.elapsed();
-        let recovery_rate =
-            self.params.num_accounts_to_mint as f64 / time_to_recover.as_secs() as f64;
-        let result = format!("Recovery rate : {:.1} txn/sec", recovery_rate);
-        info!("{}", result);
-        context.report.report_text(result);
-        context
-            .report
-            .report_metric(self, "recovery_rate", recovery_rate);
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(20 * 60)
-    }
-}
-
-impl fmt::Display for RecoveryTime {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "RecoveryTime")
-    }
-}
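The recovery metric above is accounts minted divided by the seconds from restart to the first committed round. One detail worth noting: the division uses whole seconds, so a node that recovers in under a second would divide by zero. A sketch of the measurement with a float-seconds guard; the sleep stands in for the counter-polling loop and is not part of the removed code:

use std::time::{Duration, Instant};

fn main() {
    let num_accounts = 100u64;
    let started = Instant::now();
    // Stand-in for polling counter("diem_consensus_last_committed_round")
    // once per second until the restarted node reports it.
    std::thread::sleep(Duration::from_millis(250));
    let elapsed = started.elapsed().as_secs_f64().max(f64::EPSILON);
    println!("Recovery rate : {:.1} txn/sec", num_accounts as f64 / elapsed);
}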
diff --git a/testsuite/cluster-test/src/experiments/state_sync_performance.rs b/testsuite/cluster-test/src/experiments/state_sync_performance.rs
deleted file mode 100644
index cb11d660f5eb5..0000000000000
--- a/testsuite/cluster-test/src/experiments/state_sync_performance.rs
+++ /dev/null
@@ -1,185 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use std::{collections::HashSet, fmt, time::Duration};
-
-use diem_sdk::transaction_builder::TransactionFactory;
-use forge::TxnEmitter;
-use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng};
-use structopt::StructOpt;
-use tokio::time;
-
-use crate::{
-    cluster::Cluster,
-    experiments::{Context, Experiment, ExperimentParam},
-    instance::Instance,
-};
-use async_trait::async_trait;
-use diem_logger::info;
-use std::time::Instant;
-
-const EXPERIMENT_DURATION_TIMEOUT_SECS: u64 = 1000;
-const STATE_SYNC_COMMITTED_COUNTER_NAME: &str = "diem_state_sync_version.synced";
-
-#[derive(StructOpt, Debug)]
-pub struct StateSyncPerformanceParams {
-    emit_transactions_duration_secs: u64,
-}
-
-impl StateSyncPerformanceParams {
-    pub fn new(emit_transactions_duration_secs: u64) -> Self {
-        Self {
-            emit_transactions_duration_secs,
-        }
-    }
-}
-
-pub struct StateSyncPerformance {
-    params: StateSyncPerformanceParams,
-    fullnode_instance: Instance,
-    validator_instance: Instance,
-}
-
-impl ExperimentParam for StateSyncPerformanceParams {
-    type E = StateSyncPerformance;
-
-    fn build(self, cluster: &Cluster) -> Self::E {
-        let validator_instance = cluster.random_validator_instance();
-        let fullnode_instance = cluster.random_fullnode_instance();
-        Self::E {
-            params: self,
-            fullnode_instance,
-            validator_instance,
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for StateSyncPerformance {
-    fn affected_validators(&self) -> HashSet<String> {
-        let mut result = HashSet::new();
-        result.insert(self.validator_instance.peer_name().clone());
-        result
-    }
-    async fn run(&mut self, context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-
-        // Stop the fullnode and clear all data so that it falls behind
-        info!("Stopping the fullnode: {}", self.fullnode_instance);
-        self.fullnode_instance.stop().await?;
-        self.fullnode_instance.clean_data().await?;
-
-        // Execute and commit transactions on the validators for the specified duration
-        let emit_transactions_duration_secs = self.params.emit_transactions_duration_secs;
-        info!(
-            "Executing transactions for {} seconds",
-            emit_transactions_duration_secs
-        );
-        let emit_job_request = crate::util::emit_job_request_for_instances(
-            context.cluster.validator_instances().to_vec(),
-            context.global_emit_job_request,
-            0,
-            0,
-        );
-        let _ = txn_emitter
-            .emit_txn_for(
-                Duration::from_secs(emit_transactions_duration_secs),
-                emit_job_request,
-            )
-            .await?;
-
-        // Read the validator synced version
-        let validator_synced_version = self.read_validator_synced_version();
-        if validator_synced_version == 0.0 {
-            return Err(anyhow::format_err!(
-                "Validator synced zero transactions! Something has gone wrong!"
-            ));
-        }
-        info!(
-            "The validator is now synced at version: {}",
-            validator_synced_version
-        );
-
-        // Restart the fullnode so that it starts state syncing to catch up
-        info!(
-            "Waiting for the fullnode to wake up: {}",
-            self.fullnode_instance
-        );
-        self.fullnode_instance.start().await?;
-        self.fullnode_instance
-            .wait_server_ready(Instant::now() + Duration::from_secs(120))
-            .await?;
-
-        // Wait for the fullnode to catch up to the expected version
-        info!(
-            "The fullnode is now up. Waiting for it to state sync to the expected version: {}",
-            validator_synced_version
-        );
-        let start_instant = Instant::now();
-        while self.read_fullnode_synced_version() < validator_synced_version {
-            time::sleep(Duration::from_secs(1)).await;
-        }
-        info!(
-            "The fullnode has caught up to version: {}",
-            validator_synced_version
-        );
-
-        // Calculate the state sync throughput
-        let time_to_state_sync = start_instant.elapsed().as_secs();
-        if time_to_state_sync == 0 {
-            return Err(anyhow::format_err!(
-                "The time taken to state sync was 0 seconds! Something has gone wrong!"
-            ));
-        }
-        let state_sync_throughput = validator_synced_version as u64 / time_to_state_sync;
-        let state_sync_throughput_message =
-            format!("State sync throughput : {} txn/sec", state_sync_throughput);
-        info!("Time to state sync {:?}", time_to_state_sync);
-
-        // Display the state sync throughput and report the results
-        info!("{}", state_sync_throughput_message);
-        context.report.report_text(state_sync_throughput_message);
-        context
-            .report
-            .report_metric(self, "state_sync_throughput", state_sync_throughput as f64);
-
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(EXPERIMENT_DURATION_TIMEOUT_SECS)
-    }
-}
-
-impl StateSyncPerformance {
-    fn read_fullnode_synced_version(&self) -> f64 {
-        Self::read_synced_counter(&self.fullnode_instance)
-    }
-
-    fn read_validator_synced_version(&self) -> f64 {
-        Self::read_synced_counter(&self.validator_instance)
-    }
-
-    // Reads the state sync "synced counter" for the given instance. If no
-    // counter is found, returns zero.
-    fn read_synced_counter(instance: &Instance) -> f64 {
-        instance
-            .counter(STATE_SYNC_COMMITTED_COUNTER_NAME)
-            .unwrap_or(0.0)
-    }
-}
-
-impl fmt::Display for StateSyncPerformance {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "StateSyncPerformance")
-    }
-}
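The throughput number reported above is plain division with an explicit zero-duration guard, since a fast fullnode could finish catching up between two one-second polls. The same calculation in isolation, under the assumption that anyhow is available:

fn state_sync_throughput(synced_version: f64, elapsed_secs: u64) -> anyhow::Result<u64> {
    // Mirror the experiment's guard: integer seconds can legitimately be zero.
    if elapsed_secs == 0 {
        anyhow::bail!("The time taken to state sync was 0 seconds!");
    }
    Ok(synced_version as u64 / elapsed_secs)
}

fn main() -> anyhow::Result<()> {
    // 120k versions replayed in 60 seconds comes out to 2000 txn/sec.
    assert_eq!(state_sync_throughput(120_000.0, 60)?, 2_000);
    Ok(())
}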
diff --git a/testsuite/cluster-test/src/experiments/twin_validator.rs b/testsuite/cluster-test/src/experiments/twin_validator.rs
deleted file mode 100644
index 6cc9134cadcc6..0000000000000
--- a/testsuite/cluster-test/src/experiments/twin_validator.rs
+++ /dev/null
@@ -1,166 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use crate::{
-    cluster::Cluster,
-    experiments::{Context, Experiment, ExperimentParam},
-    instance,
-    instance::Instance,
-};
-use async_trait::async_trait;
-use diem_infallible::duration_since_epoch;
-use diem_logger::info;
-use diem_sdk::transaction_builder::TransactionFactory;
-use forge::TxnEmitter;
-use futures::future::try_join_all;
-use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng};
-use std::{
-    collections::HashSet,
-    fmt,
-    time::{Duration, Instant},
-};
-use structopt::StructOpt;
-use tokio::time;
-
-#[derive(StructOpt, Debug)]
-pub struct TwinValidatorsParams {
-    #[structopt(long, default_value = "1", help = "Set twin node pair number")]
-    pub pair: usize,
-}
-
-pub struct TwinValidators {
-    instances: Vec<Instance>,
-    twin_validators: Vec<Instance>,
-}
-
-impl ExperimentParam for TwinValidatorsParams {
-    type E = TwinValidators;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        if self.pair >= cluster.validator_instances().len() {
-            panic!(
-                "pair count {} must be less than the validator count {}",
-                self.pair,
-                cluster.validator_instances().len()
-            );
-        }
-        let mut instances = cluster.validator_instances().to_vec();
-        let mut twin_validators = vec![];
-        let mut rnd = rand::thread_rng();
-        for _i in 0..self.pair {
-            twin_validators.push(instances.remove(rnd.gen_range(1..instances.len())));
-        }
-        Self::E {
-            instances,
-            twin_validators,
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for TwinValidators {
-    fn affected_validators(&self) -> HashSet<String> {
-        instance::instancelist_to_set(&self.twin_validators)
-    }
-
-    async fn run(&mut self, context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-        let buffer = Duration::from_secs(60);
-        let window = Duration::from_secs(240);
-        let mut new_instances = vec![];
-        let mut origin_instances = vec![];
-        for inst in self.twin_validators.iter() {
-            info!("Stopping origin validator {}", inst);
-            inst.stop().await?;
-            let mut new_twin_config = inst.instance_config().clone();
-            new_twin_config.make_twin(1);
-            info!(
-                "Deleting db and starting twin node {} for {}",
-                new_twin_config.pod_name(),
-                inst
-            );
-            context
-                .cluster_swarm
-                .clean_data(
-                    &context
-                        .cluster_swarm
-                        .get_node_name(&new_twin_config.pod_name())
-                        .await?,
-                )
-                .await?;
-            let new_inst = context
-                .cluster_swarm
-                .spawn_new_instance(new_twin_config)
-                .await?;
-            info!("Waiting for twin node to be up: {}", new_inst);
-            new_inst
-                .wait_server_ready(Instant::now() + Duration::from_secs(120))
-                .await?;
-            info!("Twin node {} is up", new_inst);
-            info!("Restarting origin validator {}", inst);
-            inst.start().await?;
-            origin_instances.push(inst.clone());
-            new_instances.push(new_inst.clone());
-        }
-        let instances = self.instances.clone();
-        let emit_job_request = crate::util::emit_job_request_for_instances(
-            instances,
-            context.global_emit_job_request,
-            0,
-            0,
-        );
-        info!("Starting txn generation");
-        let stats = txn_emitter.emit_txn_for(window, emit_job_request).await?;
-        let end = duration_since_epoch() - buffer;
-        let start = end - window + 2 * buffer;
-        info!(
-            "Link to dashboard : {}",
-            context.prometheus.link_to_dashboard(start, end)
-        );
-        info!("Stopping origin validators");
-        let futures: Vec<_> = origin_instances.iter().map(|ic| ic.stop()).collect();
-        try_join_all(futures).await?;
-        time::sleep(Duration::from_secs(10)).await;
-        info!("Stopping twin validators");
-        let futures: Vec<_> = new_instances.iter().map(|ic| ic.stop()).collect();
-        try_join_all(futures).await?;
-        time::sleep(Duration::from_secs(10)).await;
-        info!("Restarting origin validators");
-        let futures: Vec<_> = origin_instances.iter().map(|ic| ic.start()).collect();
-        try_join_all(futures).await?;
-        time::sleep(Duration::from_secs(10)).await;
-
-        for inst in origin_instances.iter() {
-            info!("Waiting for origin node to be up: {}", inst);
-            inst.wait_server_ready(Instant::now() + Duration::from_secs(120))
-                .await?;
-            info!("Origin node {} is up", inst);
-        }
-
-        context
-            .report
-            .report_txn_stats(self.to_string(), stats, window, "");
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(20 * 60)
-    }
-}
-
-impl fmt::Display for TwinValidators {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "Twin validator [")?;
-        for instance in self.twin_validators.iter() {
-            write!(f, "{}, ", instance.instance_config().pod_name())?;
-        }
-        write!(f, "]")
-    }
-}
diff --git a/testsuite/cluster-test/src/experiments/versioning_test.rs b/testsuite/cluster-test/src/experiments/versioning_test.rs
deleted file mode 100644
index 409c44f95bfd3..0000000000000
--- a/testsuite/cluster-test/src/experiments/versioning_test.rs
+++ /dev/null
@@ -1,227 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use crate::{
-    cluster::Cluster,
-    experiments::{
-        compatibility_test::update_batch_instance, Context, Experiment, ExperimentParam,
-    },
-    instance,
-    instance::Instance,
-};
-use anyhow::format_err;
-use async_trait::async_trait;
-use diem_logger::prelude::*;
-use diem_sdk::{transaction_builder::TransactionFactory, types::LocalAccount};
-use diem_transaction_builder::stdlib::encode_update_diem_version_script;
-use diem_types::{chain_id::ChainId, transaction::TransactionPayload};
-use forge::{execute_and_wait_transactions, TxnEmitter};
-use language_e2e_tests::common_transactions::multi_agent_p2p_script_function;
-use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng};
-use std::{collections::HashSet, fmt, time::Duration};
-use structopt::StructOpt;
-
-#[derive(StructOpt, Debug)]
-pub struct ValidatorVersioningParams {
-    #[structopt(
-        long,
-        default_value = "15",
-        help = "Number of nodes to update in the first batch"
-    )]
-    pub count: usize,
-    #[structopt(long, help = "Image tag of newer validator software")]
-    pub updated_image_tag: String,
-}
-
-pub struct ValidatorVersioning {
-    first_batch: Vec<Instance>,
-    first_batch_lsr: Vec<Instance>,
-    second_batch: Vec<Instance>,
-    second_batch_lsr: Vec<Instance>,
-    _full_nodes: Vec<Instance>,
-    updated_image_tag: String,
-}
-
-impl ExperimentParam for ValidatorVersioningParams {
-    type E = ValidatorVersioning;
-    fn build(self, cluster: &Cluster) -> Self::E {
-        if self.count > cluster.validator_instances().len() {
-            panic!(
-                "Cannot reboot {} validators in a cluster with {} instances",
-                self.count,
-                cluster.validator_instances().len()
-            );
-        }
-        let (first_batch, second_batch) = cluster.split_n_validators_random(self.count);
-        let first_batch = first_batch.into_validator_instances();
-        let second_batch = second_batch.into_validator_instances();
-        let mut first_batch_lsr = vec![];
-        let mut second_batch_lsr = vec![];
-        if !cluster.lsr_instances().is_empty() {
-            first_batch_lsr = cluster.lsr_instances_for_validators(&first_batch);
-            second_batch_lsr = cluster.lsr_instances_for_validators(&second_batch);
-        }
-
-        Self::E {
-            first_batch,
-            first_batch_lsr,
-            second_batch,
-            second_batch_lsr,
-            _full_nodes: cluster.fullnode_instances().to_vec(),
-            updated_image_tag: self.updated_image_tag,
-        }
-    }
-}
-
-#[async_trait]
-impl Experiment for ValidatorVersioning {
-    fn affected_validators(&self) -> HashSet<String> {
-        instance::instancelist_to_set(&self.first_batch)
-            .union(&instance::instancelist_to_set(&self.second_batch))
-            .cloned()
-            .collect()
-    }
-
-    async fn run(&mut self, context: &mut Context<'_>) -> anyhow::Result<()> {
-        let mut txn_emitter = TxnEmitter::new(
-            &mut context.treasury_compliance_account,
-            &mut context.designated_dealer_account,
-            context.cluster.random_validator_instance().rest_client(),
-            TransactionFactory::new(context.cluster.chain_id),
-            StdRng::from_seed(OsRng.gen()),
-        );
-
-        // Mint a number of accounts
-        txn_emitter
-            .mint_accounts(
-                &crate::util::emit_job_request_for_instances(
-                    context.cluster.validator_instances().to_vec(),
-                    context.global_emit_job_request,
-                    0,
-                    0,
-                ),
-                150,
-            )
-            .await?;
-        let mut account = txn_emitter.take_account();
-        let secondary_signer_account = txn_emitter.take_account();
-
-        // Define the transaction generator
-        //
-        // TODO: In the future we may want to pass this functor as an argument to the experiment
-        // to make versioning test extensible.
-        // Define a multi-agent p2p transaction.
-        let txn_payload = multi_agent_p2p_script_function(10);
-
-        let tx_factory =
-            TransactionFactory::new(ChainId::test()).with_transaction_expiration_time(420);
-        let txn_gen = |account: &mut LocalAccount, secondary_signer_account: &LocalAccount| {
-            account.sign_multi_agent_with_transaction_builder(
-                vec![secondary_signer_account],
-                tx_factory.payload(txn_payload.clone()),
-            )
-        };
-
-        // grab a validator node
-        let old_validator_node = context.cluster.random_validator_instance();
-        let old_client = old_validator_node.rest_client();
-
-        info!("1. Send a transaction using the new feature to a validator node");
-        let txn1 = txn_gen(&mut account, &secondary_signer_account);
-        if execute_and_wait_transactions(&old_client, &mut account, vec![txn1])
-            .await
-            .is_ok()
-        {
-            return Err(format_err!(
-                "The transaction should be rejected as the new feature is not yet recognized \
-                 by any of the validator nodes"
-            ));
-        };
-        info!("-- [Expected] The transaction is rejected by the validator node");
-
-        info!("2. Update the first batch of validator nodes");
-        update_batch_instance(
-            context.cluster_swarm,
-            &self.first_batch,
-            &self.first_batch_lsr,
-            self.updated_image_tag.clone(),
-        )
-        .await?;
-
-        // choose an updated validator
-        let new_validator_node = self
-            .first_batch
-            .get(0)
-            .expect("getting an updated validator instance requires a non-empty list");
-        let new_client = new_validator_node.rest_client();
-        info!("3. Send the transaction using the new feature to an updated validator node");
-        let txn3 = txn_gen(&mut account, &secondary_signer_account);
-        if execute_and_wait_transactions(&new_client, &mut account, vec![txn3])
-            .await
-            .is_ok()
-        {
-            return Err(format_err!(
-                "The transaction should be rejected as the feature is under gating",
-            ));
-        }
-        info!("-- The transaction is rejected as expected");
-
-        info!("4. Update the rest of the validator nodes");
-        update_batch_instance(
-            context.cluster_swarm,
-            &self.second_batch,
-            &self.second_batch_lsr,
-            self.updated_image_tag.clone(),
-        )
-        .await?;
-
-        info!("5. Send the transaction using the new feature to an updated validator node again");
-        let txn4 = txn_gen(&mut account, &secondary_signer_account);
-        if execute_and_wait_transactions(&new_client, &mut account, vec![txn4])
-            .await
-            .is_ok()
-        {
-            return Err(format_err!(
-                "The transaction should be rejected as the feature is still gated",
-            ));
-        }
-        info!("-- The transaction is still rejected as expected, because the new feature is gated");
-
-        info!("6. Activate the new feature multi agent");
-        let mut diem_root_account = &mut context.root_account;
-        let allowed_nonce = 0;
-        let update_txn = diem_root_account.sign_with_transaction_builder(tx_factory.payload(
-            TransactionPayload::Script(encode_update_diem_version_script(allowed_nonce, 3)),
-        ));
-        execute_and_wait_transactions(&new_client, &mut diem_root_account, vec![update_txn])
-            .await?;
-
-        info!("7. Send the transaction using the new feature after Diem version update");
-        let txn5 = txn_gen(&mut account, &secondary_signer_account);
-        execute_and_wait_transactions(&new_client, &mut account, vec![txn5]).await?;
-        info!("-- [Expected] The transaction goes through");
-
-        Ok(())
-    }
-
-    fn deadline(&self) -> Duration {
-        Duration::from_secs(15 * 60)
-    }
-}
-
-impl fmt::Display for ValidatorVersioning {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "Updating [")?;
-        for instance in self.first_batch.iter() {
-            write!(f, "{}, ", instance)?;
-        }
-        for instance in self.second_batch.iter() {
-            write!(f, "{}, ", instance)?;
-        }
-        write!(f, "]")?;
-        writeln!(f, "Updated Config: {:?}", self.updated_image_tag)
-    }
-}
diff --git a/testsuite/cluster-test/src/genesis_helper.rs b/testsuite/cluster-test/src/genesis_helper.rs
deleted file mode 100644
index 84344bfc98465..0000000000000
--- a/testsuite/cluster-test/src/genesis_helper.rs
+++ /dev/null
@@ -1,396 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-use diem_crypto::ed25519::Ed25519PublicKey;
-use diem_genesis_tool::{command::Command, layout::Layout};
-use diem_management::{error::Error, secure_backend::DISK};
-use diem_operational_tool::command::Command as OperationalCommand;
-use diem_types::{
-    chain_id::ChainId, network_address::NetworkAddress, transaction::Transaction,
-    waypoint::Waypoint,
-};
-use std::path::Path;
-use structopt::StructOpt;
-use tokio::task::spawn_blocking;
-
-pub struct GenesisHelper {
-    path: &'static str,
-}
-
-impl GenesisHelper {
-    pub fn new(path: &'static str) -> Self {
-        GenesisHelper { path }
-    }
-
-    pub async fn set_layout(&self, path: &str, namespace: &str) -> Result<Layout, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                set-layout
-                --path {path}
-                --shared-backend backend={backend};\
-                path={storage_path};\
-                namespace={ns}
-            ",
-            path = path,
-            backend = DISK,
-            storage_path = self.path,
-            ns = namespace,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.set_layout())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-
-    pub async fn set_move_modules(
-        &self,
-        dir: &str,
-        namespace: &str,
-    ) -> Result<Vec<Vec<u8>>, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                set-move-modules
-                --dir {dir}
-                --shared-backend backend={backend};\
-                path={storage_path};\
-                namespace={ns}
-            ",
-            dir = dir,
-            backend = DISK,
-            storage_path = self.path,
-            ns = namespace,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.set_move_modules())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-
-    pub async fn diem_root_key(
-        &self,
-        validator_backend: &str,
-        server: &str,
-        token_path: &str,
-        validator_ns: &str,
-        shared_ns: &str,
-    ) -> Result<Ed25519PublicKey, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                diem-root-key
-                --validator-backend backend={validator_backend};\
-                server={server};\
-                token={token_path};\
-                namespace={validator_ns}
-                --shared-backend backend={backend};\
-                path={path};\
-                namespace={shared_ns}
-            ",
-            backend = DISK,
-            validator_backend = validator_backend,
-            server = server,
-            token_path = token_path,
-            path = self.path,
-            validator_ns = validator_ns,
-            shared_ns = shared_ns,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.diem_root_key())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-
-    pub async fn owner_key(
-        &self,
-        validator_backend: &str,
-        server: &str,
-        token_path: &str,
-        validator_ns: &str,
-        shared_ns: &str,
-    ) -> Result<Ed25519PublicKey, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                owner-key
-                --validator-backend backend={validator_backend};\
-                server={server};\
-                token={token_path};\
-                namespace={validator_ns}
-                --shared-backend backend={backend};\
-                path={path};\
-                namespace={shared_ns}
-            ",
-            backend = DISK,
-            validator_backend = validator_backend,
-            server = server,
-            token_path = token_path,
-            path = self.path,
-            validator_ns = validator_ns,
-            shared_ns = shared_ns,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.owner_key())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-
-    pub async fn operator_key(
-        &self,
-        validator_backend: &str,
-        server: &str,
-        token_path: &str,
-        validator_ns: &str,
-        shared_ns: &str,
-    ) -> Result<Ed25519PublicKey, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                operator-key
-                --validator-backend backend={validator_backend};\
-                server={server};\
-                token={token_path};\
-                namespace={validator_ns}
-                --shared-backend backend={backend};\
-                path={path};\
-                namespace={shared_ns}
-            ",
-            backend = DISK,
-            validator_backend = validator_backend,
-            server = server,
-            token_path = token_path,
-            path = self.path,
-            validator_ns = validator_ns,
-            shared_ns = shared_ns,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.operator_key())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-
-    pub async fn treasury_compliance_key(
-        &self,
-        validator_backend: &str,
-        server: &str,
-        token_path: &str,
-        validator_ns: &str,
-        shared_ns: &str,
-    ) -> Result<Ed25519PublicKey, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                treasury-compliance-key
-                --validator-backend backend={validator_backend};\
-                server={server};\
-                token={token_path};\
-                namespace={validator_ns}
-                --shared-backend backend={backend};\
-                path={path};\
-                namespace={shared_ns}
-            ",
-            backend = DISK,
-            validator_backend = validator_backend,
-            server = server,
-            token_path = token_path,
-            path = self.path,
-            validator_ns = validator_ns,
-            shared_ns = shared_ns,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.treasury_compliance_key())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-    pub async fn validator_config(
-        &self,
-        owner_name: &str,
-        validator_address: NetworkAddress,
-        fullnode_address: NetworkAddress,
-        chain_id: ChainId,
-        validator_backend: &str,
-        server: &str,
-        token_path: &str,
-        validator_ns: &str,
-        shared_ns: &str,
-    ) -> Result<Transaction, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                validator-config
-                --owner-name {owner_name}
-                --validator-address {validator_address}
-                --fullnode-address {fullnode_address}
-                --chain-id {chain_id}
-                --validator-backend backend={validator_backend};\
-                server={server};\
-                token={token_path};\
-                namespace={validator_ns}
-                --shared-backend backend={backend};\
-                path={path};\
-                namespace={shared_ns}
-            ",
-            owner_name = owner_name,
-            validator_address = validator_address,
-            fullnode_address = fullnode_address,
-            chain_id = chain_id.id(),
-            validator_backend = validator_backend,
-            server = server,
-            token_path = token_path,
-            backend = DISK,
-            path = self.path,
-            validator_ns = validator_ns,
-            shared_ns = shared_ns,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.validator_config())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-
-    pub async fn set_operator(
-        &self,
-        operator_name: &str,
-        shared_ns: &str,
-    ) -> Result<String, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                set-operator
-                --operator-name {operator_name}
-                --shared-backend backend={backend};\
-                path={path};\
-                namespace={shared_ns}
-            ",
-            operator_name = operator_name,
-            backend = DISK,
-            path = self.path,
-            shared_ns = shared_ns,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.set_operator())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-
-    pub async fn genesis(
-        &self,
-        chain_id: ChainId,
-        genesis_path: &Path,
-    ) -> Result<Transaction, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                genesis
-                --chain-id {chain_id}
-                --shared-backend backend={backend};\
-                path={path}
-                --path {genesis_path}
-            ",
-            chain_id = chain_id,
-            backend = DISK,
-            path = self.path,
-            genesis_path = genesis_path.to_str().expect("Unable to parse genesis_path"),
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.genesis())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-
-    pub async fn create_and_insert_waypoint(
-        &self,
-        chain_id: ChainId,
-        validator_backend: &str,
-        server: &str,
-        token_path: &str,
-        validator_ns: &str,
-    ) -> Result<Waypoint, Error> {
-        let waypoint = self.create_waypoint(chain_id).await?;
-
-        let args = format!(
-            "
-                diem-genesis-tool
-                insert-waypoint
-                --validator-backend backend={validator_backend};\
-                server={server};\
-                token={token_path};\
-                namespace={validator_ns}
-                --waypoint {waypoint}
-                --set-genesis
-            ",
-            validator_backend = validator_backend,
-            server = server,
-            token_path = token_path,
-            validator_ns = validator_ns,
-            waypoint = waypoint,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.insert_waypoint())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-            .map(|_| waypoint)
-    }
-
-    pub async fn create_waypoint(&self, chain_id: ChainId) -> Result<Waypoint, Error> {
-        let args = format!(
-            "
-                diem-genesis-tool
-                create-waypoint
-                --chain-id {chain_id}
-                --shared-backend backend={backend};\
-                path={path}\
-            ",
-            chain_id = chain_id,
-            backend = DISK,
-            path = self.path,
-        );
-
-        let command = Command::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.create_waypoint())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-    pub async fn extract_private_key(
-        &self,
-        key_name: &str,
-        key_file: &str,
-        validator_backend: &str,
-        server: &str,
-        token_path: &str,
-    ) -> Result<(), Error> {
-        let args = format!(
-            "
-                diem-operational-tool
-                extract-private-key
-                --key-name {key_name}
-                --key-file {key_file}
-                --validator-backend backend={validator_backend};\
-                server={server};\
-                token={token_path}\
-            ",
-            key_name = key_name,
-            key_file = key_file,
-            validator_backend = validator_backend,
-            server = server,
-            token_path = token_path,
-        );
-
-        let command = OperationalCommand::from_iter(args.split_whitespace());
-        spawn_blocking(|| command.extract_private_key())
-            .await
-            .expect("tokio spawn_blocking runtime error")
-    }
-}
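Every helper in this file drives the genesis and operational tools the same way: render one flat string, split it on whitespace, and hand the pieces to structopt's from_iter. The pattern silently breaks on values containing spaces, which is why the backend options are joined with ';' and backslash continuations rather than spaces. A minimal round-trip of that idiom; the Opts struct and values are illustrative, not taken from the tools above:

use structopt::StructOpt;

#[derive(StructOpt, Debug)]
struct Opts {
    #[structopt(long)]
    chain_id: u8,
    #[structopt(long)]
    shared_backend: String,
}

fn main() {
    let args = format!(
        "tool --chain-id {id} --shared-backend backend=disk;path={path}",
        id = 4,
        path = "/tmp/genesis"
    );
    // split_whitespace() is what makes embedded spaces unrepresentable.
    let opts = Opts::from_iter(args.split_whitespace());
    assert_eq!(opts.shared_backend, "backend=disk;path=/tmp/genesis");
}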
diff --git a/testsuite/cluster-test/src/health/commit_check.rs b/testsuite/cluster-test/src/health/commit_check.rs
deleted file mode 100644
index f4df4bccbaca0..0000000000000
--- a/testsuite/cluster-test/src/health/commit_check.rs
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use crate::health::{Event, HealthCheck, HealthCheckContext, ValidatorEvent};
-use async_trait::async_trait;
-use std::collections::{hash_map::Entry, HashMap, HashSet};
-
-type EpochAndRound = (u64, u64);
-
-/// Verifies that the commit history produced by validators is 'linearizable':
-/// validators can be behind each other, but the commits they produce
-/// must not contradict each other
-#[derive(Default)]
-pub struct CommitHistoryHealthCheck {
-    epoch_round_to_commit: HashMap<EpochAndRound, CommitAndValidators>,
-    latest_committed_epoch_round: HashMap<String, EpochAndRound>,
-}
-
-struct CommitAndValidators {
-    pub hash: String,
-    pub validators: HashSet<String>,
-}
-
-impl CommitHistoryHealthCheck {
-    pub fn new() -> Self {
-        Default::default()
-    }
-}
-
-#[async_trait]
-impl HealthCheck for CommitHistoryHealthCheck {
-    fn on_event(&mut self, ve: &ValidatorEvent, ctx: &mut HealthCheckContext) {
-        let commit = if let Event::Commit(ref commit) = ve.event {
-            commit
-        } else {
-            return;
-        };
-        let round_to_commit = self.epoch_round_to_commit.entry(commit.epoch_and_round());
-        match round_to_commit {
-            Entry::Occupied(mut oe) => {
-                let commit_and_validators = oe.get_mut();
-                if commit_and_validators.hash != commit.commit {
-                    ctx.report_failure(
-                        ve.validator.clone(),
-                        format!(
-                            "produced contradicting commit {} at epoch_round {:?}, expected: {}",
-                            commit.commit,
-                            commit.epoch_and_round(),
-                            commit_and_validators.hash
-                        ),
-                    );
-                } else {
-                    commit_and_validators
-                        .validators
-                        .insert(ve.validator.clone());
-                }
-            }
-            Entry::Vacant(va) => {
-                let mut validators = HashSet::new();
-                validators.insert(ve.validator.clone());
-                va.insert(CommitAndValidators {
-                    hash: commit.commit.clone(),
-                    validators,
-                });
-            }
-        }
-        let latest_committed_round = self
-            .latest_committed_epoch_round
-            .entry(ve.validator.clone());
-        match latest_committed_round {
-            Entry::Occupied(mut oe) => {
-                let previous_epoch_and_round = *oe.get();
-                if previous_epoch_and_round > commit.epoch_and_round() {
-                    ctx.report_failure(
-                        ve.validator.clone(),
-                        format!(
-                            "committed epoch and round {:?} after committing {:?}",
-                            commit.epoch_and_round(),
-                            previous_epoch_and_round
-                        ),
-                    );
-                }
-                oe.insert(commit.epoch_and_round());
-            }
-            Entry::Vacant(va) => {
-                va.insert(commit.epoch_and_round());
-            }
-        }
-        if let Some(min_round) = self.latest_committed_epoch_round.values().min() {
-            self.epoch_round_to_commit.retain(|k, _v| *k >= *min_round);
-        }
-    }
-
-    async fn verify(&mut self, _ctx: &mut HealthCheckContext) {}
-
-    fn clear(&mut self) {
-        self.epoch_round_to_commit.clear();
-        self.latest_committed_epoch_round.clear();
-    }
-
-    fn name(&self) -> &'static str {
-        "commit_check"
-    }
-}
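The contradiction detection above boils down to a map from (epoch, round) to the first commit hash observed at those coordinates; any validator that later reports a different hash for the same slot has forked. A stripped-down sketch of just that core, with made-up validator names and hashes:

use std::collections::HashMap;

fn main() {
    let mut seen: HashMap<(u64, u64), String> = HashMap::new();
    let events = [
        ("val-0", (1, 7), "abc"),
        ("val-1", (1, 7), "abc"),
        ("val-2", (1, 7), "xyz"), // contradicts the hash recorded by val-0
    ];
    for (validator, slot, hash) in events {
        match seen.get(&slot) {
            Some(expected) if expected.as_str() != hash => println!(
                "{} produced contradicting commit {} at {:?}, expected {}",
                validator, hash, slot, expected
            ),
            Some(_) => {}
            None => {
                seen.insert(slot, hash.to_string());
            }
        }
    }
}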
diff --git a/testsuite/cluster-test/src/health/debug_interface_log_tail.rs b/testsuite/cluster-test/src/health/debug_interface_log_tail.rs
deleted file mode 100644
index 228ed98745e81..0000000000000
--- a/testsuite/cluster-test/src/health/debug_interface_log_tail.rs
+++ /dev/null
@@ -1,136 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use crate::{
-    cluster::Cluster,
-    health::{Commit, Event, LogTail, ValidatorEvent},
-    instance::Instance,
-};
-use debug_interface::AsyncNodeDebugClient;
-use diem_infallible::duration_since_epoch;
-use diem_logger::{json_log::JsonLogEntry as DebugInterfaceEvent, *};
-use serde_json::{self, value as json};
-use std::{
-    env,
-    sync::{
-        atomic::{AtomicI64, Ordering},
-        mpsc, Arc,
-    },
-    time::Duration,
-};
-use tokio::{runtime::Handle, time};
-
-pub struct DebugPortLogWorker {
-    instance: Instance,
-    client: AsyncNodeDebugClient,
-    event_sender: mpsc::Sender<ValidatorEvent>,
-    started_sender: Option<mpsc::Sender<()>>,
-    pending_messages: Arc<AtomicI64>,
-}
-
-impl DebugPortLogWorker {
-    pub fn spawn_new(cluster: &Cluster) -> LogTail {
-        let runtime = Handle::current();
-        let (event_sender, event_receiver) = mpsc::channel();
-        let mut started_receivers = vec![];
-        let pending_messages = Arc::new(AtomicI64::new(0));
-        for instance in cluster.validator_and_fullnode_instances() {
-            let (started_sender, started_receiver) = mpsc::channel();
-            started_receivers.push(started_receiver);
-            let client = instance.debug_interface_client();
-            let debug_port_log_worker = DebugPortLogWorker {
-                instance: instance.clone(),
-                client,
-                event_sender: event_sender.clone(),
-                started_sender: Some(started_sender),
-                pending_messages: pending_messages.clone(),
-            };
-            runtime.spawn(debug_port_log_worker.run());
-        }
-        for r in started_receivers {
-            if let Err(e) = r.recv() {
-                panic!("Failed to start one of debug port log threads: {:?}", e);
-            }
-        }
-        LogTail {
-            event_receiver,
-            pending_messages,
-        }
-    }
-}
-
-impl DebugPortLogWorker {
-    pub async fn run(mut self) {
-        let print_failures = env::var("VERBOSE").is_ok();
-        loop {
-            match self.client.get_events().await {
-                Err(e) => {
-                    if print_failures {
-                        info!("Failed to get events from {}: {:?}", self.instance, e);
-                    }
-                    time::sleep(Duration::from_secs(1)).await;
-                }
-                Ok(resp) => {
-                    let mut sent_events = 0i64;
-                    for event in resp {
-                        if let Some(e) = self.parse_event(event) {
-                            let _ignore = self.event_sender.send(e);
-                            sent_events += 1;
-                        }
-                    }
-                    self.pending_messages
-                        .fetch_add(sent_events, Ordering::Relaxed);
-                    time::sleep(Duration::from_millis(100)).await;
-                }
-            }
-            if let Some(started_sender) = self.started_sender.take() {
-                if let Err(e) = started_sender.send(()) {
-                    panic!("Failed to send to started_sender: {:?}", e);
-                }
-            }
-        }
-    }
-
-    fn parse_event(&self, event: DebugInterfaceEvent) -> Option<ValidatorEvent> {
-        let e = if event.name == "committed" {
-            Self::parse_commit(&event.json)
-        } else {
-            return None;
-        };
-        Some(ValidatorEvent {
-            validator: self.instance.peer_name().clone(),
-            timestamp: Duration::from_millis(event.timestamp as u64),
-            received_timestamp: duration_since_epoch(),
-            event: e,
-        })
-    }
-
-    fn parse_commit(json: &json::Value) -> Event {
-        Event::Commit(Commit {
-            commit: json
-                .get("block_id")
-                .expect("No block_id in commit event")
-                .as_str()
-                .expect("block_id is not string")
-                .to_string(),
-            epoch: json
-                .get("epoch")
-                .expect("No epoch in commit event")
-                .as_u64()
-                .expect("epoch is not u64"),
-            round: json
-                .get("round")
-                .expect("No round in commit event")
-                .as_u64()
-                .expect("round is not u64"),
-            parent: json
-                .get("parent_id")
-                .expect("No parent_id in commit event")
-                .as_str()
-                .expect("parent_id is not string")
-                .to_string(),
-        })
-    }
-}
diff --git a/testsuite/cluster-test/src/health/fullnode_check.rs b/testsuite/cluster-test/src/health/fullnode_check.rs
deleted file mode 100644
index 338277a4c7f2c..0000000000000
--- a/testsuite/cluster-test/src/health/fullnode_check.rs
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) The Diem Core Contributors
-// SPDX-License-Identifier: Apache-2.0
-
-#![forbid(unsafe_code)]
-
-use crate::{
-    cluster::Cluster,
-    health::{HealthCheck, HealthCheckContext},
-    instance::Instance,
-};
-use async_trait::async_trait;
-use futures::future::join_all;
-use once_cell::sync::Lazy;
-use std::{collections::HashMap, env};
-
-pub static THRESHOLD: Lazy<i64> = Lazy::new(|| {
-    if let Ok(v) = env::var("FULL_NODE_HEALTH_THRESHOLD") {
-        v.parse()
-            .expect("Failed to parse FULL_NODE_HEALTH_THRESHOLD")
-    } else {
-        15000_i64
-    }
-});
-
-pub struct FullNodeHealthCheck {
-    cluster: Cluster,
-}
-
-impl FullNodeHealthCheck {
-    pub fn new(cluster: Cluster) -> FullNodeHealthCheck {
-        Self { cluster }
-    }
-}
-
-async fn get_version(instance: &Instance) -> (&Instance, i64) {
-    let res = instance
-        .debug_interface_client()
-        .get_node_metric("diem_state_sync_version{type=committed}")
-        .await;
-    let content = match res {
-        Ok(res) => res.unwrap_or_default(),
-        _ => 0i64,
-    };
-    (instance, content)
-}
-
-#[async_trait]
-impl HealthCheck for FullNodeHealthCheck {
-    async fn verify(&mut self, ctx: &mut HealthCheckContext) {
-        let validators = self.cluster.validator_instances();
-        let fullnodes = self.cluster.fullnode_instances();
-
-        let futures = validators.iter().map(get_version);
-        let val_latest_versions = join_all(futures).await;
-        let val_latest_versions: HashMap<_, _> = val_latest_versions
-            .into_iter()
-            .map(|(instance, version)| (instance.validator_group().index, version))
-            .collect();
-
-        let futures = fullnodes.iter().map(get_version);
-        let fullnode_latest_versions = join_all(futures).await;
-
-        for (fullnode, fullnode_version) in fullnode_latest_versions {
-            let index = fullnode.validator_group().index;
-            let val_version = val_latest_versions.get(&index).unwrap();
-            if val_version - fullnode_version > *THRESHOLD {
-                ctx.report_failure(
-                    format!("val-{}", index),
-                    format!(
-                        "fullnode {} state sync committed version: {} is behind validator: {}",
-                        fullnode.peer_name(),
-                        fullnode_version,
-                        val_version,
-                    ),
-                );
-            }
-        }
-    }
-
-    fn name(&self) -> &'static str {
-        "fullnode_check"
-    }
-}
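THRESHOLD shows the once_cell idiom this crate uses for env-tunable knobs: parse the variable on first dereference, fall back to a default. The same pattern in isolation, assuming once_cell as a dependency:

use once_cell::sync::Lazy;
use std::env;

static THRESHOLD: Lazy<i64> = Lazy::new(|| {
    env::var("FULL_NODE_HEALTH_THRESHOLD")
        .ok()
        .map(|v| v.parse().expect("Failed to parse FULL_NODE_HEALTH_THRESHOLD"))
        .unwrap_or(15_000)
});

fn main() {
    // The closure runs exactly once; later reads reuse the cached value.
    println!("fullnode lag threshold: {}", *THRESHOLD);
}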
time::Duration}; - -pub struct LivenessHealthCheck { - last_committed: HashMap, -} - -const MAX_BEHIND: Duration = Duration::from_secs(120); - -#[derive(Default)] -struct LastCommitInfo { - ve: Option, - timestamp: Duration, -} - -impl LivenessHealthCheck { - pub fn new(cluster: &Cluster) -> Self { - let mut last_committed = HashMap::new(); - for instance in cluster.validator_instances() { - last_committed.insert(instance.peer_name().clone(), LastCommitInfo::default()); - } - Self { last_committed } - } -} - -#[async_trait] -impl HealthCheck for LivenessHealthCheck { - fn on_event(&mut self, ve: &ValidatorEvent, ctx: &mut HealthCheckContext) { - match ve.event { - Event::Commit(..) => { - if let Some(prev) = self.last_committed.get(&ve.validator) { - if prev.timestamp > ve.timestamp { - return; - } - } - self.last_committed.insert( - ve.validator.clone(), - LastCommitInfo { - ve: Some(ve.clone()), - timestamp: ve.timestamp, - }, - ); - } - Event::ConsensusStarted => { - ctx.report_failure(ve.validator.clone(), "validator restarted".into()); - } - } - } - - async fn verify(&mut self, ctx: &mut HealthCheckContext) { - let min_timestamp = ctx.now - MAX_BEHIND; - for (validator, lci) in &self.last_committed { - if lci.timestamp < min_timestamp { - ctx.report_failure( - validator.clone(), - format!( - "Last commit is {} ms behind: {:?}", - (min_timestamp - lci.timestamp).as_millis(), - lci.ve, - ), - ); - } - } - } - - fn invalidate(&mut self, validator: &str) { - self.last_committed - .insert(validator.into(), LastCommitInfo::default()); - } - - fn name(&self) -> &'static str { - "liveness_check" - } -} diff --git a/testsuite/cluster-test/src/health/log_tail.rs b/testsuite/cluster-test/src/health/log_tail.rs deleted file mode 100644 index 5bd1c906c59e4..0000000000000 --- a/testsuite/cluster-test/src/health/log_tail.rs +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -#![forbid(unsafe_code)] - -use crate::health::ValidatorEvent; -use diem_infallible::duration_since_epoch; -use diem_logger::*; -use std::{ - sync::{ - atomic::{AtomicI64, Ordering}, - mpsc, Arc, - }, - thread, - time::{Duration, Instant}, -}; - -pub struct LogTail { - pub event_receiver: mpsc::Receiver, - pub pending_messages: Arc, -} - -impl LogTail { - pub fn recv_all_until_deadline(&self, deadline: Instant) -> Vec { - let mut events = vec![]; - while Instant::now() < deadline { - match self.event_receiver.try_recv() { - Ok(event) => events.push(event), - Err(..) 
=> thread::sleep(Duration::from_millis(1)), - } - } - let events_count = events.len() as i64; - let prev = self - .pending_messages - .fetch_sub(events_count, Ordering::Relaxed); - let pending = prev - events_count; - let now = duration_since_epoch(); - if let Some(last) = events.last() { - let delay = now - last.received_timestamp; - if delay > Duration::from_secs(1) { - warn!( - "{} Last event delay: {}, pending {}", - now.as_millis(), - delay.as_millis(), - pending - ); - } - } else { - debug!("{} No events", now.as_millis()); - } - events - } - - pub fn recv_all(&self) -> Vec<ValidatorEvent> { - let mut events = vec![]; - while let Ok(event) = self.event_receiver.try_recv() { - self.pending_messages.fetch_sub(1, Ordering::Relaxed); - events.push(event); - } - events - } -} diff --git a/testsuite/cluster-test/src/health/mod.rs b/testsuite/cluster-test/src/health/mod.rs deleted file mode 100644 index 230b168ba75bd..0000000000000 --- a/testsuite/cluster-test/src/health/mod.rs +++ /dev/null @@ -1,255 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -#![forbid(unsafe_code)] - -mod commit_check; -mod debug_interface_log_tail; -mod fullnode_check; -mod liveness_check; -mod log_tail; - -use crate::cluster::Cluster; -use anyhow::{bail, Result}; -use async_trait::async_trait; -pub use commit_check::CommitHistoryHealthCheck; -pub use debug_interface_log_tail::DebugPortLogWorker; -pub use fullnode_check::FullNodeHealthCheck; -use itertools::Itertools; -pub use liveness_check::LivenessHealthCheck; -pub use log_tail::LogTail; -use std::{ - collections::{HashMap, HashSet}, - env, fmt, - time::{Duration, Instant}, -}; -use termion::color::*; - -#[derive(Clone, Debug)] -pub struct Commit { - commit: String, - epoch: u64, - round: u64, - parent: String, -} - -impl Commit { - pub fn epoch_and_round(&self) -> (u64, u64) { - (self.epoch, self.round) - } -} - -#[derive(Clone, Debug)] -pub enum Event { - Commit(Commit), - ConsensusStarted, -} - -#[derive(Clone)] -pub struct ValidatorEvent { - validator: String, - timestamp: Duration, - received_timestamp: Duration, - event: Event, -} - -impl fmt::Debug for ValidatorEvent { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "recv: {}; {} {} {:?}", - self.received_timestamp.as_millis(), - self.timestamp.as_millis(), - self.validator, - self.event - ) - } -} - -#[async_trait] -pub trait HealthCheck: Send { - /// Verify specific event - fn on_event(&mut self, _event: &ValidatorEvent, _ctx: &mut HealthCheckContext) {} - /// Periodic verification (happens even when no events are produced) - async fn verify(&mut self, _ctx: &mut HealthCheckContext); - /// Optionally marks validator as failed, requiring waiting for at least one event from it to - /// mark it as healthy again - fn invalidate(&mut self, _validator: &str) {} - /// `clear` is invoked when the cluster is wiped - /// This means that checks like commit history check should wipe internal state - fn clear(&mut self) {} - - fn name(&self) -> &'static str; -} - -pub struct HealthCheckRunner { - cluster: Cluster, - health_checks: Vec<Box<dyn HealthCheck>>, - debug: bool, -} - -impl HealthCheckRunner { - pub fn new(cluster: Cluster, health_checks: Vec<Box<dyn HealthCheck>>) -> Self { - Self { - cluster, - health_checks, - debug: env::var("HEALTH_CHECK_DEBUG").is_ok(), - } - } - - pub fn new_all(cluster: Cluster) -> Self { - let liveness_health_check = LivenessHealthCheck::new(&cluster); - let fullnode_check = FullNodeHealthCheck::new(cluster.clone()); - Self::new( - cluster, - vec![ -
Box::new(CommitHistoryHealthCheck::new()), - Box::new(liveness_health_check), - Box::new(fullnode_check), - ], - ) - } - - /// Takes a list of affected_validators. If there are validators which failed - /// which were not part of the experiment, then it returns an Err with a string - /// of all the unexpected failures. - /// Otherwise, it returns a list of ALL the failed validators - /// It also takes print_failures parameter that controls level of verbosity of health check - pub async fn run( - &mut self, - events: &[ValidatorEvent], - affected_validators_set: &HashSet, - print_failures: PrintFailures, - ) -> Result> { - let mut node_health = HashMap::new(); - for instance in self.cluster.validator_instances() { - node_health.insert(instance.peer_name().clone(), true); - } - let mut messages = vec![]; - - let mut context = HealthCheckContext::new(); - for health_check in self.health_checks.iter_mut() { - let start = Instant::now(); - for event in events { - health_check.on_event(event, &mut context); - } - let events_processed = Instant::now(); - health_check.verify(&mut context).await; - let verified = Instant::now(); - if self.debug { - messages.push(format!( - "{} {}, on_event time: {}ms, verify time: {}ms, events: {}", - diem_infallible::duration_since_epoch().as_millis(), - health_check.name(), - (events_processed - start).as_millis(), - (verified - events_processed).as_millis(), - events.len(), - )); - } - } - for err in context.err_acc { - node_health.insert(err.validator.clone(), false); - messages.push(format!( - "{} {:?}", - diem_infallible::duration_since_epoch().as_millis(), - err - )); - } - - let mut failed = vec![]; - let mut validators_message = "".to_string(); - for (i, (node, healthy)) in node_health.into_iter().sorted().enumerate() { - if healthy { - validators_message.push_str(&format!("{}* {}{} ", Fg(Green), node, Fg(Reset))); - } else { - validators_message.push_str(&format!("{}* {}{} ", Fg(Red), node, Fg(Reset))); - failed.push(node); - } - if (i + 1) % 15 == 0 { - validators_message.push('\n'); - } - } - messages.push(validators_message); - messages.push(format!("")); - messages.push(format!("")); - - let affected_validators_set_refs: HashSet<_> = affected_validators_set.iter().collect(); - let failed_set: HashSet<_> = failed.iter().collect(); - let has_unexpected_failures = !failed_set.is_subset(&affected_validators_set_refs); - - if print_failures.should_print(has_unexpected_failures) { - messages.iter().for_each(|m| println!("{}", m)); - } - - if has_unexpected_failures { - let unexpected_failures = failed_set - .difference(&affected_validators_set_refs) - .join(","); - bail!(unexpected_failures); - } - Ok(failed) - } - - pub fn invalidate(&mut self, validator: &str) { - for hc in self.health_checks.iter_mut() { - hc.invalidate(validator); - } - } - - pub fn clear(&mut self) { - for hc in self.health_checks.iter_mut() { - hc.clear(); - } - } -} - -pub enum PrintFailures { - None, - UnexpectedOnly, - All, -} - -impl PrintFailures { - fn should_print(&self, has_unexpected_failures: bool) -> bool { - match self { - PrintFailures::None => false, - PrintFailures::UnexpectedOnly => has_unexpected_failures, - PrintFailures::All => true, - } - } -} - -pub struct HealthCheckContext { - now: Duration, - err_acc: Vec, -} - -#[derive(Debug)] -pub struct HealthCheckError { - pub validator: String, - pub message: String, -} - -impl HealthCheckContext { - pub fn new() -> Self { - let now = diem_infallible::duration_since_epoch(); - Self { - now, - err_acc: vec![], - } - } - - 
pub fn now(&self) -> Duration { - self.now - } - - pub fn report_failure(&mut self, validator: String, message: String) { - self.err_acc.push(HealthCheckError { validator, message }) - } -} - -impl Default for HealthCheckContext { - fn default() -> Self { - Self::new() - } -} diff --git a/testsuite/cluster-test/src/instance.rs b/testsuite/cluster-test/src/instance.rs deleted file mode 100644 index 2ec0c464617cf..0000000000000 --- a/testsuite/cluster-test/src/instance.rs +++ /dev/null @@ -1,454 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -#![forbid(unsafe_code)] - -use crate::cluster_swarm::cluster_swarm_kube::ClusterSwarmKube; -use anyhow::{format_err, Result}; -use debug_interface::AsyncNodeDebugClient; -use diem_client::Client as JsonRpcClient; -use diem_config::config::NodeConfig; -use diem_rest_client::Client as RestClient; -use reqwest::{Client, Url}; -use serde_json::Value; -use std::{ - collections::HashSet, - fmt, - process::Stdio, - str::FromStr, - time::{Duration, Instant}, -}; -use tokio::{process::Command, time}; - -#[derive(Debug, Clone, PartialEq)] -pub struct ValidatorGroup { - pub index: u32, - pub twin_index: Option, -} - -#[derive(Debug, Clone)] -pub struct InstanceConfig { - pub validator_group: ValidatorGroup, - pub application_config: ApplicationConfig, -} - -#[derive(Debug, Clone)] -pub enum ApplicationConfig { - Validator(ValidatorConfig), - Fullnode(FullnodeConfig), - LSR(LSRConfig), - Vault(VaultConfig), -} - -#[derive(Debug, Clone)] -pub struct VaultConfig {} - -#[derive(Debug, Clone)] -pub struct LSRConfig { - pub image_tag: String, - pub lsr_backend: String, - pub vault_addr: Option, - pub vault_namespace: Option, -} - -#[derive(Debug, Clone)] -pub struct ValidatorConfig { - pub enable_lsr: bool, - pub image_tag: String, - pub safety_rules_addr: Option, - pub vault_addr: Option, - pub vault_namespace: Option, -} - -#[derive(Debug, Clone)] -pub struct FullnodeConfig { - pub fullnode_index: u32, - pub image_tag: String, - pub seed_peer_ip: String, - pub vault_addr: Option, - pub vault_namespace: Option, -} - -#[derive(Clone)] -pub struct Instance { - peer_name: String, - ip: String, - ac_port: u32, - debug_interface_port: Option, - http_client: Client, - backend: InstanceBackend, -} - -#[derive(Clone)] -enum InstanceBackend { - K8S(K8sInstanceInfo), - Swarm, -} - -#[derive(Clone)] -struct K8sInstanceInfo { - k8s_node: String, - instance_config: InstanceConfig, - kube: ClusterSwarmKube, -} - -impl ValidatorGroup { - pub fn new_for_index(index: u32) -> ValidatorGroup { - Self { - index, - twin_index: None, - } - } - - pub fn index_only(&self) -> u32 { - match self.twin_index { - None => self.index, - _ => panic!("Only validator has twin index"), - } - } -} - -impl ApplicationConfig { - pub fn needs_genesis(&self) -> bool { - matches!(self, Self::Validator(_)) || matches!(self, Self::Fullnode(_)) - } - - pub fn needs_config(&self) -> bool { - matches!(self, Self::Validator(_)) - || matches!(self, Self::Fullnode(_)) - || matches!(self, Self::LSR(_)) - } - - pub fn needs_fluentbit(&self) -> bool { - matches!(self, Self::Validator(_)) - || matches!(self, Self::Fullnode(_)) - || matches!(self, Self::LSR(_)) - } -} - -impl InstanceConfig { - pub fn replace_tag(&mut self, new_tag: String) -> Result<()> { - match &mut self.application_config { - ApplicationConfig::Validator(c) => { - c.image_tag = new_tag; - } - ApplicationConfig::Fullnode(c) => { - c.image_tag = new_tag; - } - ApplicationConfig::LSR(c) => { - c.image_tag = 
new_tag; - } - ApplicationConfig::Vault(..) => { - return Err(format_err!( - "InstanceConfig::Vault does not support custom tags" - )); - } - } - Ok(()) - } - - pub fn pod_name(&self) -> String { - match &self.application_config { - ApplicationConfig::Validator(_) => match self.validator_group.twin_index { - None => validator_pod_name(self.validator_group.index), - twin_index => format!( - "val-{}-twin-{}", - self.validator_group.index, - twin_index.unwrap() - ), - }, - ApplicationConfig::Fullnode(fullnode_config) => { - fullnode_pod_name(self.validator_group.index, fullnode_config.fullnode_index) - } - ApplicationConfig::LSR(_) => lsr_pod_name(self.validator_group.index), - ApplicationConfig::Vault(_) => vault_pod_name(self.validator_group.index), - } - } - - pub fn make_twin(&mut self, twin_index: u32) { - self.validator_group.twin_index = Some(twin_index); - } -} - -impl Instance { - pub fn new( - peer_name: String, - ip: String, - ac_port: u32, - debug_interface_port: Option, - http_client: Client, - ) -> Instance { - let backend = InstanceBackend::Swarm; - Instance { - peer_name, - ip, - ac_port, - debug_interface_port, - http_client, - backend, - } - } - - pub fn new_k8s( - peer_name: String, - ip: String, - ac_port: u32, - k8s_node: String, - instance_config: InstanceConfig, - http_client: Client, - kube: ClusterSwarmKube, - ) -> Instance { - let backend = InstanceBackend::K8S(K8sInstanceInfo { - k8s_node, - instance_config, - kube, - }); - Instance { - peer_name, - ip, - ac_port, - debug_interface_port: Some( - NodeConfig::default() - .debug_interface - .admission_control_node_debug_port as u32, - ), - http_client, - backend, - } - } - - pub fn counter(&self, counter: &str) -> Result { - let response: Value = - reqwest::blocking::get(format!("http://{}:9101/counters", self.ip).as_str())?.json()?; - if let Value::Number(ref response) = response[counter] { - if let Some(response) = response.as_f64() { - Ok(response) - } else { - Err(format_err!( - "Failed to parse counter({}) as f64: {:?}", - counter, - response - )) - } - } else { - Err(format_err!( - "Counter({}) was not a Value::Number: {:?}", - counter, - response[counter] - )) - } - } - - pub async fn try_rest_api(&self) -> Result<()> { - self.rest_client().get_ledger_information().await?; - Ok(()) - } - - pub async fn wait_server_ready(&self, deadline: Instant) -> Result<()> { - loop { - let ret = self.try_rest_api().await; - if ret.is_ok() { - break; - } - if Instant::now() > deadline { - return Err(format_err!( - "wait_server_ready for {} timed out, last error: {:?}", - self, - ret.err().unwrap() - )); - } - time::sleep(Duration::from_secs(3)).await; - } - Ok(()) - } - - pub fn peer_name(&self) -> &String { - &self.peer_name - } - - pub fn validator_group(&self) -> ValidatorGroup { - self.k8s_backend().instance_config.validator_group.clone() - } - - pub fn ip(&self) -> &String { - &self.ip - } - - pub fn ac_port(&self) -> u32 { - self.ac_port - } - - pub fn json_rpc_url(&self) -> Url { - Url::from_str(&format!("http://{}:{}/v1", self.ip(), self.ac_port())).expect("Invalid URL.") - } - - pub fn rest_api_url(&self) -> Url { - Url::from_str(&format!("http://{}:{}", self.ip(), self.ac_port())).expect("Invalid URL.") - } - - fn k8s_backend(&self) -> &K8sInstanceInfo { - if let InstanceBackend::K8S(ref k8s) = self.backend { - return k8s; - } - panic!("Instance was not started with k8s"); - } - - pub fn debug_interface_port(&self) -> Option { - self.debug_interface_port - } - - pub fn json_rpc_client(&self) -> JsonRpcClient { - 
JsonRpcClient::new(self.json_rpc_url().to_string()) - } - - pub fn rest_client(&self) -> RestClient { - RestClient::new(self.json_rpc_url()) - } - - pub async fn stop(&self) -> Result<()> { - let backend = self.k8s_backend(); - backend.kube.delete_node(&backend.instance_config).await - } - - /// Node must be stopped first - pub async fn start(&self) -> Result<()> { - let backend = self.k8s_backend(); - backend - .kube - .upsert_node(backend.instance_config.clone()) - .await - .map(|_| ()) - } - - /// If deleting /opt/diem/data/* is required, call Instance::clean_data before calling - /// Instance::start. - pub async fn clean_data(&self) -> Result<()> { - self.util_cmd("rm -rf /opt/diem/data/*; ", "clean-data") - .await - } - - pub async fn spawn_job( - &self, - docker_image: &str, - command: &str, - job_name: &str, - ) -> Result { - let backend = self.k8s_backend(); - backend - .kube - .spawn_job(&backend.k8s_node, docker_image, command, job_name) - .await - } - - pub fn instance_config(&self) -> &InstanceConfig { - let backend = self.k8s_backend(); - &backend.instance_config - } - - pub async fn cmd<S: AsRef<str>>( - &self, - docker_image: &str, - command: S, - job_name: &str, - ) -> Result<()> { - let backend = self.k8s_backend(); - backend - .kube - .run(&backend.k8s_node, docker_image, command.as_ref(), job_name) - .await - } - - /// Runs a command on the same host in a separate utility container based on the cluster-test-util image - pub async fn util_cmd<S: AsRef<str>>(&self, command: S, job_name: &str) -> Result<()> { - self.cmd( - "853397791086.dkr.ecr.us-west-2.amazonaws.com/cluster-test-util:latest", - command, - job_name, - ) - .await - } - - /// Unlike util_cmd, exec runs the command inside the container - pub async fn exec(&self, command: &str, mute: bool) -> Result<()> { - let mut cmd = Command::new("kubectl"); - cmd.arg("exec") - .arg(&self.peer_name) - .arg("--container") - .arg("main") - .arg("--") - .arg("sh") - .arg("-c") - .arg(command) - .kill_on_drop(true); - if mute { - cmd.stdout(Stdio::null()).stderr(Stdio::null()); - } - let mut child = cmd.spawn().map_err(|e| { - format_err!( - "Failed to spawn child process {} on {}: {}", - command, - self.peer_name(), - e - ) - })?; - let status = child - .wait() - .await - .map_err(|e| format_err!("Error running {} on {}: {}", command, self.peer_name(), e))?; - if !status.success() { - Err(format_err!( - "Running {} on {}, exit code {:?}", - command, - self.peer_name(), - status.code() - )) - } else { - Ok(()) - } - } - - pub fn debug_interface_client(&self) -> AsyncNodeDebugClient { - AsyncNodeDebugClient::new( - self.http_client.clone(), - self.ip(), - self.debug_interface_port - .expect("debug_interface_port is not known on this instance") as u16, - ) - } -} - -impl fmt::Display for Instance { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}({})", self.peer_name, self.ip) - } -} - -impl fmt::Debug for Instance { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self) - } -} - -pub fn instancelist_to_set(instances: &[Instance]) -> HashSet<String> { - let mut r = HashSet::new(); - for instance in instances { - r.insert(instance.peer_name().clone()); - } - r -} - -pub fn validator_pod_name(index: u32) -> String { - format!("val-{}", index) -} - -pub fn vault_pod_name(index: u32) -> String { - format!("vault-{}", index) -} - -pub fn lsr_pod_name(index: u32) -> String { - format!("lsr-{}", index) -} - -pub fn fullnode_pod_name(validator_index: u32, fullnode_index: u32) -> String { - format!("fn-{}-{}", validator_index,
fullnode_index) -} diff --git a/testsuite/cluster-test/src/lib.rs b/testsuite/cluster-test/src/lib.rs deleted file mode 100644 index edbf427ad0f54..0000000000000 --- a/testsuite/cluster-test/src/lib.rs +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -pub mod aws; -pub mod cluster; -pub mod cluster_builder; -pub mod cluster_swarm; -pub mod effects; -pub mod experiments; -pub mod genesis_helper; -pub mod health; -pub mod instance; -pub mod prometheus; -pub mod report; -pub mod stats; -pub mod suite; - -pub mod util { - use crate::instance::Instance; - use forge::EmitJobRequest; - - pub fn emit_job_request_for_instances( - instances: Vec, - global_emit_job_request: &Option, - gas_price: u64, - invalid_tx: usize, - ) -> EmitJobRequest { - let clients = instances - .into_iter() - .map(|instance| instance.rest_client()) - .collect(); - - global_emit_job_request - .clone() - .unwrap_or_default() - .rest_clients(clients) - .gas_price(gas_price) - .invalid_transaction_ratio(invalid_tx) - } - - pub fn human_readable_bytes_per_sec(bytes_per_sec: f64) -> String { - if bytes_per_sec.round() < 1024.0 { - return format!("{:.0} Bps", bytes_per_sec); - } - - let kbytes_per_sec = bytes_per_sec / 1024.0; - if kbytes_per_sec.round() < 1024.0 { - return format!("{:.0} KBps", kbytes_per_sec); - } - - let mbytes_per_sec = kbytes_per_sec / 1024.0; - format!("{:.2} MBps", mbytes_per_sec) - } - - #[cfg(test)] - mod tests { - use crate::util::human_readable_bytes_per_sec; - - #[test] - fn test_human_readable_bytes_per_sec() { - assert_eq!(&human_readable_bytes_per_sec(0.3), "0 Bps"); - assert_eq!(&human_readable_bytes_per_sec(0.7), "1 Bps"); - assert_eq!(&human_readable_bytes_per_sec(1.0), "1 Bps"); - assert_eq!(&human_readable_bytes_per_sec(1023.4), "1023 Bps"); - assert_eq!(&human_readable_bytes_per_sec(1023.5), "1 KBps"); - assert_eq!(&human_readable_bytes_per_sec(1024.0 * 3.5), "4 KBps"); - assert_eq!(&human_readable_bytes_per_sec(1024.0 * 1023.4), "1023 KBps"); - assert_eq!(&human_readable_bytes_per_sec(1024.0 * 1023.5), "1.00 MBps"); - assert_eq!( - &human_readable_bytes_per_sec(1024.0 * 1024.0 * 2.28), - "2.28 MBps" - ); - } - } -} diff --git a/testsuite/cluster-test/src/main.rs b/testsuite/cluster-test/src/main.rs deleted file mode 100644 index c8f30bd533c18..0000000000000 --- a/testsuite/cluster-test/src/main.rs +++ /dev/null @@ -1,877 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -use std::{ - collections::HashSet, - env, fmt, process, - time::{Duration, Instant}, -}; - -use diem_logger::{info, warn}; -use diem_types::chain_id::ChainId; -use rand::{prelude::StdRng, rngs::OsRng, Rng, SeedableRng}; -use reqwest::Url; -use structopt::{clap::ArgGroup, StructOpt}; -use termion::{color, style}; - -use anyhow::{bail, format_err, Result}; -use cluster_test::{ - aws, - cluster::Cluster, - cluster_builder::{ClusterBuilder, ClusterBuilderParams}, - cluster_swarm::{cluster_swarm_kube::ClusterSwarmKube, ClusterSwarm}, - experiments::{get_experiment, Context, Experiment}, - health::{DebugPortLogWorker, HealthCheckRunner, LogTail, PrintFailures}, - instance::Instance, - prometheus::Prometheus, - report::SuiteReport, - suite::ExperimentSuite, -}; -use diem_config::config::DEFAULT_JSON_RPC_PORT; -use diem_sdk::{transaction_builder::TransactionFactory, types::LocalAccount}; -use forge::{ - query_sequence_numbers, EmitJobRequest, EmitThreadParams, GitHub, SlackClient, TxnEmitter, -}; -use futures::{ - 
future::{join_all, FutureExt}, - select, -}; -use itertools::zip; -use std::cmp::min; -use tokio::time::{sleep, sleep_until, Instant as TokioInstant}; - -const HEALTH_POLL_INTERVAL: Duration = Duration::from_secs(5); - -#[derive(StructOpt, Debug)] -#[structopt(group = ArgGroup::with_name("action"))] -struct Args { - #[structopt(short = "p", long, use_delimiter = true, requires = "swarm")] - peers: Vec, - - #[structopt(long, help = "If set, tries to connect to a diem-swarm instead of aws")] - swarm: bool, - #[structopt(long, help = "If set, tries to use public peers instead of localhost")] - vasp: bool, - - #[structopt(long, group = "action")] - run: Option, - #[structopt(long, group = "action")] - health_check: bool, - #[structopt(long, group = "action")] - emit_tx: bool, - #[structopt(long, group = "action", requires = "swarm")] - diag: bool, - #[structopt(long, group = "action")] - no_teardown: bool, - #[structopt(long, group = "action")] - suite: Option, - #[structopt(long, group = "action")] - exec: Option, - - #[structopt(last = true)] - last: Vec, - - #[structopt(long)] - deploy: Option, - #[structopt(long, multiple = true)] - changelog: Option>, - - // emit_tx options - #[structopt(long, default_value = "15")] - accounts_per_client: usize, - #[structopt(long)] - workers_per_ac: Option, - #[structopt(long, default_value = "0")] - wait_millis: u64, - #[structopt(long)] - burst: bool, - #[structopt(long, default_value = "mint.key")] - mint_file: String, - #[structopt(long, default_value = "TESTING")] - chain_id: ChainId, - #[structopt( - long, - help = "Time to run --emit-tx for in seconds", - default_value = "60" - )] - duration: u64, - #[structopt(long, help = "Percentage of invalid txs", default_value = "0")] - invalid_tx: usize, - - #[structopt( - long, - help = "Whether transactions should be submitted to validators or full nodes" - )] - pub emit_to_validator: Option, - - #[structopt( - long, - help = "Wait for given number of seconds if experiment fails. 
This requires the experiment to return an error; it does not catch panics" - )] - pub wait_on_failure: Option<u64>, - - #[structopt(flatten)] - pub cluster_builder_params: ClusterBuilderParams, -} - -#[tokio::main] -pub async fn main() { - setup_log(); - - let args = Args::from_args(); - - if args.swarm && !(args.emit_tx || args.diag || args.health_check) { - panic!("Can only use --emit-tx or --diag or --health-check in --swarm mode"); - } - - if args.diag { - let util = BasicSwarmUtil::setup(&args); - exit_on_error(util.diag(args.vasp).await); - return; - } else if args.emit_tx && args.swarm { - let util = BasicSwarmUtil::setup(&args); - exit_on_error(emit_tx(&util.cluster, &args).await); - return; - } else if args.health_check && args.swarm { - let util = BasicSwarmUtil::setup(&args); - let logs = DebugPortLogWorker::spawn_new(&util.cluster); - let mut health_check_runner = HealthCheckRunner::new_all(util.cluster); - let duration = Duration::from_secs(args.duration); - exit_on_error(run_health_check(&logs, &mut health_check_runner, duration).await); - return; - } - - let wait_on_failure = if let Some(wait_on_failure) = args.wait_on_failure { - if wait_on_failure > 20 * 60 { - println!("wait_on_failure cannot be more than 1200 seconds on a shared cluster"); - process::exit(1); - } - Some(Duration::from_secs(wait_on_failure)) - } else { - None - }; - - let runner = ClusterTestRunner::setup(&args).await; - let mut runner = match runner { - Ok(r) => r, - Err(e) => { - if let Some(wait_on_failure) = wait_on_failure { - warn!( - "Setting up runner failed with {}, waiting for {:?} before terminating", - e, wait_on_failure - ); - sleep(wait_on_failure).await; - } - panic!("Failed to setup cluster test runner: {}", e); - } - }; - - let result = handle_cluster_test_runner_commands(&args, &mut runner).await; - if let Err(e) = &result { - if let Some(wait_on_failure) = wait_on_failure { - warn!( - "Command failed with {}, waiting for {:?} before terminating", - e, wait_on_failure - ); - sleep(wait_on_failure).await; - warn!("Tearing down cluster now"); - } - } - if !args.no_teardown { - runner.teardown().await; - } - let perf_msg = exit_on_error(result); - - if let Some(mut changelog) = args.changelog { - if changelog.len() != 2 { - println!("Use: changelog <from> <to>"); - process::exit(1); - } - let to_commit = changelog.remove(1); - let from_commit = Some(changelog.remove(0)); - if let Some(perf_msg) = perf_msg { - runner.send_changelog_message(&perf_msg, &from_commit, &to_commit); - } else { - println!("{}", runner.get_changelog(from_commit.as_ref(), &to_commit)); - } - } else if let Some(perf_msg) = perf_msg { - println!("{}", perf_msg); - } -} - -// This function contains handlers for commands that require a running cluster to execute -async fn handle_cluster_test_runner_commands( - args: &Args, - runner: &mut ClusterTestRunner, -) -> Result<Option<String>> { - let startup_timeout = Duration::from_secs(5 * 60); - runner - .wait_until_all_healthy(Instant::now() + startup_timeout) - .await - .map_err(|err| { - runner - .report - .report_text(format!("Cluster setup failed: `{}`", err)); - runner.print_report(); - err - })?; - let mut perf_msg = None; - if args.health_check { - let duration = Duration::from_secs(args.duration); - run_health_check(&runner.logs, &mut runner.health_check_runner, duration).await?
- } else if let Some(suite) = args.suite.as_ref() { - perf_msg = Some(runner.run_named_suite(suite).await?); - } else if let Some(experiment_name) = args.run.as_ref() { - runner - .run_and_report(get_experiment(experiment_name, &args.last, &runner.cluster)) - .await?; - info!( - "{}Experiment Result: {}{}", - Bold {}, - runner.report, - Reset {} - ); - } else if args.emit_tx { - emit_tx(&runner.cluster, args).await?; - } else if let Some(ref exec) = args.exec { - let pos = exec.find(':'); - let pos = pos.ok_or_else(|| { - format_err!("Format for exec command is pod:command, for example val-1:date") - })?; - let (pod, cmd) = exec.split_at(pos); - let cmd = &cmd[1..]; - runner.exec_on_pod(pod, cmd).await?; - } - Ok(perf_msg) -} - -fn exit_on_error(r: Result) -> T { - match r { - Ok(r) => r, - Err(err) => { - println!("{}", err); - process::exit(1) - } - } -} - -fn setup_log() { - if env::var("RUST_LOG").is_err() { - env::set_var("RUST_LOG", "info"); - } - ::diem_logger::Logger::new().is_async(true).init(); -} - -struct BasicSwarmUtil { - cluster: Cluster, -} - -struct ClusterTestRunner { - logs: LogTail, - cluster_builder: ClusterBuilder, - cluster_builder_params: ClusterBuilderParams, - cluster: Cluster, - health_check_runner: HealthCheckRunner, - slack: SlackClient, - slack_changelog_url: Option, - root_account: LocalAccount, - treasury_compliance_account: LocalAccount, - designated_dealer_account: LocalAccount, - prometheus: Prometheus, - github: GitHub, - report: SuiteReport, - global_emit_job_request: EmitJobRequest, - emit_to_validator: bool, - cluster_swarm: ClusterSwarmKube, - current_tag: String, -} - -fn parse_host_port(s: &str) -> Result<(String, u32, Option)> { - let v = s.split(':').collect::>(); - if v.len() == 1 { - let default_port = DEFAULT_JSON_RPC_PORT as u32; - return Ok((v[0].to_string(), default_port, None)); - } - if v.len() != 2 && v.len() != 3 { - return Err(format_err!( - "Failed to parse {:?} in host:port or host:port:debug_interface_port format", - s - )); - } - let host = v[0].to_string(); - let port = v[1].parse::()?; - if v.len() == 3 { - let debug_interface_port = v[2].parse::()?; - return Ok((host, port, Some(debug_interface_port))); - } - Ok((host, port, None)) -} - -async fn emit_tx(cluster: &Cluster, args: &Args) -> Result<()> { - let thread_params = EmitThreadParams { - wait_millis: args.wait_millis, - wait_committed: !args.burst, - }; - let duration = Duration::from_secs(args.duration); - let client = cluster.random_validator_instance().rest_client(); - let mut treasury_compliance_account = cluster.load_tc_account(&client).await?; - let mut designated_dealer_account = cluster.load_faucet_account(&client).await?; - let mut emitter = TxnEmitter::new( - &mut treasury_compliance_account, - &mut designated_dealer_account, - client, - TransactionFactory::new(cluster.chain_id), - StdRng::from_seed(OsRng.gen()), - ); - let mut emit_job_request = EmitJobRequest::new( - cluster - .validator_instances() - .iter() - .map(Instance::rest_client) - .collect(), - ) - .accounts_per_client(args.accounts_per_client) - .thread_params(thread_params) - .invalid_transaction_ratio(args.invalid_tx); - if let Some(workers_per_endpoint) = args.workers_per_ac { - emit_job_request = emit_job_request.workers_per_endpoint(workers_per_endpoint); - } - let stats = emitter - .emit_txn_for_with_stats(duration, emit_job_request, 10) - .await?; - println!("Total stats: {}", stats); - println!("Average rate: {}", stats.rate(duration)); - Ok(()) -} - -async fn run_health_check( - logs: 
&LogTail, - health_check_runner: &mut HealthCheckRunner, - duration: Duration, -) -> Result<()> { - let health_check_deadline = Instant::now() + duration; - loop { - let deadline = Instant::now() + Duration::from_secs(1); - // Receive all events that arrived to log tail within next 1 second - // This assumes so far that event propagation time is << 1s, this need to be refined - // in future to account for actual event propagation delay - let events = logs.recv_all_until_deadline(deadline); - let result = health_check_runner - .run(&events, &HashSet::new(), PrintFailures::All) - .await; - let now = Instant::now(); - if now > health_check_deadline { - return result.map(|_| ()); - } - } -} - -impl BasicSwarmUtil { - pub fn setup(args: &Args) -> Self { - if args.peers.is_empty() { - panic!("Peers not set in args"); - } - let parsed_peers: Vec<_> = args - .peers - .iter() - .map(|peer| parse_host_port(peer).expect("Failed to parse host_port")) - .collect(); - - let cluster = - Cluster::from_host_port(parsed_peers, &args.mint_file, args.chain_id, args.vasp); - Self { cluster } - } - - pub async fn diag(&self, vasp: bool) -> Result<()> { - let client = self.cluster.random_validator_instance().rest_client(); - let mut treasury_compliance_account = self.cluster.load_tc_account(&client).await?; - let mut designated_dealer_account = self.cluster.load_faucet_account(&client).await?; - let emitter = TxnEmitter::new( - &mut treasury_compliance_account, - &mut designated_dealer_account, - client, - TransactionFactory::new(self.cluster.chain_id), - StdRng::from_seed(OsRng.gen()), - ); - let mut faucet_account: Option = None; - let instances: Vec<_> = self.cluster.validator_and_fullnode_instances().collect(); - for instance in &instances { - let client = instance.rest_client(); - print!("Getting faucet account sequence number on {}...", instance); - let account = if vasp { - self.cluster - .load_dd_account(&client) - .await - .map_err(|e| format_err!("Failed to get dd account: {}", e))? - } else { - self.cluster - .load_faucet_account(&client) - .await - .map_err(|e| { - format_err!("Failed to get faucet account sequence number: {}", e) - })? 
- }; - println!("seq={}", account.sequence_number()); - if let Some(faucet_account) = &faucet_account { - if account.sequence_number() != faucet_account.sequence_number() { - bail!( - "Loaded sequence number {}, which is different from seen before {}", - account.sequence_number(), - faucet_account.sequence_number() - ); - } - } else { - faucet_account = Some(account); - } - } - let mut faucet_account = - faucet_account.expect("There is no faucet account set (not expected)"); - let faucet_account_address = faucet_account.address(); - for instance in &instances { - print!("Submitting txn through {}...", instance); - let deadline = emitter - .submit_single_transaction( - &instance.rest_client(), - &mut faucet_account, - &faucet_account_address, - 10, - ) - .await - .map_err(|e| format_err!("Failed to submit txn through {}: {}", instance, e))?; - println!("seq={}", faucet_account.sequence_number()); - println!( - "Waiting all full nodes to get to seq {}", - faucet_account.sequence_number() - ); - loop { - let addresses = &[faucet_account_address]; - let clients = instances - .iter() - .map(|instance| instance.rest_client()) - .collect::>(); - let futures = clients - .iter() - .map(|client| query_sequence_numbers(client, addresses)); - let results = join_all(futures).await; - let mut all_good = true; - for (instance, result) in zip(instances.iter(), results) { - let seq = result.map_err(|e| { - format_err!("Failed to query sequence number from {}: {}", instance, e) - })?[0]; - let ip = instance.ip(); - let color = if seq != faucet_account.sequence_number() { - all_good = false; - color::Fg(color::Red).to_string() - } else { - color::Fg(color::Green).to_string() - }; - print!( - "[{}{}:{}{}] ", - color, - &ip[..min(ip.len(), 10)], - seq, - color::Fg(color::Reset) - ); - } - println!(); - if all_good { - break; - } - if Instant::now() > deadline { - bail!("Not all full nodes were updated and transaction expired"); - } - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - println!("Looks like all full nodes are healthy!"); - Ok(()) - } -} - -impl ClusterTestRunner { - pub async fn teardown(&mut self) { - self.cluster_swarm.cleanup().await.expect("Cleanup failed"); - let workspace = self - .cluster_swarm - .get_workspace() - .await - .expect("Failed to get workspace"); - let asg_name = format!("{}-k8s-testnet-validators", workspace); - aws::set_asg_size(0, 0.0, &asg_name, false, true) - .await - .unwrap_or_else(|_| panic!("{} scaling failed", asg_name)); - } - - /// Discovers cluster, setup log, etc - pub async fn setup(args: &Args) -> Result { - let start_time = Instant::now(); - let current_tag = args.deploy.as_deref().unwrap_or("master"); - let cluster_swarm = ClusterSwarmKube::new() - .await - .map_err(|e| format_err!("Failed to initialize ClusterSwarmKube: {}", e))?; - let prometheus_ip = "diem-testnet-prometheus-server.default.svc.cluster.local"; - let grafana_base_url = cluster_swarm - .get_grafana_baseurl() - .await - .expect("Failed to discover grafana url in k8s"); - let prometheus = Prometheus::new(prometheus_ip, grafana_base_url); - let cluster_builder = ClusterBuilder::new(current_tag.to_string(), cluster_swarm.clone()); - let cluster_builder_params = args.cluster_builder_params.clone(); - let cluster = cluster_builder - .setup_cluster(&cluster_builder_params, true) - .await - .map_err(|e| format_err!("Failed to setup cluster: {}", e))?; - let log_tail_started = Instant::now(); - let logs = DebugPortLogWorker::spawn_new(&cluster); - let log_tail_startup_time = Instant::now() - 
log_tail_started; - info!( - "Log tail thread started in {} ms", - log_tail_startup_time.as_millis() - ); - let health_check_runner = HealthCheckRunner::new_all(cluster.clone()); - let slack = SlackClient::new(); - let slack_changelog_url = env::var("SLACK_CHANGELOG_URL") - .map(|u| u.parse().expect("Failed to parse SLACK_CHANGELOG_URL")) - .ok(); - let client = cluster.random_validator_instance().rest_client(); - let root_account = cluster.load_diem_root_account(&client).await?; - let treasury_compliance_account = cluster.load_tc_account(&client).await?; - let designated_dealer_account = cluster.load_faucet_account(&client).await?; - let github = GitHub::new(); - let mut report = SuiteReport::new(); - let end_time = (Instant::now() - start_time).as_secs() as u64; - report.report_text(format!("Test runner setup time spent {} secs", end_time)); - let mut global_emit_job_request = EmitJobRequest::default() - .accounts_per_client(args.accounts_per_client) - .thread_params(EmitThreadParams { - wait_millis: args.wait_millis, - wait_committed: !args.burst, - }) - .invalid_transaction_ratio(args.invalid_tx); - if let Some(workers_per_endpoint) = args.workers_per_ac { - global_emit_job_request = - global_emit_job_request.workers_per_endpoint(workers_per_endpoint); - } - let emit_to_validator = - if cluster.fullnode_instances().len() < cluster.validator_instances().len() { - true - } else { - args.emit_to_validator.unwrap_or(false) - }; - Ok(Self { - logs, - cluster_builder, - cluster_builder_params, - cluster, - health_check_runner, - slack, - slack_changelog_url, - root_account, - treasury_compliance_account, - designated_dealer_account, - prometheus, - github, - report, - global_emit_job_request, - emit_to_validator, - cluster_swarm, - current_tag: current_tag.to_string(), - }) - } - - pub fn send_changelog_message( - &self, - perf_msg: &str, - from_commit: &Option, - to_commit: &str, - ) { - info!( - "Generating changelog from {:?} to {}", - from_commit, to_commit - ); - let changelog = self.get_changelog(from_commit.as_ref(), to_commit); - self.slack_changelog_message(format!("{}\n\n{}", changelog, perf_msg)); - } - - fn get_changelog(&self, prev_commit: Option<&String>, upstream_commit: &str) -> String { - let commits = self.github.get_commits("diem/diem", upstream_commit); - match commits { - Err(e) => { - info!("Failed to get github commits: {:?}", e); - format!("*Revision upstream_{}*", upstream_commit) - } - Ok(commits) => { - let mut msg = format!("*Revision {}*", upstream_commit); - for commit in commits { - if let Some(prev_commit) = prev_commit { - if commit.sha.starts_with(prev_commit) { - break; - } - } - let commit_lines: Vec<_> = commit.commit.message.split('\n').collect(); - let commit_head = commit_lines[0]; - let commit_head = commit_head.replace("[breaking]", "*[breaking]*"); - let short_sha = &commit.sha[..6]; - let email_parts: Vec<_> = commit.commit.author.email.split('@').collect(); - let author = email_parts[0]; - let line = format!("\n>\u{2022} {} _{}_ {}", short_sha, author, commit_head); - msg.push_str(&line); - } - msg - } - } - } - - async fn run_suite(&mut self, suite: ExperimentSuite) -> Result<()> { - info!("Starting suite"); - let suite_started = Instant::now(); - for experiment in suite.experiments { - let start_time = Instant::now(); - let experiment_name = format!("{}", experiment); - let experiment_result = self - .run_single_experiment(experiment, None) - .await - .map_err(move |e| format_err!("Experiment `{}` failed: `{}`", experiment_name, e)); - let 
end_time = (Instant::now() - start_time).as_secs() as u64; - if let Err(e) = experiment_result.as_ref() { - self.report.report_text(e.to_string()); - self.print_report(); - experiment_result?; - } - self.report - .report_text_same_line(format!(", time spent {} secs", end_time)) - } - info!( - "Suite completed in {:?}", - Instant::now().duration_since(suite_started) - ); - self.print_report(); - Ok(()) - } - - pub fn print_report(&self) { - let json_report = - serde_json::to_string_pretty(&self.report).expect("Failed to serialize report to json"); - info!( - "\n====json-report-begin===\n{}\n====json-report-end===", - json_report - ); - } - - pub async fn run_named_suite(&mut self, name: &str) -> Result { - let suite = ExperimentSuite::new_by_name(&self.cluster, name)?; - self.run_suite(suite).await?; - Ok(self.report.to_string()) - } - - pub async fn run_and_report(&mut self, experiment: Box) -> Result<()> { - let experiment_name = format!("{}", experiment); - match self - .run_single_experiment(experiment, Some(self.global_emit_job_request.clone())) - .await - { - Ok(_) => { - self.print_report(); - Ok(()) - } - Err(err) => { - self.report.report_text(format!( - "Experiment `{}` failed: `{}`", - experiment_name, err - )); - self.print_report(); - Err(err) - } - } - } - - pub async fn run_single_experiment( - &mut self, - experiment: Box, - global_emit_job_request: Option, - ) -> Result<()> { - let events = self.logs.recv_all(); - if let Err(s) = self - .health_check_runner - .run(&events, &HashSet::new(), PrintFailures::UnexpectedOnly) - .await - { - bail!( - "Some validators are unhealthy before experiment started : {}", - s - ); - } - - info!( - "{}Starting experiment {}{}{}{}", - Bold {}, - color::Fg(color::Blue), - experiment.to_string(), - color::Fg(color::Reset), - Reset {} - ); - - let deadline = Instant::now() + experiment.deadline(); - - self.experiment_loop(experiment, global_emit_job_request, deadline) - .await?; - - info!( - "{}Experiment finished, waiting until all affected validators recover{}", - Bold {}, - Reset {} - ); - - self.wait_until_all_healthy(deadline).await?; - - info!("Experiment completed"); - Ok(()) - } - - // inner poll loop of run_single_experiment - // do not use this fn, use run_single_experiment to run experiments - async fn experiment_loop( - &mut self, - mut experiment: Box, - mut global_emit_job_request: Option, - deadline: Instant, - ) -> Result<()> { - let affected_validators = experiment.affected_validators(); - let mut context = Context::new( - &mut self.root_account, - &mut self.treasury_compliance_account, - &mut self.designated_dealer_account, - &self.prometheus, - &mut self.cluster_builder, - &self.cluster_builder_params, - &self.cluster, - &mut self.report, - &mut global_emit_job_request, - self.emit_to_validator, - &self.cluster_swarm, - &self.current_tag[..], - ); - let deadline_future = sleep_until(TokioInstant::from_std(deadline)).fuse(); - let mut run_future = experiment.run(&mut context).fuse(); - let sleep = sleep(HEALTH_POLL_INTERVAL).fuse(); - tokio::pin!(sleep); - tokio::pin!(deadline_future); - - loop { - select! 
{ - _delay = deadline_future => { - bail!("Experiment deadline reached"); - } - result = run_future => { - return result.map_err(|e|format_err!("Failed to run experiment: {}", e)); - } - _delay = sleep => { - let events = self.logs.recv_all(); - if let Err(s) = self.health_check_runner.run( - &events, - &affected_validators, - PrintFailures::UnexpectedOnly, - ).await { - bail!("Validators which were not under experiment failed : {}", s); - } - } - } - } - } - - async fn wait_until_all_healthy(&mut self, deadline: Instant) -> Result<()> { - info!("Waiting for all nodes to be healthy"); - for instance in self.cluster.validator_instances() { - self.health_check_runner.invalidate(instance.peer_name()); - } - loop { - let now = Instant::now(); - if now > deadline { - bail!("Nodes did not become healthy after deployment"); - } - let deadline = now + HEALTH_POLL_INTERVAL; - let events = self.logs.recv_all_until_deadline(deadline); - if let Ok(failed_instances) = self - .health_check_runner - .run(&events, &HashSet::new(), PrintFailures::None) - .await - { - if failed_instances.is_empty() { - break; - } - } - } - info!( - "All nodes are now healthy. Checking json rpc endpoints of validators and full nodes" - ); - loop { - let results = join_all( - self.cluster - .validator_and_fullnode_instances() - .map(Instance::try_rest_api), - ) - .await; - - if results.iter().all(Result::is_ok) { - break; - } - if Instant::now() > deadline { - for (instance, result) in - zip(self.cluster.validator_and_fullnode_instances(), results) - { - if let Err(err) = result { - warn!("Instance {} still unhealthy: {}", instance, err); - } - } - bail!("Some json rpc endpoints did not become healthy after deployment"); - } - } - info!("All json rpc endpoints are healthy"); - Ok(()) - } - - fn slack_changelog_message(&self, msg: String) { - info!("{}", msg); - if let Some(ref changelog_url) = self.slack_changelog_url { - if let Err(e) = self.slack.send_message(changelog_url, &msg) { - info!("Failed to send slack message: {}", e); - } - } - } - - pub async fn exec_on_pod(&self, pod: &str, cmd: &str) -> Result<()> { - let instance = self - .cluster - .find_instance_by_pod(pod) - .ok_or_else(|| format_err!("Can not find instance with pod {}", pod))?; - instance.exec(cmd, false).await - } -} - -struct Bold {} - -struct Reset {} - -impl fmt::Debug for Bold { - fn fmt(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result { - Ok(()) - } -} - -impl fmt::Display for Bold { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", style::Bold) - } -} - -impl fmt::Debug for Reset { - fn fmt(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result { - Ok(()) - } -} - -impl fmt::Display for Reset { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", style::Reset) - } -} diff --git a/testsuite/cluster-test/src/prometheus.rs b/testsuite/cluster-test/src/prometheus.rs deleted file mode 100644 index 63d2abf7341d2..0000000000000 --- a/testsuite/cluster-test/src/prometheus.rs +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -#![forbid(unsafe_code)] - -use anyhow::{anyhow, bail, format_err, Result}; -use reqwest::Url; -use serde::Deserialize; -use std::{collections::HashMap, time::Duration}; - -#[derive(Clone)] -pub struct Prometheus { - url: Url, - client: reqwest::blocking::Client, - grafana_base_url: Url, -} - -pub struct MatrixResponse { - inner: HashMap, -} - -pub struct TimeSeries { - inner: Vec<(u64, f64)>, -} - -impl Prometheus { 
- pub fn new(ip: &str, grafana_base_url: String) -> Self { - let url = format!("http://{}:80", ip) - .parse() - .expect("Failed to parse prometheus url"); - let grafana_base_url = grafana_base_url - .parse() - .expect("Failed to parse prometheus public url"); - let client = reqwest::blocking::Client::new(); - Self { - url, - client, - grafana_base_url, - } - } - - pub fn link_to_dashboard(&self, start: Duration, end: Duration) -> String { - format!( - "{}d/performance/performance?orgId=1&from={}&to={}", - self.grafana_base_url, - start.as_millis(), - end.as_millis() - ) - } - - fn query_range( - &self, - query: String, - start: &Duration, - end: &Duration, - step: u64, - ) -> Result { - let url = self - .url - .join(&format!( - "api/v1/query_range?query={}&start={}&end={}&step={}", - query, - start.as_secs(), - end.as_secs(), - step - )) - .map_err(|e| { - anyhow!( - "Failed to make query range due to unparseable url: {} resulting in Error: {}", - self.url, - e - ) - })?; - let response = self - .client - .get(url.clone()) - .send() - .map_err(|e| format_err!("Failed to query prometheus: {:?}", e))?; - - // We don't check HTTP error code here - // Prometheus supplies error status in json response along with error text - - let response: PrometheusResponse = response.json().map_err(|e| { - format_err!("Failed to parse prometheus response: {:?}. Url: {}", e, url) - })?; - - match response.data { - Some(data) => MatrixResponse::from_prometheus(data), - None => bail!( - "Prometheus query failed: {} {}", - response.error_type, - response.error - ), - } - } - pub fn query_range_avg( - &self, - query: String, - start: &Duration, - end: &Duration, - step: u64, - ) -> Result { - let response = self.query_range(query, start, end, step)?; - response - .avg() - .ok_or_else(|| format_err!("Failed to compute avg")) - } -} - -impl MatrixResponse { - pub fn avg(&self) -> Option { - if self.inner.is_empty() { - return None; - } - let mut sum = 0.; - let mut count = 0usize; - for time_series in self.inner.values() { - if let Some(ts_avg) = time_series.avg() { - sum += ts_avg; - count += 1; - } - } - if count == 0 { - None - } else { - Some(sum / (count as f64)) - } - } -} - -impl TimeSeries { - pub fn get(&self) -> &[(u64, f64)] { - &self.inner - } - - pub fn avg(&self) -> Option { - let mut sum = 0.; - let mut count = 0usize; - for (_, v) in self.inner.iter() { - if !v.is_normal() { - // Some time series can return NaN (for example, latency query that has division in - // it). 
If we include this NaN in sum, it will 'poison' it - if one - // of values is NaN, sum will be NaN too, and avg will be NaN - // Instead of poisoning, we simply ignore NaN values when calculating avg - continue; - } - sum += *v; - count += 1; - } - if count == 0 { - None - } else { - Some(sum / (count as f64)) - } - } -} - -#[derive(Debug, Deserialize)] -struct PrometheusResponse { - data: Option, - #[serde(default)] - error: String, - #[serde(alias = "errorType")] - #[serde(default)] - error_type: String, - status: String, -} - -#[derive(Debug, Deserialize)] -struct PrometheusData { - result: Vec, -} - -#[derive(Debug, Deserialize)] -struct PrometheusResult { - metric: PrometheusMetric, - values: Vec<(u64, String)>, -} - -#[derive(Debug, Deserialize)] -struct PrometheusMetric { - op: Option, - peer_id: String, -} - -impl MatrixResponse { - fn from_prometheus(data: PrometheusData) -> Result { - let mut inner = HashMap::new(); - for entry in data.result { - let peer_id = entry.metric.peer_id; - if entry.values.is_empty() { - continue; - } - let time_series = TimeSeries::from_prometheus(entry.values)?; - inner.insert(peer_id, time_series); - } - Ok(Self { inner }) - } -} - -impl TimeSeries { - fn from_prometheus(values: Vec<(u64, String)>) -> Result { - let mut inner = vec![]; - for (ts, value) in values { - let value = value.parse().map_err(|e| { - format_err!("Failed to parse entry in prometheus time series: {:?}", e) - })?; - inner.push((ts, value)); - } - Ok(TimeSeries { inner }) - } -} diff --git a/testsuite/cluster-test/src/report.rs b/testsuite/cluster-test/src/report.rs deleted file mode 100644 index 57dc20bd3463b..0000000000000 --- a/testsuite/cluster-test/src/report.rs +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -use forge::TxnStats; -use serde::Serialize; -use std::{fmt, time::Duration}; - -#[derive(Default, Debug, Serialize)] -pub struct SuiteReport { - metrics: Vec, - text: String, -} - -#[derive(Debug, Serialize)] -pub struct ReportedMetric { - pub experiment: String, - pub metric: String, - pub value: f64, -} - -impl SuiteReport { - pub fn new() -> Self { - Default::default() - } - - pub fn report_metric( - &mut self, - experiment: E, - metric: M, - value: f64, - ) { - self.metrics.push(ReportedMetric { - experiment: experiment.to_string(), - metric: metric.to_string(), - value, - }); - } - - pub fn report_text(&mut self, text: String) { - if !self.text.is_empty() { - self.text.push('\n'); - } - self.text.push_str(&text); - } - - pub fn report_text_same_line(&mut self, text: String) { - self.text.push_str(&text); - } - - pub fn report_txn_stats( - &mut self, - experiment: String, - stats: TxnStats, - window: Duration, - additional: &str, - ) { - let submitted_txn = stats.submitted; - let expired_txn = stats.expired; - let avg_tps = stats.committed / window.as_secs(); - let avg_latency_client = if stats.committed == 0 { - 0u64 - } else { - stats.latency / stats.committed - }; - let p99_latency = stats.latency_buckets.percentile(99, 100); - self.report_metric(experiment.clone(), "submitted_txn", submitted_txn as f64); - self.report_metric(experiment.clone(), "expired_txn", expired_txn as f64); - self.report_metric(experiment.clone(), "avg_tps", avg_tps as f64); - self.report_metric(experiment.clone(), "avg_latency", avg_latency_client as f64); - self.report_metric(experiment.clone(), "p99_latency", p99_latency as f64); - let expired_text = if expired_txn == 0 { - "no expired txns".to_string() - } else { - 
format!("(!) expired {} out of {} txns", expired_txn, submitted_txn) - }; - self.report_text(format!( - "{} : {:.0} TPS, {:.1} ms latency, {:.1} ms p99 latency,{} {}", - experiment, avg_tps, avg_latency_client, p99_latency, additional, expired_text - )); - } -} - -impl fmt::Display for SuiteReport { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.text) - } -} diff --git a/testsuite/cluster-test/src/stats.rs b/testsuite/cluster-test/src/stats.rs deleted file mode 100644 index 332c9be2849a1..0000000000000 --- a/testsuite/cluster-test/src/stats.rs +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -use crate::prometheus::Prometheus; -use anyhow::format_err; -use std::time::Duration; - -pub struct PrometheusRangeView<'a> { - prometheus: &'a Prometheus, - start: Duration, - end: Duration, -} - -impl<'a> PrometheusRangeView<'a> { - pub fn new(prometheus: &'a Prometheus, start: Duration, end: Duration) -> Self { - Self { - prometheus, - start, - end, - } - } - - pub fn avg_txns_per_block(&self) -> Option { - self.query_avg( - "txn_per_block", - "irate(diem_consensus_num_txns_per_block_sum[1m])/irate(diem_consensus_num_txns_per_block_count[1m])".to_string(), - ) - } - - pub fn avg_backup_bytes_per_second(&self) -> Option { - self.query_avg( - "backup_bytes_per_second", - "sum(irate(diem_backup_service_sent_bytes[1m])) by(peer_id)".to_string(), - ) - } -} - -impl<'a> PrometheusRangeView<'a> { - const STEP: u64 = 10; - - fn query_avg(&self, name: &str, query: String) -> Option { - self.prometheus - .query_range_avg(query, &self.start, &self.end, Self::STEP) - .map_err(|e| format_err!("No {} data: {}", name, e)) - .ok() - } -} diff --git a/testsuite/cluster-test/src/suite.rs b/testsuite/cluster-test/src/suite.rs deleted file mode 100644 index 88da93f548738..0000000000000 --- a/testsuite/cluster-test/src/suite.rs +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) The Diem Core Contributors -// SPDX-License-Identifier: Apache-2.0 - -#![forbid(unsafe_code)] -use std::{cmp::min, env}; - -use crate::{ - cluster::Cluster, - experiments::{ - CompatiblityTestParams, CpuFlamegraphParams, Experiment, ExperimentParam, - PerformanceBenchmarkParams, PerformanceBenchmarkThreeRegionSimulationParams, - RebootRandomValidatorsParams, ReconfigurationParams, RecoveryTimeParams, - StateSyncPerformanceParams, TwinValidatorsParams, ValidatorVersioningParams, - }, -}; -use anyhow::{format_err, Result}; - -pub struct ExperimentSuite { - pub experiments: Vec>, -} - -impl ExperimentSuite { - fn new_pre_release(cluster: &Cluster) -> Self { - let mut experiments: Vec> = vec![]; - if env::var("RECOVERY_EXP").is_ok() { - experiments.push(Box::new( - RecoveryTimeParams { - num_accounts_to_mint: 100_000, - } - .build(cluster), - )); - } - let count = min(3, cluster.validator_instances().len() / 3); - // Reboot different sets of 3 validators *100 times - for _ in 0..10 { - let b = Box::new(RebootRandomValidatorsParams::new(count, 0).build(cluster)); - experiments.push(b); - } - experiments.push(Box::new( - PerformanceBenchmarkParams::non_zero_gas_price(0, 1) - .enable_db_backup() - .build(cluster), - )); - experiments.push(Box::new( - PerformanceBenchmarkParams::new_nodes_down(0) - .enable_db_backup() - .build(cluster), - )); - experiments.push(Box::new( - PerformanceBenchmarkParams::new_nodes_down(10) - .enable_db_backup() - .build(cluster), - )); - experiments.push(Box::new( - PerformanceBenchmarkThreeRegionSimulationParams 
-        ));
-        experiments.push(Box::new(
-            PerformanceBenchmarkParams::new_fixed_tps(0, 10)
-                .enable_db_backup()
-                .build(cluster),
-        ));
-        experiments.push(Box::new(StateSyncPerformanceParams::new(30).build(cluster)));
-        experiments.push(Box::new(TwinValidatorsParams { pair: 1 }.build(cluster)));
-        // This can't be run before any experiment that requires clean_data.
-        experiments.push(Box::new(
-            ReconfigurationParams {
-                count: 101,
-                emit_txn: false,
-            }
-            .build(cluster),
-        ));
-        experiments.push(Box::new(
-            CpuFlamegraphParams { duration_secs: 60 }.build(cluster),
-        ));
-        Self { experiments }
-    }
-
-    fn new_twin_suite(cluster: &Cluster) -> Self {
-        let experiments: Vec<Box<dyn Experiment>> = vec![
-            Box::new(TwinValidatorsParams { pair: 1 }.build(cluster)),
-            Box::new(CpuFlamegraphParams { duration_secs: 60 }.build(cluster)),
-        ];
-        Self { experiments }
-    }
-
-    fn new_perf_suite(cluster: &Cluster) -> Self {
-        let experiments: Vec<Box<dyn Experiment>> = vec![
-            Box::new(PerformanceBenchmarkParams::new_nodes_down(0).build(cluster)),
-            Box::new(PerformanceBenchmarkParams::new_nodes_down(10).build(cluster)),
-            Box::new(PerformanceBenchmarkThreeRegionSimulationParams {}.build(cluster)),
-            Box::new(PerformanceBenchmarkParams::new_fixed_tps(0, 10).build(cluster)),
-        ];
-        Self { experiments }
-    }
-
-    fn new_land_blocking_suite(cluster: &Cluster) -> Self {
-        let experiments: Vec<Box<dyn Experiment>> = vec![Box::new(
-            PerformanceBenchmarkParams::new_nodes_down(0).build(cluster),
-        )];
-        Self { experiments }
-    }
-
-    fn new_land_blocking_compat_suite(cluster: &Cluster) -> Result<Self> {
-        let count: usize = match env::var("BATCH_SIZE") {
-            Ok(val) => val
-                .parse()
-                .map_err(|e| format_err!("Failed to parse BATCH_SIZE {}: {}", val, e))?,
-            Err(_) => cluster.validator_instances().len() / 2,
-        };
-        let updated_image_tag = env::var("UPDATE_TO_TAG")
-            .map_err(|_| format_err!("Expected environment variable UPDATE_TO_TAG"))?;
-        let mut experiments: Vec<Box<dyn Experiment>> = vec![Box::new(
-            CompatiblityTestParams {
-                count,
-                updated_image_tag,
-            }
-            .build(cluster),
-        )];
-        experiments.extend(Self::new_land_blocking_suite(cluster).experiments);
-        Ok(Self { experiments })
-    }
-
-    fn new_versioning_suite(cluster: &Cluster) -> Result<Self> {
-        let count: usize = match env::var("BATCH_SIZE") {
-            Ok(val) => val
-                .parse()
-                .map_err(|e| format_err!("Failed to parse BATCH_SIZE {}: {}", val, e))?,
-            Err(_) => cluster.validator_instances().len() / 2,
-        };
-        let updated_image_tag = env::var("UPDATE_TO_TAG")
-            .map_err(|_| format_err!("Expected environment variable UPDATE_TO_TAG"))?;
-        let experiments: Vec<Box<dyn Experiment>> = vec![Box::new(
-            ValidatorVersioningParams {
-                count,
-                updated_image_tag,
-            }
-            .build(cluster),
-        )];
-        Ok(Self { experiments })
-    }
-
-    fn new_invalid_tx_suite(cluster: &Cluster) -> Self {
-        let experiments: Vec<Box<dyn Experiment>> = vec![
-            Box::new(PerformanceBenchmarkParams::new_nodes_down(0).build(cluster)),
-            Box::new(PerformanceBenchmarkParams::mix_invalid_tx(0, 10).build(cluster)),
-        ];
-        Self { experiments }
-    }
-
-    fn new_state_sync_suite(cluster: &Cluster) -> Self {
-        let experiments: Vec<Box<dyn Experiment>> =
-            vec![Box::new(StateSyncPerformanceParams::new(30).build(cluster))];
-        Self { experiments }
-    }
-
-    pub fn new_by_name(cluster: &Cluster, name: &str) -> Result<Self> {
-        match name {
-            "perf" => Ok(Self::new_perf_suite(cluster)),
-            "pre_release" => Ok(Self::new_pre_release(cluster)),
-            "twin" => Ok(Self::new_twin_suite(cluster)),
-            "land_blocking" => Ok(Self::new_land_blocking_suite(cluster)),
-            "land_blocking_compat" => Self::new_land_blocking_compat_suite(cluster),
-            "versioning" => Self::new_versioning_suite(cluster),
-            "invalid" => Ok(Self::new_invalid_tx_suite(cluster)),
-            "state_sync" => Ok(Self::new_state_sync_suite(cluster)),
-            other => Err(format_err!("Unknown suite: {}", other)),
-        }
-    }
-}
diff --git a/testsuite/diem-swarm/src/main.rs b/testsuite/diem-swarm/src/main.rs
index b744c04cb52c4..2afca1491944b 100644
--- a/testsuite/diem-swarm/src/main.rs
+++ b/testsuite/diem-swarm/src/main.rs
@@ -90,30 +90,6 @@ fn main() {
         })
         .collect::<Vec<_>>();
 
-    let node_address_list = ports
-        .iter()
-        .map(|port| format!("localhost:{}", port.0))
-        .collect::<Vec<_>>()
-        .join(",");
-
-    println!("To run transaction generator run:");
-    println!(
-        "\tcluster-test --mint-file {:?} --swarm --peers {:?} --emit-tx --workers-per-ac 1",
-        diem_root_key_path, node_address_list,
-    );
-
-    let node_address_list = ports
-        .iter()
-        .map(|port| format!("localhost:{}:{}", port.0, port.1))
-        .collect::<Vec<_>>()
-        .join(",");
-
-    println!("To run health check:");
-    println!(
-        "\tcluster-test --mint-file {:?} --swarm --peers {:?} --health-check --duration 30",
-        diem_root_key_path, node_address_list,
-    );
-
     let _faucet = if args.start_faucet {
         let faucet_port = diem_config::utils::get_available_port();
         let server_port = ports[0].0;
diff --git a/testsuite/forge/src/github.rs b/testsuite/forge/src/github.rs
index ba2fec622b05e..01895600db284 100644
--- a/testsuite/forge/src/github.rs
+++ b/testsuite/forge/src/github.rs
@@ -49,7 +49,7 @@ impl GitHub {
         })?;
         let request = self.client.get(url);
         let response = request
-            .header(USER_AGENT, "diem-cluster-test")
+            .header(USER_AGENT, "diem-forge")
            .send()
            .map_err(|e| format_err!("Failed to query github: {:?}", e))?;
         let response: Vec = response
diff --git a/testsuite/smoke-test/src/workspace_builder.rs b/testsuite/smoke-test/src/workspace_builder.rs
index edba83f68de48..c42f95f26fa97 100644
--- a/testsuite/smoke-test/src/workspace_builder.rs
+++ b/testsuite/smoke-test/src/workspace_builder.rs
@@ -12,24 +12,15 @@ use std::{env, path::PathBuf, process::Command};
 
 const WORKSPACE_BUILD_ERROR_MSG: &str = r#"
     Unable to build all workspace binaries. Cannot continue running tests.
-    Try running 'cargo build --all --bins --exclude cluster-test --exclude diem-node' yourself.
+    Try running 'cargo build --all --bins --exclude diem-node' yourself.
 "#;
 
 // Global flag indicating if all binaries in the workspace have been built.
 static WORKSPACE_BUILT: Lazy<bool> = Lazy::new(|| {
     info!("Building project binaries");
     let args = if cfg!(debug_assertions) {
-        // special case: excluding cluster-test as it exports no-struct-opt feature that poisons everything
         // use get_diem_node_with_failpoints to get diem-node binary
-        vec![
-            "build",
-            "--all",
-            "--bins",
-            "--exclude",
-            "cluster-test",
-            "--exclude",
-            "diem-node",
-        ]
+        vec!["build", "--all", "--bins", "--exclude", "diem-node"]
     } else {
         vec!["build", "--all", "--bins", "--release"]
     };
diff --git a/x.toml b/x.toml
index 665419d7b7790..32aa8b8264496 100644
--- a/x.toml
+++ b/x.toml
@@ -159,7 +159,6 @@ members = [
     "bytecode-verifier-tests",
     "bytecode-verifier-transactional-tests",
     "cli",
-    "cluster-test",
     "testcases",
     "diem-documentation-tool",
     "diem-e2e-tests-replay",
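
Note: the deleted prometheus.rs code above carries one detail worth preserving in whatever replaces it (e.g. Forge's metrics handling): Prometheus range queries can return NaN samples, and a single NaN poisons a naive mean. Below is a minimal standalone sketch of that NaN-skipping average; the function name and signature are illustrative, not taken verbatim from the removed code.

```rust
/// Average of a (timestamp, value) series, ignoring NaN samples.
/// A single NaN would otherwise propagate through the sum and make
/// the whole average NaN. Illustrative sketch, not the deleted impl.
fn avg_ignoring_nan(values: &[(u64, f64)]) -> Option<f64> {
    let mut sum = 0.0;
    let mut count = 0u64;
    for (_ts, v) in values {
        if v.is_nan() {
            // Skip NaN instead of letting it poison the sum.
            continue;
        }
        sum += *v;
        count += 1;
    }
    if count == 0 {
        None // no valid samples in the window
    } else {
        Some(sum / count as f64)
    }
}
```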