Skip to content

Commit

Permalink
Update some metric names for alerts and dashboards
Browse files Browse the repository at this point in the history
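Rename the remaining libra_* consensus metrics to aptos_* (libra -> aptos).
Mempool metrics have no aptos_ prefix, so drop it (aptos_core_mempool_* -> core_mempool_*, aptos_mempool_* -> mempool_*).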

Closes: aptos-labs#187
  • Loading branch information
davidiw authored and aptos-bot committed Mar 15, 2022
1 parent 03ab897 commit 5c83cef
Show file tree
Hide file tree
Showing 11 changed files with 40 additions and 40 deletions.
2 changes: 1 addition & 1 deletion terraform/helm/validator/files/dashboards/components.json
Original file line number Diff line number Diff line change
Expand Up @@ -2021,7 +2021,7 @@
"steppedLine": false,
"targets": [
{
"expr": "aptos_core_mempool_index_size{role=~\"$role\",owner=~\"$owner\",index=\"system_ttl\"}",
"expr": "core_mempool_index_size{role=~\"$role\",owner=~\"$owner\",index=\"system_ttl\"}",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down
Binary file modified terraform/helm/validator/files/dashboards/components.json.gz
Binary file not shown.
8 changes: 4 additions & 4 deletions terraform/helm/validator/files/dashboards/fullnodes.json
Original file line number Diff line number Diff line change
Expand Up @@ -451,14 +451,14 @@
"steppedLine": false,
"targets": [
{
"expr": "aptos_core_mempool_index_size{index=\"system_ttl\", kubernetes_pod_name=~\".*fullnode.*\", owner=~\"$owner\"}",
"expr": "core_mempool_index_size{index=\"system_ttl\", kubernetes_pod_name=~\".*fullnode.*\", owner=~\"$owner\"}",
"interval": "",
"legendFormat": "{{owner}}-{{kubernetes_pod_name}}",
"refId": "A"
},
{
"exemplar": true,
"expr": "aptos_core_mempool_index_size{index=\"system_ttl\", job=~\".*fullnode.*\", owner=~\"$owner\"}",
"expr": "core_mempool_index_size{index=\"system_ttl\", job=~\".*fullnode.*\", owner=~\"$owner\"}",
"hide": false,
"interval": "",
"legendFormat": "{{owner}}-{{job}}",
Expand Down Expand Up @@ -537,14 +537,14 @@
"steppedLine": false,
"targets": [
{
"expr": "aptos_core_mempool_txn_commit_latency_sum{kubernetes_pod_name=~\".*fullnode.*\", owner=~\"$owner\"} / aptos_core_mempool_txn_commit_latency_count{kubernetes_pod_name=~\".*fullnode.*\", owner=~\"$owner\"}",
"expr": "core_mempool_txn_commit_latency_sum{kubernetes_pod_name=~\".*fullnode.*\", owner=~\"$owner\"} / core_mempool_txn_commit_latency_count{kubernetes_pod_name=~\".*fullnode.*\", owner=~\"$owner\"}",
"interval": "",
"legendFormat": "{{owner}}-{{kubernetes_pod_name}}",
"refId": "A"
},
{
"exemplar": true,
"expr": "aptos_core_mempool_txn_commit_latency_sum{job=~\".*fullnode.*\", owner=~\"$owner\"} / aptos_core_mempool_txn_commit_latency_count{job=~\".*fullnode.*\", owner=~\"$owner\"}",
"expr": "core_mempool_txn_commit_latency_sum{job=~\".*fullnode.*\", owner=~\"$owner\"} / core_mempool_txn_commit_latency_count{job=~\".*fullnode.*\", owner=~\"$owner\"}",
"hide": false,
"interval": "",
"legendFormat": "{{owner}}-{{job}}",
Expand Down
Binary file modified terraform/helm/validator/files/dashboards/fullnodes.json.gz
Binary file not shown.
52 changes: 26 additions & 26 deletions terraform/helm/validator/files/dashboards/mempool.json
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@
"steppedLine": false,
"targets": [
{
"expr": "aptos_core_mempool_index_size{index=\"system_ttl\", owner=~\"$owner\", role=~\"$role\"}",
"expr": "core_mempool_index_size{index=\"system_ttl\", owner=~\"$owner\", role=~\"$role\"}",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down Expand Up @@ -296,7 +296,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_core_mempool_removed_txns_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(core_mempool_removed_txns_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -393,7 +393,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (owner, role, network) (aptos_mempool_active_upstream_peers_count{owner=~\"$owner\", role=~\"$role\"})",
"expr": "sum by (owner, role, network) (mempool_active_upstream_peers_count{owner=~\"$owner\", role=~\"$role\"})",
"interval": "",
"legendFormat": "{{owner}}:{{role}} ({{network}} network)",
"refId": "A"
Expand Down Expand Up @@ -601,7 +601,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_service_latency_ms_count{result=\"fail\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_service_latency_ms_count{result=\"fail\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}} - {{type}}",
"refId": "A"
Expand Down Expand Up @@ -698,7 +698,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_db_error_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_db_error_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -795,7 +795,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_json_rpc_callback_fail_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_json_rpc_callback_fail_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -892,7 +892,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_network_send_fail_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_network_send_fail_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -989,7 +989,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_vm_reconfig_update_fail_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_vm_reconfig_update_fail_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -1086,7 +1086,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_core_mempool_invariant_violated_count{owner=\"$owner\", role=\"$role\"}[$interval])",
"expr": "rate(mempool_core_mempool_invariant_violated_count{owner=\"$owner\", role=\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -1183,7 +1183,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_unexpected_network_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_unexpected_network_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}} <- {{network}}:{{peer}}",
"refId": "A"
Expand Down Expand Up @@ -1294,7 +1294,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_core_mempool_gc_latency_count{type=\"client_expiration\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(core_mempool_gc_latency_count{type=\"client_expiration\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}} : {{status}}",
"refId": "A"
Expand Down Expand Up @@ -1391,7 +1391,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_core_mempool_gc_latency_count{type=\"system_ttl\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(core_mempool_gc_latency_count{type=\"system_ttl\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}} : {{status}}",
"refId": "A"
Expand Down Expand Up @@ -1488,7 +1488,7 @@
"steppedLine": false,
"targets": [
{
"expr": "aptos_core_mempool_index_size{index=\"parking_lot\", owner=~\"$owner\", role=~\"$role\"}",
"expr": "core_mempool_index_size{index=\"parking_lot\", owner=~\"$owner\", role=~\"$role\"}",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -1585,7 +1585,7 @@
"steppedLine": false,
"targets": [
{
"expr": "aptos_core_mempool_index_size{index=\"timeline\", owner=~\"$owner\", role=~\"$role\"}",
"expr": "core_mempool_index_size{index=\"timeline\", owner=~\"$owner\", role=~\"$role\"}",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -1682,7 +1682,7 @@
"steppedLine": false,
"targets": [
{
"expr": "aptos_core_mempool_index_size{index=\"priority\", role=\"validator\", owner=~\"$owner\"}",
"expr": "core_mempool_index_size{index=\"priority\", role=\"validator\", owner=~\"$owner\"}",
"interval": "",
"legendFormat": "{{owner}}",
"refId": "A"
Expand Down Expand Up @@ -1876,7 +1876,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_core_mempool_gc_event_count{type=\"system_ttl\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(core_mempool_gc_event_count{type=\"system_ttl\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -1973,7 +1973,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_core_mempool_gc_latency_sum{owner=~\"$owner\", role=~\"$role\"}[$interval])/rate(aptos_core_mempool_gc_latency_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(core_mempool_gc_latency_sum{owner=~\"$owner\", role=~\"$role\"}[$interval])/rate(core_mempool_gc_latency_count{owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -2085,7 +2085,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_service_transactions_sum{type=\"commit_accepted\", owner=~\"$owner\", role=~\"$role\"}[$interval])\n",
"expr": "rate(mempool_service_transactions_sum{type=\"commit_accepted\", owner=~\"$owner\", role=~\"$role\"}[$interval])\n",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand Down Expand Up @@ -2186,7 +2186,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_service_latency_ms_sum{type=\"commit_accepted\", owner=~\"$owner\", role=~\"$role\"}[$interval]) / rate(aptos_mempool_service_latency_ms_count{type=\"commit_accepted\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_service_latency_ms_sum{type=\"commit_accepted\", owner=~\"$owner\", role=~\"$role\"}[$interval]) / rate(mempool_service_latency_ms_count{type=\"commit_accepted\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand Down Expand Up @@ -2285,7 +2285,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_service_transactions_sum{type=\"commit_rejected\", owner=~\"$owner\", role=~\"$role\"}[$interval])\n",
"expr": "rate(mempool_service_transactions_sum{type=\"commit_rejected\", owner=~\"$owner\", role=~\"$role\"}[$interval])\n",
"interval": "",
"legendFormat": "{{owner}}",
"refId": "A"
Expand Down Expand Up @@ -2382,7 +2382,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_service_latency_ms_sum{type=\"commit_rejected\", owner=~\"$owner\"}[$interval])/rate(aptos_mempool_service_latency_ms_count{type=\"commit_rejected\", owner=~\"$owner\"}[$interval])",
"expr": "rate(mempool_service_latency_ms_sum{type=\"commit_rejected\", owner=~\"$owner\"}[$interval])/rate(mempool_service_latency_ms_count{type=\"commit_rejected\", owner=~\"$owner\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}",
"refId": "A"
Expand Down Expand Up @@ -2480,7 +2480,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_service_transactions_sum{type=\"get_block\", owner=~\"$owner\"}[$interval]) / rate(aptos_mempool_service_transactions_count{type=\"get_block\", owner=~\"$owner\"}[$interval])",
"expr": "rate(mempool_service_transactions_sum{type=\"get_block\", owner=~\"$owner\"}[$interval]) / rate(mempool_service_transactions_count{type=\"get_block\", owner=~\"$owner\"}[$interval])",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand Down Expand Up @@ -2580,7 +2580,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_service_latency_ms_sum{type=\"get_block\", owner=~\"$owner\"}[$interval]) / rate(aptos_mempool_service_latency_ms_count{type=\"get_block\", owner=~\"$owner\"}[$interval])",
"expr": "rate(mempool_service_latency_ms_sum{type=\"get_block\", owner=~\"$owner\"}[$interval]) / rate(mempool_service_latency_ms_count{type=\"get_block\", owner=~\"$owner\"}[$interval])",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand Down Expand Up @@ -2793,7 +2793,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_bounded_executor_spawn_latency_sum{stage=\"start\", owner=~\"$owner\", role=~\"$role\"}[$interval]) / rate(aptos_mempool_bounded_executor_spawn_latency_count{stage=\"start\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_bounded_executor_spawn_latency_sum{stage=\"start\", owner=~\"$owner\", role=~\"$role\"}[$interval]) / rate(mempool_bounded_executor_spawn_latency_count{stage=\"start\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}} - {{task}}",
"refId": "A"
Expand Down Expand Up @@ -3202,7 +3202,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_process_txn_breakdown_latency_sum{portion=\"vm_validation\", owner=~\"$owner\", role=~\"$role\"}[$interval]) / rate(aptos_mempool_process_txn_breakdown_latency_count{portion=\"vm_validation\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_process_txn_breakdown_latency_sum{portion=\"vm_validation\", owner=~\"$owner\", role=~\"$role\"}[$interval]) / rate(mempool_process_txn_breakdown_latency_count{portion=\"vm_validation\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down Expand Up @@ -3299,7 +3299,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_mempool_process_txn_breakdown_latency_sum{portion=\"storage_fetch\", owner=~\"$owner\", role=~\"$role\"}[$interval]) / rate(aptos_mempool_process_txn_breakdown_latency_count{portion=\"storage_fetch\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"expr": "rate(mempool_process_txn_breakdown_latency_sum{portion=\"storage_fetch\", owner=~\"$owner\", role=~\"$role\"}[$interval]) / rate(mempool_process_txn_breakdown_latency_count{portion=\"storage_fetch\", owner=~\"$owner\", role=~\"$role\"}[$interval])",
"interval": "",
"legendFormat": "{{owner}}:{{role}}",
"refId": "A"
Expand Down
Binary file modified terraform/helm/validator/files/dashboards/mempool.json.gz
Binary file not shown.
4 changes: 2 additions & 2 deletions terraform/helm/validator/files/dashboards/overview.json
Original file line number Diff line number Diff line change
Expand Up @@ -1610,7 +1610,7 @@
"steppedLine": false,
"targets": [
{
"expr": "aptos_core_mempool_index_size{index=\"system_ttl\", owner=~\"$owner\"}",
"expr": "core_mempool_index_size{index=\"system_ttl\", owner=~\"$owner\"}",
"format": "time_series",
"hide": false,
"instant": false,
Expand Down Expand Up @@ -1701,7 +1701,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_core_mempool_txn_commit_latency_sum{stage=~\"commit_accepted|commit_rejected\", owner=~\"$owner\"}[1m])/rate(aptos_core_mempool_txn_commit_latency_count{stage=~\"commit_accepted|commit_rejected\", owner=~\"$owner\"}[1m])",
"expr": "rate(core_mempool_txn_commit_latency_sum{stage=~\"commit_accepted|commit_rejected\", owner=~\"$owner\"}[1m])/rate(core_mempool_txn_commit_latency_count{stage=~\"commit_accepted|commit_rejected\", owner=~\"$owner\"}[1m])",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand Down
Binary file modified terraform/helm/validator/files/dashboards/overview.json.gz
Binary file not shown.
4 changes: 2 additions & 2 deletions terraform/helm/validator/files/dashboards/validator.json
Original file line number Diff line number Diff line change
Expand Up @@ -1060,7 +1060,7 @@
"steppedLine": false,
"targets": [
{
"expr": "rate(aptos_core_mempool_txn_commit_latency_sum{stage=~\"commit_accepted|commit_rejected\", owner=~\"$owner\"}[1m])/rate(aptos_core_mempool_txn_commit_latency_count{stage=~\"commit_accepted|commit_rejected\", owner=~\"$owner\"}[1m])",
"expr": "rate(core_mempool_txn_commit_latency_sum{stage=~\"commit_accepted|commit_rejected\", owner=~\"$owner\"}[1m])/rate(core_mempool_txn_commit_latency_count{stage=~\"commit_accepted|commit_rejected\", owner=~\"$owner\"}[1m])",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand Down Expand Up @@ -1159,7 +1159,7 @@
"steppedLine": false,
"targets": [
{
"expr": "aptos_core_mempool_index_size{index=\"system_ttl\", owner=~\"$owner\"}",
"expr": "core_mempool_index_size{index=\"system_ttl\", owner=~\"$owner\"}",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand Down
Binary file modified terraform/helm/validator/files/dashboards/validator.json.gz
Binary file not shown.
10 changes: 5 additions & 5 deletions terraform/helm/validator/files/rules/alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ groups:
summary: "The block commit rate is low"
annotations:
- alert: High local timeout rate
expr: rate(libra_consensus_timeout_count{role="validator"}[1m]) > 0.5
expr: rate(aptos_consensus_timeout_count{role="validator"}[1m]) > 0.5
for: 20m
labels:
severity: warning
summary: "Consensus timeout rate is high"
annotations:
- alert: High consensus error rate
expr: rate(libra_consensus_error_count{role="validator"}[1m]) / on (role) rate(consensus_duration_count{op='main_loop', role="validator"}[1m]) > 0.25
expr: rate(aptos_consensus_error_count{role="validator"}[1m]) / on (role) rate(consensus_duration_count{op='main_loop', role="validator"}[1m]) > 0.25
for: 20m
labels:
severity: warning
Expand All @@ -42,21 +42,21 @@ groups:

# Mempool alerts
- alert: Mempool has no active upstream peers
expr: (sum by (owner, kubernetes_pod_name) (aptos_mempool_active_upstream_peers_count)) == 0
expr: (sum by (owner, kubernetes_pod_name) (mempool_active_upstream_peers_count)) == 0
for: 3m
labels:
severity: error
summary: "Mempool has no active upstream peers (unable to forward transactions to anyone!)"
annotations:
- alert: Mempool is at >80% capacity
expr: aptos_core_mempool_index_size{index="system_ttl"} > 800000 # assumes default mempool size 1_000_000
expr: core_mempool_index_size{index="system_ttl"} > 800000 # assumes default mempool size 1_000_000
for: 5m
labels:
severity: warning
summary: "Mempool is at >80% capacity (it may soon become full!)"
annotations:
- alert: Mempool is growing at a significant rate
expr: rate(aptos_core_mempool_index_size{index="system_ttl"}[1m]) > 30000
expr: rate(core_mempool_index_size{index="system_ttl"}[1m]) > 30000
for: 10m
labels:
severity: warning
Expand Down

0 comments on commit 5c83cef

Please sign in to comment.