forked from aptos-labs/aptos-core
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Closes: aptos-labs#878
- Loading branch information
Showing
10 changed files
with
877 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
apiVersion: v2 | ||
name: aptos-monitoring | ||
version: 0.1.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Severities: info, warning, [error, critical] | ||
# Last 2 items are high urgency | ||
|
||
global: | ||
|
||
route: | ||
group_by: [...] # TBD | ||
|
||
# When a new group of alerts is created by an incoming alert, wait at | ||
# least 'group_wait' to send the initial notification. | ||
# This ensures that multiple alerts for the same group that start | ||
# firing shortly after one another are batched together in the first | ||
# notification. | ||
group_wait: 30s | ||
|
||
# When the first notification was sent, wait 'group_interval' to send a batch | ||
# of new alerts that started firing for that group. | ||
group_interval: 5m | ||
|
||
# If an alert has successfully been sent, wait 'repeat_interval' to | ||
# resend it. | ||
repeat_interval: 10m | ||
|
||
# A default receiver | ||
receiver: 'default' | ||
|
||
# The child route trees. | ||
# https://prometheus.io/docs/alerting/latest/configuration/#route | ||
routes: {{ .Values.monitoring.alertmanager.alertRouteTrees | toJson }} | ||
|
||
# A list of notification receivers | ||
# https://prometheus.io/docs/alerting/latest/configuration/#receiver | ||
receivers: {{ .Values.monitoring.alertmanager.alertReceivers | toJson }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../../testnet/testnet/files/dashboards |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
{{- if .Values.monitoring.grafana.googleAuth }} | ||
|
||
[auth] | ||
# Set to true to disable (hide) the login form, useful if you use OAuth | ||
# NOTE(review): this line is rendered whenever | ||
# .Values.monitoring.grafana.googleAuth is set, while the [auth.google] | ||
# section is only rendered when .Values.monitoring.grafana.config is | ||
# non-empty. Confirm both values are always set together; otherwise the | ||
# login form is hidden without Google OAuth being configured. | ||
disable_login_form = true | ||
|
||
{{- with .Values.monitoring.grafana.config }} | ||
[auth.google] | ||
enabled = true | ||
client_id = {{ .client_id }} | ||
client_secret = {{ .client_secret }} | ||
scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email | ||
auth_url = https://accounts.google.com/o/oauth2/auth | ||
token_url = https://accounts.google.com/o/oauth2/token | ||
allowed_domains = {{ .allowed_domains }} | ||
allow_sign_up = true | ||
{{- end }} | ||
|
||
[users] | ||
auto_assign_org_role = Editor | ||
|
||
[server] | ||
protocol = http | ||
root_url = http://mon.{{ .Values.service.domain }}/grafana | ||
serve_from_sub_path = true | ||
|
||
{{- else }} | ||
[auth.anonymous] | ||
enabled = true | ||
|
||
# Role for unauthenticated users, other valid values are `Editor` and `Admin` | ||
org_role = Editor | ||
|
||
{{- end }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
global: | ||
scrape_interval: 15s | ||
evaluation_interval: 15s | ||
external_labels: | ||
chain_name: {{ .Values.chain.name }} | ||
|
||
# Alertmanager configuration | ||
alerting: | ||
alertmanagers: | ||
- static_configs: | ||
- targets: | ||
- localhost:9093 | ||
|
||
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. | ||
rule_files: | ||
{{- range $path, $_ := .Files.Glob "files/rules/*.yml" }} | ||
- {{ base $path }} | ||
{{- end }} | ||
|
||
scrape_configs: | ||
{{ if .Values.monitoring.fullKubernetesScrape }} | ||
- job_name: 'kubernetes-apiservers' | ||
scheme: https | ||
tls_config: | ||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt | ||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token | ||
|
||
kubernetes_sd_configs: | ||
- role: endpoints | ||
|
||
# Keep only the default/kubernetes service endpoints for the https port. This | ||
# will add targets for each API server for which Kubernetes adds an endpoint to | ||
# the default/kubernetes service. | ||
relabel_configs: | ||
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] | ||
action: keep | ||
regex: default;kubernetes;https | ||
- target_label: owner | ||
replacement: {{ .Values.validator.name }} | ||
{{ end }} | ||
|
||
- job_name: 'kubernetes-nodes' | ||
scheme: https | ||
tls_config: | ||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt | ||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token | ||
|
||
kubernetes_sd_configs: | ||
- role: node | ||
|
||
{{ if not .Values.monitoring.fullKubernetesScrape }} | ||
metric_relabel_configs: | ||
- source_labels: [namespace] | ||
action: keep | ||
regex: "{{ .Release.Namespace }}" | ||
{{ end }} | ||
|
||
relabel_configs: | ||
- action: labelmap | ||
regex: __meta_kubernetes_node_label_(.+) | ||
- target_label: __address__ | ||
replacement: kubernetes.default.svc:443 | ||
- source_labels: [__meta_kubernetes_node_name] | ||
regex: (.+) | ||
target_label: __metrics_path__ | ||
replacement: /api/v1/nodes/${1}/proxy/metrics | ||
- target_label: owner | ||
replacement: {{ .Values.validator.name }} | ||
|
||
- job_name: 'kubernetes-cadvisor' | ||
scheme: https | ||
tls_config: | ||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt | ||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token | ||
|
||
kubernetes_sd_configs: | ||
- role: node | ||
|
||
relabel_configs: | ||
- target_label: __address__ | ||
replacement: kubernetes.default.svc:443 | ||
- source_labels: [__meta_kubernetes_node_name] | ||
regex: (.+) | ||
target_label: __metrics_path__ | ||
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor | ||
- target_label: owner | ||
replacement: {{ .Values.validator.name }} | ||
|
||
{{ if not .Values.monitoring.fullKubernetesScrape }} | ||
metric_relabel_configs: | ||
- source_labels: [namespace, pod] | ||
action: keep | ||
regex: "{{ .Release.Namespace }};{{ .Release.Name }}-.*" | ||
{{ end }} | ||
|
||
{{ if .Values.monitoring.useKubeStateMetrics }} | ||
- job_name: 'kube-state-metrics' | ||
static_configs: | ||
- targets: ['kube-state-metrics.default.svc.cluster.local:8080'] | ||
labels: | ||
owner: {{ .Values.validator.name }} | ||
{{ end }} | ||
|
||
- job_name: "aptos-procs" | ||
|
||
kubernetes_sd_configs: | ||
- role: pod | ||
|
||
relabel_configs: | ||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_part_of] | ||
action: keep | ||
regex: "{{ .Chart.Name }}" | ||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] | ||
action: keep | ||
regex: "{{ .Release.Name }}" | ||
- source_labels: [__meta_kubernetes_pod_container_port_number] | ||
action: keep | ||
regex: "9101" | ||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] | ||
action: replace | ||
target_label: role | ||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] | ||
action: replace | ||
target_label: instance | ||
- source_labels: [__meta_kubernetes_pod_name] | ||
action: replace | ||
target_label: kubernetes_pod_name | ||
- target_label: owner | ||
replacement: {{ .Values.validator.name }} | ||
|
||
- job_name: "pushgateway" | ||
static_configs: | ||
- targets: ['localhost:9091'] | ||
labels: | ||
owner: {{ .Values.validator.name }} | ||
|
||
{{ if .Values.vault.prometheusTarget }} | ||
# Scrape job for Vault; only rendered when .Values.vault.prometheusTarget is set. | ||
# Metrics are fetched over HTTPS from /v1/sys/metrics with format=prometheus. | ||
- job_name: "vault" | ||
static_configs: | ||
- targets: ['{{ .Values.vault.prometheusTarget }}'] | ||
labels: | ||
owner: {{ .Values.validator.name }} | ||
scheme: "https" | ||
metrics_path: "/v1/sys/metrics" | ||
params: | ||
format: ["prometheus"] | ||
tls_config: | ||
# NOTE(review): this disables validation of the target's TLS certificate. | ||
# Confirm this is intended rather than supplying a ca_file for the Vault CA. | ||
insecure_skip_verify: true | ||
{{ end }} | ||
|
||
{{ if .Values.monitoring.prometheus.remote_write.enabled }} | ||
{{ with .Values.monitoring.prometheus.remote_write }} | ||
remote_write: | ||
- url: {{ .url }} | ||
sigv4: | ||
region: {{ .region }} | ||
queue_config: | ||
max_samples_per_send: 1000 | ||
max_shards: 200 | ||
capacity: 2500 | ||
{{ end }} | ||
{{ end }} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
groups: | ||
- name: "Aptos alerts" | ||
rules: | ||
# consensus | ||
- alert: Zero Block Commit Rate | ||
expr: rate(aptos_consensus_last_committed_round{role="validator"}[1m]) == 0 OR absent(aptos_consensus_last_committed_round{role="validator"}) | ||
for: 20m | ||
labels: | ||
severity: error | ||
summary: "The block commit rate is low" | ||
annotations: | ||
- alert: High local timeout rate | ||
expr: rate(aptos_consensus_timeout_count{role="validator"}[1m]) > 0.5 | ||
for: 20m | ||
labels: | ||
severity: warning | ||
summary: "Consensus timeout rate is high" | ||
annotations: | ||
- alert: High consensus error rate | ||
expr: rate(aptos_consensus_error_count{role="validator"}[1m]) / on (role) rate(consensus_duration_count{op='main_loop', role="validator"}[1m]) > 0.25 | ||
for: 20m | ||
labels: | ||
severity: warning | ||
summary: "Consensus error rate is high" | ||
annotations: | ||
|
||
# State sync alerts | ||
- alert: State sync is not making progress | ||
expr: rate(aptos_state_sync_version{type="synced"}[5m]) == 0 OR absent(aptos_state_sync_version{type="synced"}) | ||
for: 5m | ||
labels: | ||
severity: error | ||
summary: "State sync is not making progress (i.e., it is not keeping up with the head of the blockchain)" | ||
annotations: | ||
|
||
# Mempool alerts | ||
- alert: Mempool has no active upstream peers | ||
expr: (sum by (owner, kubernetes_pod_name) (mempool_active_upstream_peers_count)) == 0 | ||
for: 3m | ||
labels: | ||
severity: error | ||
summary: "Mempool has no active upstream peers (unable to forward transactions to anyone!)" | ||
annotations: | ||
- alert: Mempool is at >80% capacity | ||
expr: core_mempool_index_size{index="system_ttl"} > 800000 # assumes default mempool size 1_000_000 | ||
for: 5m | ||
labels: | ||
severity: warning | ||
summary: "Mempool is at >80% capacity (it may soon become full!)" | ||
annotations: | ||
# Warns when the expression below stays above 30000 for 10 minutes. | ||
# NOTE(review): rate() is meant for counters, but core_mempool_index_size is | ||
# compared directly against a capacity in the ">80% capacity" rule, which | ||
# suggests it is a gauge that can also shrink. Confirm whether deriv() or | ||
# delta() with a retuned threshold is what is wanted here. | ||
- alert: Mempool is growing at a significant rate | ||
expr: rate(core_mempool_index_size{index="system_ttl"}[1m]) > 30000 | ||
for: 10m | ||
labels: | ||
severity: warning | ||
summary: "Mempool is growing at a significant rate (it may soon become full!)" | ||
annotations: | ||
|
||
# Networking alerts | ||
- alert: Validator Connected Peers | ||
expr: 0 == min(aptos_network_peers{state="connected", role_type="validator", role="validator"}) | ||
for: 15m | ||
labels: | ||
severity: error | ||
summary: "Validator node has zero connected peers" | ||
annotations: | ||
|
||
# Storage core metrics | ||
- alert: Validator Low Disk Space | ||
expr: (kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*validator-e.*"} - kubelet_volume_stats_used_bytes) / 1024 / 1024 / 1024 < 50 | ||
for: 5m | ||
labels: | ||
severity: warning | ||
summary: "Less than 50 GB of free space on Validator DB volume." | ||
annotations: | ||
- alert: Validator Very Low Disk Space | ||
expr: (kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*validator-e.*"} - kubelet_volume_stats_used_bytes) / 1024 / 1024 / 1024 < 20 | ||
for: 5m | ||
labels: | ||
severity: critical | ||
summary: "Less than 20 GB of free space on Validator DB volume." | ||
annotations: | ||
- alert: AptosDB API Success Rate | ||
expr: sum by(owner, kubernetes_pod_name) (rate(aptos_storage_api_latency_seconds_count{result="Ok"}[1m])) / sum by(owner, kubernetes_pod_name) (rate(aptos_storage_api_latency_seconds_count[1m])) < 0.99 # 99% | ||
for: 5m | ||
labels: | ||
severity: error | ||
summary: "AptosDB API success rate dropped." | ||
annotations: | ||
- alert: RocksDB Read Latency | ||
expr: sum by (owner, kubernetes_pod_name) (rate(aptos_schemadb_get_latency_seconds_sum[1m])) / sum by (owner, kubernetes_pod_name) (rate(aptos_schemadb_get_latency_seconds_count[1m])) > 0.001 # 1 millisecond | ||
for: 5m | ||
labels: | ||
severity: warning | ||
summary: "RocksDB read latency raised." | ||
annotations: | ||
|
||
{{- if .Values.backup.enable }} | ||
# DB Backup and Backup Verify | ||
- alert: Backup Coordinator Liveness | ||
# It's okay if one of these metrics stops changing but the pushgateway still reports the old value, since the alerts following this will detect the staleness. In fact this one is just to detect the absence of these metrics. | ||
expr: absent(irate(aptos_backup_metadata_num_file_downloads[1m]) > 0) and on() (absent(aptos_db_backup_coordinator_heartbeat_timestamp_s) or absent(aptos_db_backup_coordinator_epoch_ending_epoch) or absent(aptos_db_backup_coordinator_transaction_version) or absent(aptos_db_backup_coordinator_state_snapshot_version)) | ||
for: 10m | ||
labels: | ||
severity: warning | ||
summary: "Backup coordinator or one of its work streams is not heartbeating." | ||
annotations: | ||
- alert: Epoch Ending Backup Timeliness | ||
expr: max(aptos_storage_next_block_epoch) by(owner) - on (owner) aptos_db_backup_coordinator_epoch_ending_epoch > 1 # "==1" when caught up. | ||
for: 10m | ||
labels: | ||
severity: warning | ||
summary: "Epoch ending backup is not keeping up." | ||
annotations: | ||
- alert: Transaction Backup Timeliness | ||
expr: max(aptos_storage_latest_transaction_version) by(owner) - on (owner) aptos_db_backup_coordinator_transaction_version > {{ .Values.backup.config.transaction_batch_size }} # more than txn_backup_batch_size | ||
for: 10m | ||
labels: | ||
severity: warning | ||
summary: "Transaction backup is not keeping up." | ||
annotations: | ||
- alert: State Snapshot Backup Timeliness | ||
expr: max(aptos_storage_latest_transaction_version) by(owner) - on (owner) aptos_db_backup_coordinator_state_snapshot_version > {{ .Values.backup.config.state_snapshot_interval }} # more than state_snapshot_interval | ||
for: 10m | ||
labels: | ||
severity: warning | ||
summary: "State snapshot backup is not keeping up." | ||
annotations: | ||
- alert: Backup Verify Scheduling | ||
expr: absent(time() - max_over_time(aptos_db_backup_verify_coordinator_start_timestamp_s[1w]) < 86400) # assuming the verifier schedule is per day | ||
for: 10m | ||
labels: | ||
severity: warning | ||
summary: "Backup Verify not started as scheduled." | ||
annotations: | ||
- alert: Backup Verify Success Timeliness | ||
expr: absent(max_over_time(aptos_db_backup_verify_coordinator_succeed_timestamp_s[1w]) - max_over_time(aptos_db_backup_verify_coordinator_start_timestamp_s[1w]) > 0) | ||
for: 20h # assuming backup verify succeeds in 20 hours | ||
labels: | ||
severity: warning | ||
summary: "Backup Verify didn't finish succeeded in time." | ||
annotations: | ||
{{- end }} | ||
|
||
# Logging alerts | ||
- alert: Logs Being Dropped | ||
expr: 1 < (rate(aptos_struct_log_queue_error[1m]) + rate(aptos_struct_log_send_error[1m])) | ||
for: 5m | ||
labels: | ||
severity: warning | ||
summary: "Logs being dropped" | ||
annotations: | ||
description: "Logging Transmit Error rate is high \ | ||
check the logging dashboard and \ | ||
there may be network issues, downstream throughput issues, or something wrong with Vector \ | ||
TODO: Runbook" |
Oops, something went wrong.