[helm] add monitoring chart
sherry-x authored and aptos-bot committed May 10, 2022
1 parent d3751f7 commit 729ebdb
Showing 10 changed files with 877 additions and 0 deletions.
3 changes: 3 additions & 0 deletions terraform/helm/monitoring/Chart.yaml
@@ -0,0 +1,3 @@
apiVersion: v2
name: aptos-monitoring
version: 0.1.0
33 changes: 33 additions & 0 deletions terraform/helm/monitoring/files/alertmanager.yml
@@ -0,0 +1,33 @@
# Severities: info, warning, [error, critical]
# The last two are high urgency

global:

route:
  group_by: [...] # TBD

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first notification.
  group_wait: 30s

  # After the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has already been sent successfully, wait 'repeat_interval'
  # before resending it.
  repeat_interval: 10m

  # The default receiver
  receiver: 'default'

  # The child route trees.
  # https://prometheus.io/docs/alerting/latest/configuration/#route
  routes: {{ .Values.monitoring.alertmanager.alertRouteTrees | toJson }}

# A list of notification receivers
# https://prometheus.io/docs/alerting/latest/configuration/#receiver
receivers: {{ .Values.monitoring.alertmanager.alertReceivers | toJson }}
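
Both the route trees and the receivers are injected from chart values, so operators can define their own routing without editing this template. A minimal sketch of what the corresponding values.yaml section could look like, assuming a Slack receiver purely as an example (the route, receiver names, and webhook URL are hypothetical, not taken from this chart):

# Hypothetical values.yaml sketch -- only the key names under
# monitoring.alertmanager come from the template above; the route and
# receivers themselves are illustrative.
monitoring:
  alertmanager:
    alertRouteTrees:
      - match:
          severity: critical
        receiver: 'critical'      # send high-urgency alerts to their own receiver
    alertReceivers:
      - name: 'default'           # matches the top-level route's receiver
      - name: 'critical'
        slack_configs:
          - api_url: https://hooks.slack.com/services/EXAMPLE
            channel: '#alerts'
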
1 change: 1 addition & 0 deletions terraform/helm/monitoring/files/dashboards
34 changes: 34 additions & 0 deletions terraform/helm/monitoring/files/grafana.ini
@@ -0,0 +1,34 @@
{{- if .Values.monitoring.grafana.googleAuth }}

[auth]
# Set to true to disable (hide) the login form, useful if you use OAuth
disable_login_form = true

{{- with .Values.monitoring.grafana.config }}
[auth.google]
enabled = true
client_id = {{ .client_id }}
client_secret = {{ .client_secret }}
scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email
auth_url = https://accounts.google.com/o/oauth2/auth
token_url = https://accounts.google.com/o/oauth2/token
allowed_domains = {{ .allowed_domains }}
allow_sign_up = true
{{- end }}

[users]
auto_assign_org_role = Editor

[server]
protocol = http
root_url = http://mon.{{ .Values.service.domain }}/grafana
serve_from_sub_path = true

{{- else }}
[auth.anonymous]
enabled = true

# Role for unauthenticated users; other valid values are `Viewer` and `Admin`
org_role = Editor

{{- end }}
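
The Google OAuth branch is driven entirely by values; when monitoring.grafana.googleAuth is unset, Grafana falls back to anonymous Editor access. A hypothetical values.yaml sketch for the OAuth case (the client ID, secret, and domains are placeholders):

# Hypothetical values.yaml sketch; all credential values are placeholders.
monitoring:
  grafana:
    googleAuth: true
    config:
      client_id: example-id.apps.googleusercontent.com
      client_secret: example-secret
      allowed_domains: example.com
service:
  domain: mycompany.example      # used to build root_url above
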
163 changes: 163 additions & 0 deletions terraform/helm/monitoring/files/prometheus.yml
@@ -0,0 +1,163 @@
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    chain_name: {{ .Values.chain.name }}

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
{{- range $path, $_ := .Files.Glob "files/rules/*.yml" }}
  - {{ base $path }}
{{- end }}

scrape_configs:
{{ if .Values.monitoring.fullKubernetesScrape }}
  - job_name: 'kubernetes-apiservers'
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

    kubernetes_sd_configs:
      - role: endpoints

    # Keep only the default/kubernetes service endpoints for the https port.
    # This adds a target for each API server for which Kubernetes adds an
    # endpoint to the default/kubernetes service.
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
      - target_label: owner
        replacement: {{ .Values.validator.name }}
{{ end }}

  - job_name: 'kubernetes-nodes'
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

    kubernetes_sd_configs:
      - role: node

{{ if not .Values.monitoring.fullKubernetesScrape }}
    metric_relabel_configs:
      - source_labels: [namespace]
        action: keep
        regex: "{{ .Release.Namespace }}"
{{ end }}

    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics
      - target_label: owner
        replacement: {{ .Values.validator.name }}

  - job_name: 'kubernetes-cadvisor'
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

    kubernetes_sd_configs:
      - role: node

    relabel_configs:
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
      - target_label: owner
        replacement: {{ .Values.validator.name }}

{{ if not .Values.monitoring.fullKubernetesScrape }}
    metric_relabel_configs:
      - source_labels: [namespace, pod]
        action: keep
        regex: "{{ .Release.Namespace }};{{ .Release.Name }}-.*"
{{ end }}

{{ if .Values.monitoring.useKubeStateMetrics }}
  - job_name: 'kube-state-metrics'
    static_configs:
      - targets: ['kube-state-metrics.default.svc.cluster.local:8080']
        labels:
          owner: {{ .Values.validator.name }}
{{ end }}

  - job_name: "aptos-procs"

    kubernetes_sd_configs:
      - role: pod

    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_part_of]
        action: keep
        regex: "{{ .Chart.Name }}"
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
        action: keep
        regex: "{{ .Release.Name }}"
      - source_labels: [__meta_kubernetes_pod_container_port_number]
        action: keep
        regex: "9101"
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        action: replace
        target_label: role
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        action: replace
        target_label: instance
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
      - target_label: owner
        replacement: {{ .Values.validator.name }}

  - job_name: "pushgateway"
    static_configs:
      - targets: ['localhost:9091']
        labels:
          owner: {{ .Values.validator.name }}

{{ if .Values.vault.prometheusTarget }}
  - job_name: "vault"
    static_configs:
      - targets: ['{{ .Values.vault.prometheusTarget }}']
        labels:
          owner: {{ .Values.validator.name }}
    scheme: "https"
    metrics_path: "/v1/sys/metrics"
    params:
      format: ["prometheus"]
    tls_config:
      insecure_skip_verify: true
{{ end }}

{{ if .Values.monitoring.prometheus.remote_write.enabled }}
{{ with .Values.monitoring.prometheus.remote_write }}
remote_write:
  - url: {{ .url }}
    sigv4:
      region: {{ .region }}
    queue_config:
      max_samples_per_send: 1000
      max_shards: 200
      capacity: 2500
{{ end }}
{{ end }}
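
The remote_write block signs requests with SigV4, which suggests an AWS-hosted endpoint such as Amazon Managed Service for Prometheus. A hypothetical values.yaml sketch enabling it (the workspace URL and region are placeholders):

# Hypothetical values.yaml sketch; the URL and region are placeholders.
monitoring:
  prometheus:
    remote_write:
      enabled: true
      url: https://aps-workspaces.us-west-2.amazonaws.com/workspaces/ws-EXAMPLE/api/v1/remote_write
      region: us-west-2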

156 changes: 156 additions & 0 deletions terraform/helm/monitoring/files/rules/alerts.yml
@@ -0,0 +1,156 @@
groups:
  - name: "Aptos alerts"
    rules:
      # consensus
      - alert: Zero Block Commit Rate
        expr: rate(aptos_consensus_last_committed_round{role="validator"}[1m]) == 0 OR absent(aptos_consensus_last_committed_round{role="validator"})
        for: 20m
        labels:
          severity: error
          summary: "The block commit rate is low"
        annotations:
      - alert: High local timeout rate
        expr: rate(aptos_consensus_timeout_count{role="validator"}[1m]) > 0.5
        for: 20m
        labels:
          severity: warning
          summary: "Consensus timeout rate is high"
        annotations:
      - alert: High consensus error rate
        expr: rate(aptos_consensus_error_count{role="validator"}[1m]) / on (role) rate(consensus_duration_count{op='main_loop', role="validator"}[1m]) > 0.25
        for: 20m
        labels:
          severity: warning
          summary: "Consensus error rate is high"
        annotations:

      # State sync alerts
      - alert: State sync is not making progress
        expr: rate(aptos_state_sync_version{type="synced"}[5m]) == 0 OR absent(aptos_state_sync_version{type="synced"})
        for: 5m
        labels:
          severity: error
          summary: "State sync is not making progress (i.e., it is not keeping up with the head of the blockchain)"
        annotations:

      # Mempool alerts
      - alert: Mempool has no active upstream peers
        expr: (sum by (owner, kubernetes_pod_name) (mempool_active_upstream_peers_count)) == 0
        for: 3m
        labels:
          severity: error
          summary: "Mempool has no active upstream peers (unable to forward transactions to anyone!)"
        annotations:
      - alert: Mempool is at >80% capacity
        expr: core_mempool_index_size{index="system_ttl"} > 800000 # assumes default mempool size 1_000_000
        for: 5m
        labels:
          severity: warning
          summary: "Mempool is at >80% capacity (it may soon become full!)"
        annotations:
      - alert: Mempool is growing at a significant rate
        expr: rate(core_mempool_index_size{index="system_ttl"}[1m]) > 30000
        for: 10m
        labels:
          severity: warning
          summary: "Mempool is growing at a significant rate (it may soon become full!)"
        annotations:

      # Networking alerts
      - alert: Validator Connected Peers
        expr: 0 == min(aptos_network_peers{state="connected", role_type="validator", role="validator"})
        for: 15m
        labels:
          severity: error
          summary: "Validator node has zero connected peers"
        annotations:

      # Storage core metrics
      - alert: Validator Low Disk Space
        expr: (kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*validator-e.*"} - kubelet_volume_stats_used_bytes) / 1024 / 1024 / 1024 < 50
        for: 5m
        labels:
          severity: warning
          summary: "Less than 50 GB of free space on Validator DB volume."
        annotations:
      - alert: Validator Very Low Disk Space
        expr: (kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*validator-e.*"} - kubelet_volume_stats_used_bytes) / 1024 / 1024 / 1024 < 20
        for: 5m
        labels:
          severity: critical
          summary: "Less than 20 GB of free space on Validator DB volume."
        annotations:
      - alert: AptosDB API Success Rate
        expr: sum by(owner, kubernetes_pod_name) (rate(aptos_storage_api_latency_seconds_count{result="Ok"}[1m])) / sum by(owner, kubernetes_pod_name) (rate(aptos_storage_api_latency_seconds_count[1m])) < 0.99 # 99%
        for: 5m
        labels:
          severity: error
          summary: "AptosDB API success rate dropped."
        annotations:
      - alert: RocksDB Read Latency
        expr: sum by (owner, kubernetes_pod_name) (rate(aptos_schemadb_get_latency_seconds_sum[1m])) / sum by (owner, kubernetes_pod_name) (rate(aptos_schemadb_get_latency_seconds_count[1m])) > 0.001 # 1 millisecond
        for: 5m
        labels:
          severity: warning
          summary: "RocksDB read latency is elevated."
        annotations:

{{- if .Values.backup.enable }}
      # DB Backup and Backup Verify
      - alert: Backup Coordinator Liveness
        # It's okay if one of these metrics stops changing while the pushgateway still reports the old value, since the alerts following this one will detect the staleness. This alert only detects the absence of these metrics.
        expr: absent(irate(aptos_backup_metadata_num_file_downloads[1m]) > 0) and on() (absent(aptos_db_backup_coordinator_heartbeat_timestamp_s) or absent(aptos_db_backup_coordinator_epoch_ending_epoch) or absent(aptos_db_backup_coordinator_transaction_version) or absent(aptos_db_backup_coordinator_state_snapshot_version))
        for: 10m
        labels:
          severity: warning
          summary: "Backup coordinator or one of its work streams is not heartbeating."
        annotations:
      - alert: Epoch Ending Backup Timeliness
        expr: max(aptos_storage_next_block_epoch) by(owner) - on (owner) aptos_db_backup_coordinator_epoch_ending_epoch > 1 # "==1" when caught up.
        for: 10m
        labels:
          severity: warning
          summary: "Epoch ending backup is not keeping up."
        annotations:
      - alert: Transaction Backup Timeliness
        expr: max(aptos_storage_latest_transaction_version) by(owner) - on (owner) aptos_db_backup_coordinator_transaction_version > {{ .Values.backup.config.transaction_batch_size }} # more than txn_backup_batch_size
        for: 10m
        labels:
          severity: warning
          summary: "Transaction backup is not keeping up."
        annotations:
      - alert: State Snapshot Backup Timeliness
        expr: max(aptos_storage_latest_transaction_version) by(owner) - on (owner) aptos_db_backup_coordinator_state_snapshot_version > {{ .Values.backup.config.state_snapshot_interval }} # more than state_snapshot_interval
        for: 10m
        labels:
          severity: warning
          summary: "State snapshot backup is not keeping up."
        annotations:
      - alert: Backup Verify Scheduling
        expr: absent(time() - max_over_time(aptos_db_backup_verify_coordinator_start_timestamp_s[1w]) < 86400) # assuming the verifier is scheduled to run daily
        for: 10m
        labels:
          severity: warning
          summary: "Backup Verify not started as scheduled."
        annotations:
      - alert: Backup Verify Success Timeliness
        expr: absent(max_over_time(aptos_db_backup_verify_coordinator_succeed_timestamp_s[1w]) - max_over_time(aptos_db_backup_verify_coordinator_start_timestamp_s[1w]) > 0)
        for: 20h # assuming backup verify succeeds within 20 hours
        labels:
          severity: warning
          summary: "Backup Verify didn't succeed in time."
        annotations:
{{- end }}

      # Logging alerts
      - alert: Logs Being Dropped
        expr: 1 < (rate(aptos_struct_log_queue_error[1m]) + rate(aptos_struct_log_send_error[1m]))
        for: 5m
        labels:
          severity: warning
          summary: "Logs being dropped"
        annotations:
          description: "Logging transmit error rate is high; \
            check the logging dashboard. \
            There may be network issues, downstream throughput issues, or something wrong with Vector. \
            TODO: Runbook"
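
The backup timeliness thresholds above are templated from backup.config, so the alerts track whatever batch size and snapshot interval the backup coordinator is actually configured with. A hypothetical values.yaml sketch (the numbers are illustrative only, not taken from this chart):

# Hypothetical values.yaml sketch; the numbers are illustrative only.
backup:
  enable: true
  config:
    transaction_batch_size: 100000     # threshold for the transaction backup alert
    state_snapshot_interval: 100000    # threshold for the state snapshot alert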