From de6f82f7627a32ad651a0503cbe3e89a8bf1a0ca Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 25 May 2021 10:19:43 +0200 Subject: [PATCH 1/4] Extend Alertmanager dashboard with currently unused metrics. Metrics for general operation: - Added "Tenants" stat panel using: `cortex_alertmanager_tenants_discovered` - Added "Tenant Configuration Sync" row using: `cortex_alertmanager_sync_configs_failed_total` `cortex_alertmanager_sync_configs_total` `cortex_alertmanager_ring_check_errors_total` Metrics specific to sharding operation: - Added "Sharding Initial State Sync" row using: `cortex_alertmanager_state_initial_sync_total` `cortex_alertmanager_state_initial_sync_completed_total` `cortex_alertmanager_state_initial_sync_duration_seconds` - Added "Sharding State Operations" row using: `cortex_alertmanager_state_fetch_replica_state_total` `cortex_alertmanager_state_fetch_replica_state_failed_total` `cortex_alertmanager_state_replication_total` `cortex_alertmanager_state_replication_failed_total` `cortex_alertmanager_partial_state_merges_total` `cortex_alertmanager_partial_state_merges_failed_total` `cortex_alertmanager_state_persist_total` `cortex_alertmanager_state_persist_failed_total` --- .../dashboards/alertmanager.libsonnet | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index b329ce6b..6d7ee562 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -17,6 +17,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Total Silences') + $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short') ) + .addPanel( + $.panel('Tenants') + + $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager'), format='short') + ) ) .addRow( $.row('Alerts Received') @@ -86,5 +90,150 @@ local utils = import 
'mixin-utils/utils.libsonnet'; ) .addRows( $.getObjectStoreRows('Alertmanager Configuration Object Store (Alertmanager accesses)', 'alertmanager-storage') + ) + .addRow( + $.row('Replication') + .addPanel( + $.panel('Tenants (By Instance)') + + $.queryPanel( + 'sum by(pod) (cortex_alertmanager_tenants_owned{%s})' % $.jobMatcher('alertmanager'), + '{{pod}}' + ) + + $.stack + ) + .addPanel( + $.panel('Alerts (By Instance)') + + $.queryPanel( + 'sum by(pod) (cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), + '{{pod}}' + ) + + $.stack + ) + .addPanel( + $.panel('Silences (By Instance)') + + $.queryPanel( + 'sum by(pod) (cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), + '{{pod}}' + ) + + $.stack + ) + ) + .addRow( + $.row('Tenant Configuration Sync') + .addPanel( + $.panel('Syncs/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Syncs/sec (By Reason)') + + $.queryPanel( + 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + '{{reason}}' + ) + ) + .addPanel( + $.panel('Ring Check Errors/sec') + + $.queryPanel( + 'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'errors' + ) + ) + ) + .addRow( + $.row('Sharding Initial State Sync') + .addPanel( + $.panel('Syncs/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_initial_sync_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), 
$.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Syncs/sec (By Outcome)') + + $.queryPanel( + 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + '{{outcome}}' + ) + ) + .addPanel( + $.panel('Duration') + + utils.latencyRecordingRulePanel('cortex_alertmanager_state_initial_sync_duration_seconds', $.jobSelector('alertmanager')) + ) + ) + .addRow( + $.row('Sharding State Operations') + .addPanel( + $.panel('Replica Fetches/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Replica Updates/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Partial Merges/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 
'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Remote Storage Persists/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) ), } From ec2f7e12b50086180a2251ac5318fda531c28ef9 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 25 May 2021 11:06:20 +0200 Subject: [PATCH 2/4] Update Changelog. --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fc3cc32..7b284f6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,11 @@ * [ENHANCEMENT] Ruler dashboard: added "Per route p99 latency" panel in the "Configuration API" row. #353 * [ENHANCEMENT] Increased the `for` duration of the `CortexIngesterReachingSeriesLimit` warning alert to 3h. #362 * [ENHANCEMENT] Added a new tier (`medium_small_user`) so we have another tier between 100K and 1Mil active series. #364 +* [ENHANCEMENT] Extend Alertmanager dashboard: #313 + * "Tenants" stat panel - shows number of discovered tenant configurations. + * "Tenant Configuration Sync" row - information about the configuration sync procedure. + * "Sharding Initial State Sync" row - information about the initial state sync procedure when sharding is enabled. + * "Sharding State Operations" row - information about various state operations which occur when sharding is enabled (replication, fetch, marge, persist). 
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 From 7104443c702b687996377af5f78cc053cecae0cb Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Thu, 27 May 2021 12:55:33 +0200 Subject: [PATCH 3/4] Review comments + fix latency panel. --- .../dashboards/alertmanager.libsonnet | 52 +++++++------------ 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 6d7ee562..922b2861 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -94,26 +94,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Replication') .addPanel( - $.panel('Tenants (By Instance)') + + $.panel('Per %s Tenants' % $._config.per_instance_label) + $.queryPanel( - 'sum by(pod) (cortex_alertmanager_tenants_owned{%s})' % $.jobMatcher('alertmanager'), - '{{pod}}' + 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label ) + $.stack ) .addPanel( - $.panel('Alerts (By Instance)') + + $.panel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( - 'sum by(pod) (cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), - '{{pod}}' + 'sum by(%s) (cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label ) + $.stack ) .addPanel( - $.panel('Silences (By Instance)') + + $.panel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( - 'sum by(pod) 
(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), - '{{pod}}' + 'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label ) + $.stack ) @@ -150,37 +150,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Sharding Initial State Sync') + $.row('Sharding Runtime State Sync') .addPanel( $.panel('Syncs/sec') + - $.queryPanel( - [ - ||| - sum(rate(cortex_alertmanager_state_initial_sync_total{%s}[$__rate_interval])) - - - sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval])) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), - ], - ['success', 'failed'] - ) - ) - .addPanel( - $.panel('Syncs/sec (By Outcome)') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), '{{outcome}}' ) ) .addPanel( - $.panel('Duration') + - utils.latencyRecordingRulePanel('cortex_alertmanager_state_initial_sync_duration_seconds', $.jobSelector('alertmanager')) + $.panel('Sync duration') + + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) ) - ) - .addRow( - $.row('Sharding State Operations') .addPanel( - $.panel('Replica Fetches/sec') + + $.panel('Fetch state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -193,8 +176,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ['success', 'failed'] ) ) + ) + .addRow( + $.row('Sharding State Operations') .addPanel( - $.panel('Replica Updates/sec') + + $.panel('Replicate state to other alertmanagers /sec') + $.queryPanel( [ ||| @@ -208,7 +194,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Partial 
Merges/sec') + + $.panel('Merge state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -222,7 +208,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Remote Storage Persists/sec') + + $.panel('Persist state to remote storage /sec') + $.queryPanel( [ ||| From 629d2884eb7ff7c42692f91c2186d936157c0388 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Fri, 30 Jul 2021 11:49:02 +0200 Subject: [PATCH 4/4] Review comments. --- CHANGELOG.md | 3 ++- cortex-mixin/dashboards/alertmanager.libsonnet | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b284f6c..b0d814ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,9 +34,10 @@ * [ENHANCEMENT] Added a new tier (`medium_small_user`) so we have another tier between 100K and 1Mil active series. #364 * [ENHANCEMENT] Extend Alertmanager dashboard: #313 * "Tenants" stat panel - shows number of discovered tenant configurations. + * "Replication" row - information about the replication of tenants/alerts/silences over instances. * "Tenant Configuration Sync" row - information about the configuration sync procedure. * "Sharding Initial State Sync" row - information about the initial state sync procedure when sharding is enabled. - * "Sharding State Operations" row - information about various state operations which occur when sharding is enabled (replication, fetch, marge, persist). + * "Sharding Runtime State Sync" row - information about various state operations which occur when sharding is enabled (replication, fetch, merge, persist). * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. 
#329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 922b2861..6f578b11 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -150,16 +150,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Sharding Runtime State Sync') + $.row('Sharding Initial State Sync') .addPanel( - $.panel('Syncs/sec') + + $.panel('Initial syncs/sec') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), '{{outcome}}' ) ) .addPanel( - $.panel('Sync duration') + + $.panel('Initial sync duration') + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) ) .addPanel( @@ -178,7 +178,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Sharding State Operations') + $.row('Sharding Runtime State Sync') .addPanel( $.panel('Replicate state to other alertmanagers /sec') + $.queryPanel(