Skip to content

Commit

Permalink
Undo being "smart" and creating AverageIntCounter, as it missaligns w…
Browse files Browse the repository at this point in the history
…ith counter dumping (aptos-labs#7100)

naive implementation had issues with sum and count counters not being pulled by metrics reporting at the same time.

Looking at code Histogram has complicated special handling of it, that I don't want to repeat.
  • Loading branch information
igor-aptos authored Mar 28, 2023
1 parent 274c8ea commit 85aca12
Show file tree
Hide file tree
Showing 8 changed files with 42 additions and 87 deletions.
14 changes: 7 additions & 7 deletions consensus/src/counters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
// SPDX-License-Identifier: Apache-2.0

use aptos_metrics_core::{
exponential_buckets, op_counters::DurationHistogram, register_counter, register_gauge,
register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, AverageIntCounter,
exponential_buckets, op_counters::DurationHistogram, register_avg_counter, register_counter,
register_gauge, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
Counter, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge,
IntGaugeVec,
};
Expand Down Expand Up @@ -187,16 +187,16 @@ pub static LEADER_REPUTATION_ROUND_HISTORY_SIZE: Lazy<IntGauge> = Lazy::new(|| {
});

/// Counts when chain_health backoff is triggered
pub static CHAIN_HEALTH_BACKOFF_TRIGGERED: Lazy<AverageIntCounter> = Lazy::new(|| {
AverageIntCounter::register(
pub static CHAIN_HEALTH_BACKOFF_TRIGGERED: Lazy<Histogram> = Lazy::new(|| {
register_avg_counter(
"aptos_chain_health_backoff_triggered",
"Counts when chain_health backoff is triggered",
)
});

/// Counts when waiting for full blocks is triggered
pub static WAIT_FOR_FULL_BLOCKS_TRIGGERED: Lazy<AverageIntCounter> = Lazy::new(|| {
AverageIntCounter::register(
pub static WAIT_FOR_FULL_BLOCKS_TRIGGERED: Lazy<Histogram> = Lazy::new(|| {
register_avg_counter(
"aptos_wait_for_full_blocks_triggered",
"Counts when waiting for full blocks is triggered",
)
Expand Down
4 changes: 2 additions & 2 deletions consensus/src/liveness/proposal_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,15 +244,15 @@ impl ProposalGenerator {
.max_block_bytes
.min(value.max_sending_block_bytes_override);

CHAIN_HEALTH_BACKOFF_TRIGGERED.observe(1);
CHAIN_HEALTH_BACKOFF_TRIGGERED.observe(1.0);
warn!(
"Generating proposal reducing limits to {} txns and {} bytes, due to chain health backoff",
max_block_txns,
max_block_bytes,
);
(max_block_txns, max_block_bytes)
} else {
CHAIN_HEALTH_BACKOFF_TRIGGERED.observe(0);
CHAIN_HEALTH_BACKOFF_TRIGGERED.observe(0.0);
(self.max_block_txns, self.max_block_bytes)
};

Expand Down
2 changes: 1 addition & 1 deletion consensus/src/payload_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ impl PayloadClient for QuorumStoreClient {
&& pending_uncommitted_blocks < self.wait_for_full_blocks_above_pending_blocks;
let return_empty = pending_ordering && return_non_full;

WAIT_FOR_FULL_BLOCKS_TRIGGERED.observe(u64::from(!return_non_full));
WAIT_FOR_FULL_BLOCKS_TRIGGERED.observe(if !return_non_full { 1.0 } else { 0.0 });

fail_point!("consensus::pull_payload", |_| {
Err(anyhow::anyhow!("Injected error in pull_payload").into())
Expand Down
12 changes: 6 additions & 6 deletions consensus/src/quorum_store/batch_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,8 @@ impl BatchGenerator {
);
trace!("QS: dynamic_max_pull_txn_per_s: {}", dynamic_pull_txn_per_s);
}
counters::QS_BACKPRESSURE_TXN_COUNT.observe(1);
counters::QS_BACKPRESSURE_DYNAMIC_MAX.observe(dynamic_pull_txn_per_s);
counters::QS_BACKPRESSURE_TXN_COUNT.observe(1.0);
counters::QS_BACKPRESSURE_DYNAMIC_MAX.observe(dynamic_pull_txn_per_s as f64);
} else {
// additive increase, every second
if back_pressure_increase_latest.elapsed() >= back_pressure_increase_duration {
Expand All @@ -228,13 +228,13 @@ impl BatchGenerator {
);
trace!("QS: dynamic_max_pull_txn_per_s: {}", dynamic_pull_txn_per_s);
}
counters::QS_BACKPRESSURE_TXN_COUNT.observe(0);
counters::QS_BACKPRESSURE_DYNAMIC_MAX.observe(dynamic_pull_txn_per_s);
counters::QS_BACKPRESSURE_TXN_COUNT.observe(0.0);
counters::QS_BACKPRESSURE_DYNAMIC_MAX.observe(dynamic_pull_txn_per_s as f64);
}
if self.back_pressure.proof_count {
counters::QS_BACKPRESSURE_PROOF_COUNT.observe(1);
counters::QS_BACKPRESSURE_PROOF_COUNT.observe(1.0);
} else {
counters::QS_BACKPRESSURE_PROOF_COUNT.observe(0);
counters::QS_BACKPRESSURE_PROOF_COUNT.observe(0.0);
}
let since_last_non_empty_pull_ms = std::cmp::min(
now.duration_since(last_non_empty_pull).as_millis(),
Expand Down
22 changes: 11 additions & 11 deletions consensus/src/quorum_store/counters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
// SPDX-License-Identifier: Apache-2.0

use aptos_metrics_core::{
exponential_buckets, op_counters::DurationHistogram, register_histogram,
register_histogram_vec, register_int_counter, register_int_counter_vec, AverageIntCounter,
Histogram, HistogramVec, IntCounter, IntCounterVec,
exponential_buckets, op_counters::DurationHistogram, register_avg_counter, register_histogram,
register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram,
HistogramVec, IntCounter, IntCounterVec,
};
use once_cell::sync::Lazy;
use std::time::Duration;
Expand Down Expand Up @@ -397,22 +397,22 @@ pub static RECEIVED_BATCH_RESPONSE_COUNT: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});

pub static QS_BACKPRESSURE_TXN_COUNT: Lazy<AverageIntCounter> = Lazy::new(|| {
AverageIntCounter::register(
pub static QS_BACKPRESSURE_TXN_COUNT: Lazy<Histogram> = Lazy::new(|| {
register_avg_counter(
"quorum_store_backpressure_txn_count",
"Indicator of whether Quorum Store is backpressured due to txn count exceeding threshold.",
)
});

pub static QS_BACKPRESSURE_PROOF_COUNT: Lazy<AverageIntCounter> = Lazy::new(|| {
AverageIntCounter::register(
pub static QS_BACKPRESSURE_PROOF_COUNT: Lazy<Histogram> = Lazy::new(|| {
register_avg_counter(
"quorum_store_backpressure_proof_count",
"Indicator of whether Quorum Store is backpressured due to proof count exceeding threshold."
)
});

pub static QS_BACKPRESSURE_DYNAMIC_MAX: Lazy<AverageIntCounter> = Lazy::new(|| {
AverageIntCounter::register(
pub static QS_BACKPRESSURE_DYNAMIC_MAX: Lazy<Histogram> = Lazy::new(|| {
register_avg_counter(
"quorum_store_backpressure_dynamic_max",
"What the dynamic max is set to",
)
Expand Down Expand Up @@ -456,8 +456,8 @@ pub static BATCH_TO_POS_DURATION: Lazy<DurationHistogram> = Lazy::new(|| {
)
});

pub static BATCH_SUCCESSFUL_CREATION: Lazy<AverageIntCounter> = Lazy::new(|| {
AverageIntCounter::register(
pub static BATCH_SUCCESSFUL_CREATION: Lazy<Histogram> = Lazy::new(|| {
register_avg_counter(
"quorum_store_batch_successful_creation",
"Counter for whether we are successfully creating batches",
)
Expand Down
3 changes: 2 additions & 1 deletion consensus/src/quorum_store/proof_coordinator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,8 @@ impl ProofCoordinator {
.observe(state.aggregated_signature.len() as f64);
counters::BATCH_RECEIVED_REPLIES_VOTING_POWER
.observe(state.aggregated_voting_power as f64);
counters::BATCH_SUCCESSFUL_CREATION.observe(u64::from(state.completed));
counters::BATCH_SUCCESSFUL_CREATION
.observe(if state.completed { 1.0 } else { 0.0 });
if !state.completed {
counters::TIMEOUT_BATCHES_COUNT.inc();
batch_ids.push(signed_batch_info_info.batch_id());
Expand Down
70 changes: 12 additions & 58 deletions crates/aptos-metrics-core/src/avg_counter.rs
Original file line number Diff line number Diff line change
@@ -1,61 +1,15 @@
// Copyright © Aptos Foundation

use prometheus::{register_counter, register_int_counter, Counter, IntCounter};

pub struct AverageCounter {
sum: Counter,
count: IntCounter,
}

impl AverageCounter {
pub fn register(name: &str, desc: &str) -> AverageCounter {
AverageCounter {
sum: register_counter!(
format!("{}_sum", name),
format!("{}. Sum part of the counter", desc),
)
.unwrap(),
count: register_int_counter!(
format!("{}_count", name),
format!("{}. Count part of the counter", desc),
)
.unwrap(),
}
}

pub fn observe(&self, value: f64) {
if value != 0.0 {
self.sum.inc_by(value);
}
self.count.inc();
}
}

pub struct AverageIntCounter {
sum: IntCounter,
count: IntCounter,
}

impl AverageIntCounter {
pub fn register(name: &str, desc: &str) -> AverageIntCounter {
AverageIntCounter {
sum: register_int_counter!(
format!("{}_sum", name),
format!("{}. Sum part of the counter", desc),
)
.unwrap(),
count: register_int_counter!(
format!("{}_count", name),
format!("{}. Count part of the counter", desc),
)
.unwrap(),
}
}

pub fn observe(&self, value: u64) {
if value != 0 {
self.sum.inc_by(value);
}
self.count.inc();
}
use prometheus::{register_histogram, Histogram};

// use histogram, instead of pair of sum/count counters, to guarantee
// atomicity of observing and fetching (which Histogram handles correctly)
pub fn register_avg_counter(name: &str, desc: &str) -> Histogram {
register_histogram!(
name,
desc,
// We need to have at least one bucket in histogram, otherwise default buckets are used
vec![0.5],
)
.unwrap()
}
2 changes: 1 addition & 1 deletion crates/aptos-metrics-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ pub use prometheus::{
};

mod avg_counter;
pub use avg_counter::{AverageCounter, AverageIntCounter};
pub use avg_counter::register_avg_counter;
pub mod const_metric;
pub mod op_counters;

0 comments on commit 85aca12

Please sign in to comment.