Skip to content

Commit

Permalink
[Narwhal] add metrics for certificate verification and accept time (M…
Browse files Browse the repository at this point in the history
…ystenLabs#10970)

## Description 

This can help investigate slow Narwhal catchups, when FetchCertificates
RPC latency is normal.

## Test Plan 

n/a

---
If your changes are not user-facing and not a breaking change, you can
skip the following section. Otherwise, please indicate what changed, and
then add to the Release Notes section as highlighted during the release
process.

### Type of Change (Check all that apply)

- [ ] user-visible impact
- [ ] breaking change for a client SDKs
- [ ] breaking change for FNs (FN binary must upgrade)
- [ ] breaking change for validators or node operators (must upgrade
binaries)
- [ ] breaking change for on-chain data layout
- [ ] necessitate either a data wipe or data migration

### Release notes
  • Loading branch information
mwtian authored Apr 17, 2023
1 parent a5a69fd commit 545edc0
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 4 deletions.
14 changes: 12 additions & 2 deletions narwhal/primary/src/certificate_fetcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -312,11 +312,11 @@ async fn run_fetch_task(

// Process and store fetched certificates.
let num_certs_fetched = response.certificates.len();
process_certificates_helper(response, &state.synchronizer).await?;
process_certificates_helper(response, &state.synchronizer, state.metrics.clone()).await?;
state
.metrics
.certificate_fetcher_num_certificates_processed
.add(num_certs_fetched as i64);
.inc_by(num_certs_fetched as u64);

debug!("Successfully fetched and processed {num_certs_fetched} certificates");
Ok(())
Expand Down Expand Up @@ -407,6 +407,7 @@ async fn fetch_certificates_helper(
async fn process_certificates_helper(
response: FetchCertificatesResponse,
synchronizer: &Synchronizer,
metrics: Arc<PrimaryMetrics>,
) -> DagResult<()> {
trace!("Start sending fetched certificates to processing");
if response.certificates.len() > MAX_CERTIFICATES_TO_FETCH {
Expand All @@ -426,25 +427,34 @@ async fn process_certificates_helper(
.map(|certs| {
let certs = certs.to_vec();
let sync = synchronizer.clone();
let metrics = metrics.clone();
// Use threads dedicated to computation heavy work.
spawn_blocking(move || {
let now = Instant::now();
for c in &certs {
sync.sanitize_certificate(c)?;
}
metrics
.certificate_fetcher_total_verification_us
.inc_by(now.elapsed().as_micros() as u64);
Ok::<Vec<Certificate>, DagError>(certs)
})
})
.collect_vec();
// Process verified certificates in the same order as received.
for task in verify_tasks {
let certificates = task.await.map_err(|_| DagError::Canceled)??;
let now = Instant::now();
for cert in certificates {
if let Err(e) = synchronizer.try_accept_fetched_certificate(cert).await {
// It is possible that subsequent certificates are useful,
// so not stopping early.
warn!("Failed to accept fetched certificate: {e}");
}
}
metrics
.certificate_fetcher_total_accept_us
.inc_by(now.elapsed().as_micros() as u64);
}

trace!("Fetched certificates have been processed");
Expand Down
20 changes: 18 additions & 2 deletions narwhal/primary/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,11 @@ pub struct PrimaryMetrics {
/// 0 if there is no inflight certificates fetching, 1 otherwise.
pub certificate_fetcher_inflight_fetch: IntGauge,
/// Number of fetched certificates successfully processed by core.
pub certificate_fetcher_num_certificates_processed: IntGauge,
pub certificate_fetcher_num_certificates_processed: IntCounter,
/// Total time spent in certificate verifications, in microseconds.
pub certificate_fetcher_total_verification_us: IntCounter,
/// Total time spent to accept certificates via Synchronizer, in microseconds.
pub certificate_fetcher_total_accept_us: IntCounter,
/// Number of votes that were requested but not sent due to previously having voted differently
pub votes_dropped_equivocation_protection: IntCounter,
/// Number of pending batches in proposer
Expand Down Expand Up @@ -426,12 +430,24 @@ impl PrimaryMetrics {
registry
)
.unwrap(),
certificate_fetcher_num_certificates_processed: register_int_gauge_with_registry!(
certificate_fetcher_num_certificates_processed: register_int_counter_with_registry!(
"certificate_fetcher_num_certificates_processed",
"Number of fetched certificates successfully processed by core.",
registry
)
.unwrap(),
certificate_fetcher_total_verification_us: register_int_counter_with_registry!(
"certificate_fetcher_total_verification_us",
"Total time spent in certificate verifications, in microseconds.",
registry
)
.unwrap(),
certificate_fetcher_total_accept_us: register_int_counter_with_registry!(
"certificate_fetcher_total_accept_us",
"Total time spent to accept certificates via Synchronizer, in microseconds.",
registry
)
.unwrap(),
votes_dropped_equivocation_protection: register_int_counter_with_registry!(
"votes_dropped_equivocation_protection",
"Number of votes that were requested but not sent due to previously having voted differently",
Expand Down

0 comments on commit 545edc0

Please sign in to comment.