From 6b98051ad6c826eea7addf0bfb41974893f93f26 Mon Sep 17 00:00:00 2001 From: Laura Makdah Date: Tue, 16 Aug 2022 20:48:23 -0700 Subject: [PATCH] update to latency metrics --- crates/sui-benchmark/src/benchmark.rs | 12 +- .../src/benchmark/load_generator.rs | 19 +- .../src/authority_active/gossip/mod.rs | 18 +- crates/sui-core/src/authority_client.rs | 179 +++++------------- crates/sui-core/src/gateway_state.rs | 2 +- crates/sui-tool/src/commands.rs | 10 +- crates/sui/tests/shared_objects_tests.rs | 2 +- 7 files changed, 103 insertions(+), 139 deletions(-) diff --git a/crates/sui-benchmark/src/benchmark.rs b/crates/sui-benchmark/src/benchmark.rs index 4d66fdb7b132a..3fb33e13322f0 100644 --- a/crates/sui-benchmark/src/benchmark.rs +++ b/crates/sui-benchmark/src/benchmark.rs @@ -13,8 +13,11 @@ use crate::benchmark::{ use futures::{join, StreamExt}; use multiaddr::Multiaddr; use rayon::{iter::ParallelIterator, prelude::*}; +use std::sync::Arc; use std::{panic, thread, thread::sleep, time::Duration}; -use sui_core::authority_client::{AuthorityAPI, NetworkAuthorityClient}; +use sui_core::authority_client::{ + AuthorityAPI, NetworkAuthorityClient, NetworkAuthorityClientMetrics, +}; use sui_types::{ batch::UpdateItem, messages::{BatchInfoRequest, BatchInfoResponseItem, TransactionInfoRequest}, @@ -217,7 +220,12 @@ fn run_latency_microbench( async fn run_follower(address: Multiaddr) { // We spawn a second client that listens to the batch interface - let authority_client = NetworkAuthorityClient::connect(&address).await.unwrap(); + let authority_client = NetworkAuthorityClient::connect( + &address, + Arc::new(NetworkAuthorityClientMetrics::new_for_tests()), + ) + .await + .unwrap(); follow(authority_client, false).await; } diff --git a/crates/sui-benchmark/src/benchmark/load_generator.rs b/crates/sui-benchmark/src/benchmark/load_generator.rs index 946748600a556..3d7f109216866 100644 --- a/crates/sui-benchmark/src/benchmark/load_generator.rs +++ b/crates/sui-benchmark/src/benchmark/load_generator.rs @@ -24,6 +24,7 @@ use tokio::{sync::Notify, time}; use tracing::{error, info}; use sui_config::NetworkConfig; +use sui_core::authority_client::NetworkAuthorityClientMetrics; use sui_types::committee::StakeUnit; pub fn check_transaction_response(reply_message: Result) { @@ -50,7 +51,11 @@ pub async fn send_tx_chunks( let mut tasks = Vec::new(); for tx_chunks in tx_chunks.chunks(tx_chunks.len() / conn) { - let client = NetworkAuthorityClient::connect_lazy(&address).unwrap(); + let client = NetworkAuthorityClient::connect_lazy( + &address, + Arc::new(NetworkAuthorityClientMetrics::new_for_tests()), + ) + .unwrap(); let txns = tx_chunks.to_vec(); let task = tokio::spawn(async move { @@ -93,7 +98,11 @@ pub async fn send_transactions( let mut tasks = Vec::new(); for tx_chunks in tx_chunks.chunks(tx_chunks.len() / conn) { - let client = NetworkAuthorityClient::connect_lazy(&address).unwrap(); + let client = NetworkAuthorityClient::connect_lazy( + &address, + Arc::new(NetworkAuthorityClientMetrics::new_for_tests()), + ) + .unwrap(); let txns = tx_chunks.to_vec(); let task = tokio::spawn(async move { @@ -131,7 +140,11 @@ pub async fn send_confs( let mut tasks = Vec::new(); for tx_chunks in tx_chunks.chunks(tx_chunks.len() / conn) { - let client = NetworkAuthorityClient::connect_lazy(&address).unwrap(); + let client = NetworkAuthorityClient::connect_lazy( + &address, + Arc::new(NetworkAuthorityClientMetrics::new_for_tests()), + ) + .unwrap(); let txns = tx_chunks.to_vec(); let task = tokio::spawn(async move { diff --git a/crates/sui-core/src/authority_active/gossip/mod.rs b/crates/sui-core/src/authority_active/gossip/mod.rs index 550177ede4dc1..f1c4e799596cf 100644 --- a/crates/sui-core/src/authority_active/gossip/mod.rs +++ b/crates/sui-core/src/authority_active/gossip/mod.rs @@ -70,6 +70,7 @@ pub struct GossipMetrics { pub wait_for_finality_latency_sec: Histogram, pub total_attempts_cert_downloads: IntCounter, pub total_successful_attempts_cert_downloads: IntCounter, + pub follower_stream_duration: Histogram, } impl GossipMetrics { @@ -101,7 +102,7 @@ impl GossipMetrics { .unwrap(), wait_for_finality_latency_sec: register_histogram_with_registry!( "gossip_wait_for_finality_latency_sec", - "Latency histogram for gossip/node sync process to wait for txs to become final, in sec", + "Latency histogram for gossip/node sync process to wait for txs to become final, in seconds", registry, ) .unwrap(), @@ -117,6 +118,12 @@ impl GossipMetrics { registry, ) .unwrap(), + follower_stream_duration: register_histogram_with_registry!( + "follower_stream_duration", + "Latency histogram of the duration of the follower streams to peers, in seconds", + registry, + ) + .unwrap(), } } @@ -263,8 +270,8 @@ async fn wait_for_one_gossip_task_to_finish( ) where A: AuthorityAPI + Send + Sync + 'static + Clone, { - let (finished_name, _result) = gossip_tasks.select_next_some().await; - if let Err(err) = _result { + let (finished_name, result) = gossip_tasks.select_next_some().await; + if let Err(err) = result { active_authority.set_failure_backoff(finished_name).await; active_authority.state.metrics.gossip_task_error_count.inc(); error!(peer = ?finished_name, "Peer returned error: {:?}", err); @@ -466,6 +473,8 @@ where let mut last_seq_in_cur_batch: TxSequenceNumber = 0; let mut streamx = Box::pin(self.client.handle_batch_stream(req).await?); let metrics = handler.get_metrics(); + let mut timer = metrics.follower_stream_duration.start_timer(); + loop { tokio::select! { _ = &mut timeout => { @@ -513,6 +522,8 @@ where // The stream has closed, re-request: None => { + timer.stop_and_record(); + timer = metrics.follower_stream_duration.start_timer(); info!(peer = ?self.peer_name, ?latest_seq, "Gossip stream was closed. Restarting"); self.client.metrics_seq_number_to_handle_batch_stream.set(latest_seq.unwrap_or_default() as i64); self.client.metrics_total_times_reconnect_follower_stream.inc(); @@ -545,6 +556,7 @@ where } }; } + timer.stop_and_record(); Ok(()) } } diff --git a/crates/sui-core/src/authority_client.rs b/crates/sui-core/src/authority_client.rs index 6fbb15b9b25fd..6b305989b56b7 100644 --- a/crates/sui-core/src/authority_client.rs +++ b/crates/sui-core/src/authority_client.rs @@ -5,13 +5,12 @@ use crate::authority::AuthorityState; use anyhow::anyhow; use async_trait::async_trait; -use futures::{stream::BoxStream, FutureExt, TryStreamExt}; +use futures::{stream::BoxStream, TryStreamExt}; use multiaddr::Multiaddr; use mysten_network::config::Config; use narwhal_crypto::traits::ToFromBytes; use prometheus::{register_histogram_with_registry, Histogram}; use std::collections::BTreeMap; -use std::mem; use std::sync::Arc; use sui_config::genesis::Genesis; use sui_network::{api::ValidatorClient, tonic}; @@ -135,9 +134,10 @@ impl AuthorityAPI for NetworkAuthorityClient { &self, transaction: Transaction, ) -> Result { - self.metrics - .handle_transaction_request_bytes - .observe(mem::size_of_val(&transaction) as f64); + let timer = self + .metrics + .handle_transaction_request_latency + .start_timer(); let response = self .client() @@ -146,9 +146,7 @@ impl AuthorityAPI for NetworkAuthorityClient { .map(tonic::Response::into_inner) .map_err(Into::into); - self.metrics - .handle_transaction_response_bytes - .observe(mem::size_of_val(&response) as f64); + timer.stop_and_record(); response } @@ -158,9 +156,10 @@ impl AuthorityAPI for NetworkAuthorityClient { &self, certificate: CertifiedTransaction, ) -> Result { - self.metrics - .handle_certificate_request_bytes - .observe(mem::size_of_val(&certificate) as f64); + let timer = self + .metrics + .handle_certificate_request_latency + .start_timer(); let response = self .client() @@ -169,9 +168,7 @@ impl AuthorityAPI for NetworkAuthorityClient { .map(tonic::Response::into_inner) .map_err(Into::into); - self.metrics - .handle_certificate_response_bytes - .observe(mem::size_of_val(&response) as f64); + timer.stop_and_record(); response } @@ -180,9 +177,10 @@ impl AuthorityAPI for NetworkAuthorityClient { &self, request: AccountInfoRequest, ) -> Result { - self.metrics - .handle_account_info_request_bytes - .observe(mem::size_of_val(&request) as f64); + let timer = self + .metrics + .handle_account_info_request_latency + .start_timer(); let response = self .client() @@ -191,9 +189,7 @@ impl AuthorityAPI for NetworkAuthorityClient { .map(tonic::Response::into_inner) .map_err(Into::into); - self.metrics - .handle_account_info_response_bytes - .observe(mem::size_of_val(&response) as f64); + timer.stop_and_record(); response } @@ -202,9 +198,10 @@ impl AuthorityAPI for NetworkAuthorityClient { &self, request: ObjectInfoRequest, ) -> Result { - self.metrics - .handle_object_info_request_bytes - .observe(mem::size_of_val(&request) as f64); + let timer = self + .metrics + .handle_object_info_request_latency + .start_timer(); let response = self .client() @@ -213,9 +210,7 @@ impl AuthorityAPI for NetworkAuthorityClient { .map(tonic::Response::into_inner) .map_err(Into::into); - self.metrics - .handle_object_info_response_bytes - .observe(mem::size_of_val(&response) as f64); + timer.stop_and_record(); response } @@ -225,9 +220,10 @@ impl AuthorityAPI for NetworkAuthorityClient { &self, request: TransactionInfoRequest, ) -> Result { - self.metrics - .handle_transaction_info_request_bytes - .observe(mem::size_of_val(&request) as f64); + let timer = self + .metrics + .handle_transaction_info_request_latency + .start_timer(); let response = self .client() @@ -236,9 +232,7 @@ impl AuthorityAPI for NetworkAuthorityClient { .map(tonic::Response::into_inner) .map_err(Into::into); - self.metrics - .handle_transaction_info_response_bytes - .observe(mem::size_of_val(&response) as f64); + timer.stop_and_record(); response } @@ -252,18 +246,9 @@ impl AuthorityAPI for NetworkAuthorityClient { .batch_info_request_start_seq .observe(request.start.unwrap_or(0) as f64); - self.metrics - .handle_batch_stream_request_bytes - .observe(mem::size_of_val(&request) as f64); - let stream = self .client() .batch_info(request) - .inspect(|item| { - self.metrics - .handle_batch_stream_response_bytes - .observe(mem::size_of_val(&item) as f64); - }) .await .map(tonic::Response::into_inner)? .map_err(Into::into); @@ -276,9 +261,7 @@ impl AuthorityAPI for NetworkAuthorityClient { &self, request: CheckpointRequest, ) -> Result { - self.metrics - .handle_checkpoint_request_bytes - .observe(mem::size_of_val(&request) as f64); + let timer = self.metrics.handle_checkpoint_request_latency.start_timer(); let response = self .client() @@ -287,9 +270,7 @@ impl AuthorityAPI for NetworkAuthorityClient { .map(tonic::Response::into_inner) .map_err(Into::into); - self.metrics - .handle_checkpoint_response_bytes - .observe(mem::size_of_val(&response) as f64); + timer.stop_and_record(); response } @@ -495,20 +476,12 @@ impl LocalAuthorityClient { #[derive(Clone)] pub struct NetworkAuthorityClientMetrics { - pub handle_transaction_request_bytes: Histogram, - pub handle_transaction_response_bytes: Histogram, - pub handle_certificate_request_bytes: Histogram, - pub handle_certificate_response_bytes: Histogram, - pub handle_account_info_request_bytes: Histogram, - pub handle_account_info_response_bytes: Histogram, - pub handle_object_info_request_bytes: Histogram, - pub handle_object_info_response_bytes: Histogram, - pub handle_transaction_info_request_bytes: Histogram, - pub handle_transaction_info_response_bytes: Histogram, - pub handle_batch_stream_request_bytes: Histogram, - pub handle_batch_stream_response_bytes: Histogram, - pub handle_checkpoint_request_bytes: Histogram, - pub handle_checkpoint_response_bytes: Histogram, + pub handle_transaction_request_latency: Histogram, + pub handle_certificate_request_latency: Histogram, + pub handle_account_info_request_latency: Histogram, + pub handle_object_info_request_latency: Histogram, + pub handle_transaction_info_request_latency: Histogram, + pub handle_checkpoint_request_latency: Histogram, pub batch_info_request_start_seq: Histogram, } @@ -516,87 +489,39 @@ pub struct NetworkAuthorityClientMetrics { impl NetworkAuthorityClientMetrics { pub fn new(registry: &prometheus::Registry) -> Self { Self { - handle_transaction_request_bytes: register_histogram_with_registry!( - "handle_transaction_request_bytes", - "Number of bytes sent by handle transaction request", - registry - ) - .unwrap(), - handle_transaction_response_bytes: register_histogram_with_registry!( - "handle_transaction_response_bytes", - "Number of bytes sent by handle transaction response", - registry - ) - .unwrap(), - handle_certificate_request_bytes: register_histogram_with_registry!( - "handle_certificate_request_bytes", - "Number of bytes sent by handle certrificate request", - registry - ) - .unwrap(), - handle_certificate_response_bytes: register_histogram_with_registry!( - "handle_certificate_response_bytes", - "Number of bytes sent by handle certificate response", - registry - ) - .unwrap(), - handle_account_info_request_bytes: register_histogram_with_registry!( - "handle_account_info_request_bytes", - "Number of bytes sent by handle account info request", - registry - ) - .unwrap(), - handle_account_info_response_bytes: register_histogram_with_registry!( - "handle_account_info_response_bytes", - "Number of bytes sent by handle account info response", - registry - ) - .unwrap(), - handle_object_info_request_bytes: register_histogram_with_registry!( - "handle_object_info_request_bytes", - "Number of bytes sent by handle object info request", - registry - ) - .unwrap(), - handle_object_info_response_bytes: register_histogram_with_registry!( - "handle_object_info_response_bytes", - "Number of bytes sent by handle object info response", - registry - ) - .unwrap(), - handle_transaction_info_request_bytes: register_histogram_with_registry!( - "handle_transaction_info_request_bytes", - "Number of bytes sent by handle transaction info request", + handle_transaction_request_latency: register_histogram_with_registry!( + "handle_transaction_request_latency", + "Latency of handle transaction request", registry ) .unwrap(), - handle_transaction_info_response_bytes: register_histogram_with_registry!( - "handle_transaction_info_response_bytes", - "Number of bytes sent by handle transaction info response", + handle_certificate_request_latency: register_histogram_with_registry!( + "handle_certificate_request_latency", + "Latency of handle certificate request", registry ) .unwrap(), - handle_batch_stream_request_bytes: register_histogram_with_registry!( - "handle_batch_stream_request_bytes", - "Number of bytes sent by handle batch stream request", + handle_account_info_request_latency: register_histogram_with_registry!( + "handle_account_info_request_latency", + "Latency of handle account info request", registry ) .unwrap(), - handle_batch_stream_response_bytes: register_histogram_with_registry!( - "handle_batch_stream_response_bytes", - "Number of bytes sent by handle batch request", + handle_object_info_request_latency: register_histogram_with_registry!( + "handle_object_info_request_latency", + "Latency of handle object info request", registry ) .unwrap(), - handle_checkpoint_request_bytes: register_histogram_with_registry!( - "handle_checkpoint_request_bytes", - "Number of bytes sent by handle checkpoint request", + handle_transaction_info_request_latency: register_histogram_with_registry!( + "handle_transaction_info_request_latency", + "Latency of handle transaction info request", registry ) .unwrap(), - handle_checkpoint_response_bytes: register_histogram_with_registry!( - "handle_checkpoint_response_bytes", - "Number of bytes sent by handle checkpoint response", + handle_checkpoint_request_latency: register_histogram_with_registry!( + "handle_checkpoint_request_latency", + "Latency of handle checkpoint request", registry ) .unwrap(), diff --git a/crates/sui-core/src/gateway_state.rs b/crates/sui-core/src/gateway_state.rs index 4d863e7460b83..b166a676984a6 100644 --- a/crates/sui-core/src/gateway_state.rs +++ b/crates/sui-core/src/gateway_state.rs @@ -245,7 +245,7 @@ impl GatewayState { let committee = Self::make_committee(config)?; let default_registry = Registry::new(); let prometheus_registry = prometheus_registry.unwrap_or(&default_registry); - let network_metrics = NetworkAuthorityClientMetrics::new(&prometheus_registry); + let network_metrics = NetworkAuthorityClientMetrics::new(prometheus_registry); let authority_clients = Self::make_authority_clients(config, network_metrics); Ok(Arc::new(GatewayState::new( diff --git a/crates/sui-tool/src/commands.rs b/crates/sui-tool/src/commands.rs index b95914e927833..e667497ce32f0 100644 --- a/crates/sui-tool/src/commands.rs +++ b/crates/sui-tool/src/commands.rs @@ -6,11 +6,14 @@ use futures::future::join_all; use std::cmp::min; use std::collections::BTreeMap; use std::path::PathBuf; +use std::sync::Arc; use std::time::Duration; use sui_config::genesis::Genesis; use sui_tool::db_tool::{execute_db_tool_command, print_db_all_tables, DbToolCommand}; -use sui_core::authority_client::{AuthorityAPI, NetworkAuthorityClient}; +use sui_core::authority_client::{ + AuthorityAPI, NetworkAuthorityClient, NetworkAuthorityClientMetrics, +}; use sui_types::{base_types::*, batch::*, messages::*, object::Owner}; use anyhow::anyhow; @@ -134,7 +137,10 @@ fn make_clients(genesis: PathBuf) -> Result