Skip to content

Commit

Permalink
[logging] Added metrics around submission of struct logs
Browse files Browse the repository at this point in the history
In order to have higher confidence around logging, this adds a slew of
counters to keep track of errors, and sent messages.  Because if you
can't log it, then you'll have to use metrics to know what's going on.

Closes: aptos-labs#5309
Approved by: andll
  • Loading branch information
gregnazario authored and bors-libra committed Jul 28, 2020
1 parent 5a3cc6c commit 9c09a63
Show file tree
Hide file tree
Showing 3 changed files with 339 additions and 7 deletions.
72 changes: 71 additions & 1 deletion common/logger/src/counters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,77 @@
use once_cell::sync::Lazy;
use prometheus::{register_int_counter, IntCounter};

/// Count of the struct logs.
/// Count of the struct logs submitted by macro
pub static STRUCT_LOG_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!("libra_struct_log_count", "Count of the struct logs.").unwrap()
});

/// Count of struct logs processed, but not necessarily sent
pub static PROCESSED_STRUCT_LOG_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libra_struct_log_processed_count",
"Count of the struct logs received by the sender."
)
.unwrap()
});

/// Count of struct logs submitted through TCP
pub static SENT_STRUCT_LOG_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libra_struct_log_tcp_submit_count",
"Count of the struct logs submitted by TCP."
)
.unwrap()
});

/// Count of logs failed to be submitted (but not by the queue)
pub static DROPPED_STRUCT_LOG_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libra_struct_log_intentionally_dropped_count",
"Count of the intentionally dropped struct logs."
)
.unwrap()
});

/// Metric for when we connect the outbound TCP
pub static STRUCT_LOG_TCP_CONNECT_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libra_struct_log_tcp_connect_count",
"Count of the tcp connections made for struct logs."
)
.unwrap()
});

/// Metric for when we fail to log during sending to the queue
pub static STRUCT_LOG_QUEUE_ERROR_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libra_struct_log_queue_error_count",
"Count of all errors during queuing struct logs."
)
.unwrap()
});

/// Metric for when we fail to log during sending
pub static STRUCT_LOG_SEND_ERROR_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libra_struct_log_send_error_count",
"Count of all errors during sending struct logs."
)
.unwrap()
});

pub static STRUCT_LOG_CONNECT_ERROR_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libra_struct_log_connect_error_count",
"Count of all errors during connecting for struct logs."
)
.unwrap()
});

pub static STRUCT_LOG_PARSE_ERROR_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libra_struct_log_parse_erro_count",
"Count of all parse errors during struct logs."
)
.unwrap()
});
30 changes: 24 additions & 6 deletions common/logger/src/struct_log.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
// Copyright (c) The Libra Core Contributors
// SPDX-License-Identifier: Apache-2.0

use crate::counters::{
PROCESSED_STRUCT_LOG_COUNT, SENT_STRUCT_LOG_COUNT, STRUCT_LOG_CONNECT_ERROR_COUNT,
STRUCT_LOG_PARSE_ERROR_COUNT, STRUCT_LOG_QUEUE_ERROR_COUNT, STRUCT_LOG_SEND_ERROR_COUNT,
STRUCT_LOG_TCP_CONNECT_COUNT,
};
use chrono::Utc;
use once_cell::sync::Lazy;
use serde::Serialize;
Expand Down Expand Up @@ -419,6 +423,7 @@ impl TCPStructLog {
impl StructLogSink for TCPStructLog {
fn send(&self, entry: StructuredLogEntry) {
if let Err(e) = self.sender.try_send(entry) {
STRUCT_LOG_QUEUE_ERROR_COUNT.inc();
// Use log crate macro to avoid generation of structured log in this case
// Otherwise we will have infinite loop
log::error!("[Logging] Failed to send structured log: {}", e);
Expand All @@ -440,14 +445,19 @@ impl TCPStructLogThread {
// Continually iterate over requests unless writing fails.
fn process_requests(&self, stream: &mut TcpStream) {
for entry in &self.receiver {
PROCESSED_STRUCT_LOG_COUNT.inc();
// Parse string, skipping over anything that can't be parsed
let json_string = match entry.to_json_string() {
Ok(json_string) => json_string,
Err(_) => continue,
Err(_) => {
STRUCT_LOG_PARSE_ERROR_COUNT.inc();
continue;
}
};

// If we fail to write, exit out and create a new stream
if let Err(e) = stream.write(json_string.as_bytes()) {
STRUCT_LOG_SEND_ERROR_COUNT.inc();
println!(
"[Logging] Error while sending data to logstash({}): {}",
self.address, e
Expand All @@ -460,6 +470,8 @@ impl TCPStructLogThread {

// Start over stream
return;
} else {
SENT_STRUCT_LOG_COUNT.inc()
}
}
}
Expand All @@ -479,6 +491,7 @@ impl TCPStructLogThread {
pub fn run(self) {
loop {
let mut maybe_stream = self.connect();
STRUCT_LOG_TCP_CONNECT_COUNT.inc();

// This is to ensure that we do actually connect before sending requests
// If the request process loop ends, the stream is broken. Reset and create a new one.
Expand All @@ -488,22 +501,27 @@ impl TCPStructLogThread {
if let Err(err) =
stream.set_write_timeout(Some(Duration::from_millis(WRITE_TIMEOUT_MS)))
{
STRUCT_LOG_CONNECT_ERROR_COUNT.inc();
println!("[Logging] Failed to set write timeout: {}", err);
continue;
}

// Write a log signifying that the logger connected, and test the stream
if let Err(err) = Self::write_control_msg(stream, "connected") {
STRUCT_LOG_CONNECT_ERROR_COUNT.inc();
println!("[Logging] control message failed: {}", err);
continue;
}

self.process_requests(&mut stream)
}
Err(err) => println!(
"[Logging] Failed to connect to {}, cause {}",
self.address, err
),
Err(err) => {
STRUCT_LOG_CONNECT_ERROR_COUNT.inc();
println!(
"[Logging] Failed to connect to {}, cause {}",
self.address, err
)
}
}
}
}
Expand Down
Loading

0 comments on commit 9c09a63

Please sign in to comment.