From 459c9699b8fd9614145a1ebfdde2d462dc93d3c6 Mon Sep 17 00:00:00 2001 From: Michael <68944931+michaelh-laine@users.noreply.github.com> Date: Wed, 5 Oct 2022 20:55:45 +0200 Subject: [PATCH] Resolve PagerDuty incident on All Clear instead of triggering new incident (#28232) --- Cargo.lock | 1 + notifier/Cargo.toml | 1 + notifier/src/lib.rs | 48 ++++++++++++++++++++++++++++-------------- watchtower/src/main.rs | 14 ++++++------ 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0f366de2938a43..fa31b33977a2dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5668,6 +5668,7 @@ dependencies = [ "log", "reqwest", "serde_json", + "solana-sdk 1.15.0", ] [[package]] diff --git a/notifier/Cargo.toml b/notifier/Cargo.toml index 02c2ae28a4226e..a4245badcf09f8 100644 --- a/notifier/Cargo.toml +++ b/notifier/Cargo.toml @@ -13,6 +13,7 @@ edition = "2021" log = "0.4.17" reqwest = { version = "0.11.12", default-features = false, features = ["blocking", "brotli", "deflate", "gzip", "rustls-tls", "json"] } serde_json = "1.0" +solana-sdk = { path = "../sdk", version = "=1.15.0" } [lib] name = "solana_notifier" diff --git a/notifier/src/lib.rs b/notifier/src/lib.rs index 594f5938d87e2d..87c01bfebf29eb 100644 --- a/notifier/src/lib.rs +++ b/notifier/src/lib.rs @@ -27,6 +27,7 @@ use log::*; use { reqwest::{blocking::Client, StatusCode}, serde_json::json, + solana_sdk::hash::Hash, std::{env, str::FromStr, thread::sleep, time::Duration}, }; @@ -83,7 +84,7 @@ fn get_twilio_config() -> Result<Option<TwilioWebHook>, String> { Ok(Some(config)) } -enum NotificationType { +enum NotificationChannel { Discord(String), Slack(String), PagerDuty(String), @@ -92,9 +93,15 @@ enum NotificationType { Log(Level), } +#[derive(Clone)] +pub enum NotificationType { + Trigger { incident: Hash }, + Resolve { incident: Hash }, +} + pub struct Notifier { client: Client, - notifiers: Vec<NotificationType>, + notifiers: Vec<NotificationChannel>, } impl Notifier { @@ -108,32 +115,32 @@ impl Notifier { let mut notifiers = vec![]; if let Ok(webhook) = env::var(format!("{}DISCORD_WEBHOOK", env_prefix)) { - notifiers.push(NotificationType::Discord(webhook)); + notifiers.push(NotificationChannel::Discord(webhook)); } if let Ok(webhook) = env::var(format!("{}SLACK_WEBHOOK", env_prefix)) { - notifiers.push(NotificationType::Slack(webhook)); + notifiers.push(NotificationChannel::Slack(webhook)); } if let Ok(routing_key) = env::var(format!("{}PAGERDUTY_INTEGRATION_KEY", env_prefix)) { - notifiers.push(NotificationType::PagerDuty(routing_key)); + notifiers.push(NotificationChannel::PagerDuty(routing_key)); } if let (Ok(bot_token), Ok(chat_id)) = ( env::var(format!("{}TELEGRAM_BOT_TOKEN", env_prefix)), env::var(format!("{}TELEGRAM_CHAT_ID", env_prefix)), ) { - notifiers.push(NotificationType::Telegram(TelegramWebHook { + notifiers.push(NotificationChannel::Telegram(TelegramWebHook { bot_token, chat_id, })); } if let Ok(Some(webhook)) = get_twilio_config() { - notifiers.push(NotificationType::Twilio(webhook)); + notifiers.push(NotificationChannel::Twilio(webhook)); } if let Ok(log_level) = env::var(format!("{}LOG_NOTIFIER_LEVEL", env_prefix)) { match Level::from_str(&log_level) { - Ok(level) => notifiers.push(NotificationType::Log(level)), + Ok(level) => notifiers.push(NotificationChannel::Log(level)), Err(e) => warn!( "could not parse specified log notifier level string ({}): {}", log_level, e @@ -153,10 +160,10 @@ impl Notifier { self.notifiers.is_empty() } - pub fn send(&self, msg: &str) { + pub fn send(&self, msg: &str, notification_type: &NotificationType) { for notifier in &self.notifiers { match notifier { - NotificationType::Discord(webhook) => { + NotificationChannel::Discord(webhook) => { for line in msg.split('\n') { // Discord rate limiting is aggressive, limit to 1 message a second sleep(Duration::from_millis(1000)); @@ -183,14 +190,23 @@ impl Notifier { } } } - NotificationType::Slack(webhook) => { + NotificationChannel::Slack(webhook) => { let data = json!({ "text": msg }); if let Err(err) = self.client.post(webhook).json(&data).send() { warn!("Failed to send Slack message: {:?}", err); } } - NotificationType::PagerDuty(routing_key) => { - let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":"trigger"}); + NotificationChannel::PagerDuty(routing_key) => { + let event_action = match notification_type { + NotificationType::Trigger { incident: _ } => String::from("trigger"), + NotificationType::Resolve { incident: _ } => String::from("resolve"), + }; + let dedup_key = match notification_type { + NotificationType::Trigger { ref incident } => incident.clone().to_string(), + NotificationType::Resolve { ref incident } => incident.clone().to_string(), + }; + + let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":event_action,"dedup_key":dedup_key}); let url = "https://events.pagerduty.com/v2/enqueue"; if let Err(err) = self.client.post(url).json(&data).send() { @@ -198,7 +214,7 @@ impl Notifier { } } - NotificationType::Telegram(TelegramWebHook { chat_id, bot_token }) => { + NotificationChannel::Telegram(TelegramWebHook { chat_id, bot_token }) => { let data = json!({ "chat_id": chat_id, "text": msg }); let url = format!("https://api.telegram.org/bot{}/sendMessage", bot_token); @@ -207,7 +223,7 @@ impl Notifier { } } - NotificationType::Twilio(TwilioWebHook { + NotificationChannel::Twilio(TwilioWebHook { account, token, to, @@ -222,7 +238,7 @@ impl Notifier { warn!("Failed to send Twilio message: {:?}", err); } } - NotificationType::Log(level) => { + NotificationChannel::Log(level) => { log!(*level, "{}", msg) } } diff --git a/watchtower/src/main.rs b/watchtower/src/main.rs index 59f6c02615ffc2..0ee297925b224c 100644 --- a/watchtower/src/main.rs +++ b/watchtower/src/main.rs @@ -10,7 +10,7 @@ use { }, solana_cli_output::display::format_labeled_address, solana_metrics::{datapoint_error, datapoint_info}, - solana_notifier::Notifier, + solana_notifier::{NotificationType, Notifier}, solana_rpc_client::rpc_client::RpcClient, solana_rpc_client_api::{client_error, response::RpcVoteAccountStatus}, solana_sdk::{ @@ -244,6 +244,7 @@ fn main() -> Result<(), Box<dyn error::Error>> { let mut last_notification_msg = "".into(); let mut num_consecutive_failures = 0; let mut last_success = Instant::now(); + let mut incident = Hash::new_unique(); loop { let failure = match get_cluster_info(&config, &rpc_client) { @@ -373,7 +374,7 @@ fn main() -> Result<(), Box<dyn error::Error>> { if num_consecutive_failures > config.unhealthy_threshold { datapoint_info!("watchtower-sanity", ("ok", false, bool)); if last_notification_msg != notification_msg { - notifier.send(¬ification_msg); + notifier.send(¬ification_msg, &NotificationType::Trigger { incident }); } datapoint_error!( "watchtower-sanity-failure", @@ -399,14 +400,15 @@ fn main() -> Result<(), Box<dyn error::Error>> { humantime::format_duration(alarm_duration) ); info!("{}", all_clear_msg); - notifier.send(&format!( - "solana-watchtower{}: {}", - config.name_suffix, all_clear_msg - )); + notifier.send( + &format!("solana-watchtower{}: {}", config.name_suffix, all_clear_msg), + &NotificationType::Resolve { incident }, + ); } last_notification_msg = "".into(); last_success = Instant::now(); num_consecutive_failures = 0; + incident = Hash::new_unique(); } sleep(config.interval); }