From 459c9699b8fd9614145a1ebfdde2d462dc93d3c6 Mon Sep 17 00:00:00 2001
From: Michael <68944931+michaelh-laine@users.noreply.github.com>
Date: Wed, 5 Oct 2022 20:55:45 +0200
Subject: [PATCH] Resolve PagerDuty incident on All Clear instead of triggering
 new incident (#28232)

---
 Cargo.lock             |  1 +
 notifier/Cargo.toml    |  1 +
 notifier/src/lib.rs    | 48 ++++++++++++++++++++++++++++--------------
 watchtower/src/main.rs | 14 ++++++------
 4 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0f366de2938a43..fa31b33977a2dd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5668,6 +5668,7 @@ dependencies = [
  "log",
  "reqwest",
  "serde_json",
+ "solana-sdk 1.15.0",
 ]
 
 [[package]]
diff --git a/notifier/Cargo.toml b/notifier/Cargo.toml
index 02c2ae28a4226e..a4245badcf09f8 100644
--- a/notifier/Cargo.toml
+++ b/notifier/Cargo.toml
@@ -13,6 +13,7 @@ edition = "2021"
 log = "0.4.17"
 reqwest = { version = "0.11.12", default-features = false, features = ["blocking", "brotli", "deflate", "gzip", "rustls-tls", "json"] }
 serde_json = "1.0"
+solana-sdk = { path = "../sdk", version = "=1.15.0" }
 
 [lib]
 name = "solana_notifier"
diff --git a/notifier/src/lib.rs b/notifier/src/lib.rs
index 594f5938d87e2d..87c01bfebf29eb 100644
--- a/notifier/src/lib.rs
+++ b/notifier/src/lib.rs
@@ -27,6 +27,7 @@ use log::*;
 use {
     reqwest::{blocking::Client, StatusCode},
     serde_json::json,
+    solana_sdk::hash::Hash,
     std::{env, str::FromStr, thread::sleep, time::Duration},
 };
 
@@ -83,7 +84,7 @@ fn get_twilio_config() -> Result<Option<TwilioWebHook>, String> {
     Ok(Some(config))
 }
 
-enum NotificationType {
+enum NotificationChannel {
     Discord(String),
     Slack(String),
     PagerDuty(String),
@@ -92,9 +93,15 @@ enum NotificationType {
     Log(Level),
 }
 
+#[derive(Clone)]
+pub enum NotificationType {
+    Trigger { incident: Hash },
+    Resolve { incident: Hash },
+}
+
 pub struct Notifier {
     client: Client,
-    notifiers: Vec<NotificationType>,
+    notifiers: Vec<NotificationChannel>,
 }
 
 impl Notifier {
@@ -108,32 +115,32 @@ impl Notifier {
         let mut notifiers = vec![];
 
         if let Ok(webhook) = env::var(format!("{}DISCORD_WEBHOOK", env_prefix)) {
-            notifiers.push(NotificationType::Discord(webhook));
+            notifiers.push(NotificationChannel::Discord(webhook));
         }
         if let Ok(webhook) = env::var(format!("{}SLACK_WEBHOOK", env_prefix)) {
-            notifiers.push(NotificationType::Slack(webhook));
+            notifiers.push(NotificationChannel::Slack(webhook));
         }
         if let Ok(routing_key) = env::var(format!("{}PAGERDUTY_INTEGRATION_KEY", env_prefix)) {
-            notifiers.push(NotificationType::PagerDuty(routing_key));
+            notifiers.push(NotificationChannel::PagerDuty(routing_key));
         }
 
         if let (Ok(bot_token), Ok(chat_id)) = (
             env::var(format!("{}TELEGRAM_BOT_TOKEN", env_prefix)),
             env::var(format!("{}TELEGRAM_CHAT_ID", env_prefix)),
         ) {
-            notifiers.push(NotificationType::Telegram(TelegramWebHook {
+            notifiers.push(NotificationChannel::Telegram(TelegramWebHook {
                 bot_token,
                 chat_id,
             }));
         }
 
         if let Ok(Some(webhook)) = get_twilio_config() {
-            notifiers.push(NotificationType::Twilio(webhook));
+            notifiers.push(NotificationChannel::Twilio(webhook));
         }
 
         if let Ok(log_level) = env::var(format!("{}LOG_NOTIFIER_LEVEL", env_prefix)) {
             match Level::from_str(&log_level) {
-                Ok(level) => notifiers.push(NotificationType::Log(level)),
+                Ok(level) => notifiers.push(NotificationChannel::Log(level)),
                 Err(e) => warn!(
                     "could not parse specified log notifier level string ({}): {}",
                     log_level, e
@@ -153,10 +160,10 @@ impl Notifier {
         self.notifiers.is_empty()
     }
 
-    pub fn send(&self, msg: &str) {
+    pub fn send(&self, msg: &str, notification_type: &NotificationType) {
         for notifier in &self.notifiers {
             match notifier {
-                NotificationType::Discord(webhook) => {
+                NotificationChannel::Discord(webhook) => {
                     for line in msg.split('\n') {
                         // Discord rate limiting is aggressive, limit to 1 message a second
                         sleep(Duration::from_millis(1000));
@@ -183,14 +190,23 @@ impl Notifier {
                         }
                     }
                 }
-                NotificationType::Slack(webhook) => {
+                NotificationChannel::Slack(webhook) => {
                     let data = json!({ "text": msg });
                     if let Err(err) = self.client.post(webhook).json(&data).send() {
                         warn!("Failed to send Slack message: {:?}", err);
                     }
                 }
-                NotificationType::PagerDuty(routing_key) => {
-                    let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":"trigger"});
+                NotificationChannel::PagerDuty(routing_key) => {
+                    let event_action = match notification_type {
+                        NotificationType::Trigger { incident: _ } => String::from("trigger"),
+                        NotificationType::Resolve { incident: _ } => String::from("resolve"),
+                    };
+                    let dedup_key = match notification_type {
+                        NotificationType::Trigger { ref incident } => incident.clone().to_string(),
+                        NotificationType::Resolve { ref incident } => incident.clone().to_string(),
+                    };
+
+                    let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":event_action,"dedup_key":dedup_key});
                     let url = "https://events.pagerduty.com/v2/enqueue";
 
                     if let Err(err) = self.client.post(url).json(&data).send() {
@@ -198,7 +214,7 @@ impl Notifier {
                     }
                 }
 
-                NotificationType::Telegram(TelegramWebHook { chat_id, bot_token }) => {
+                NotificationChannel::Telegram(TelegramWebHook { chat_id, bot_token }) => {
                     let data = json!({ "chat_id": chat_id, "text": msg });
                     let url = format!("https://api.telegram.org/bot{}/sendMessage", bot_token);
 
@@ -207,7 +223,7 @@ impl Notifier {
                     }
                 }
 
-                NotificationType::Twilio(TwilioWebHook {
+                NotificationChannel::Twilio(TwilioWebHook {
                     account,
                     token,
                     to,
@@ -222,7 +238,7 @@ impl Notifier {
                         warn!("Failed to send Twilio message: {:?}", err);
                     }
                 }
-                NotificationType::Log(level) => {
+                NotificationChannel::Log(level) => {
                     log!(*level, "{}", msg)
                 }
             }
diff --git a/watchtower/src/main.rs b/watchtower/src/main.rs
index 59f6c02615ffc2..0ee297925b224c 100644
--- a/watchtower/src/main.rs
+++ b/watchtower/src/main.rs
@@ -10,7 +10,7 @@ use {
     },
     solana_cli_output::display::format_labeled_address,
     solana_metrics::{datapoint_error, datapoint_info},
-    solana_notifier::Notifier,
+    solana_notifier::{NotificationType, Notifier},
     solana_rpc_client::rpc_client::RpcClient,
     solana_rpc_client_api::{client_error, response::RpcVoteAccountStatus},
     solana_sdk::{
@@ -244,6 +244,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
     let mut last_notification_msg = "".into();
     let mut num_consecutive_failures = 0;
     let mut last_success = Instant::now();
+    let mut incident = Hash::new_unique();
 
     loop {
         let failure = match get_cluster_info(&config, &rpc_client) {
@@ -373,7 +374,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
             if num_consecutive_failures > config.unhealthy_threshold {
                 datapoint_info!("watchtower-sanity", ("ok", false, bool));
                 if last_notification_msg != notification_msg {
-                    notifier.send(&notification_msg);
+                    notifier.send(&notification_msg, &NotificationType::Trigger { incident });
                 }
                 datapoint_error!(
                     "watchtower-sanity-failure",
@@ -399,14 +400,15 @@ fn main() -> Result<(), Box<dyn error::Error>> {
                     humantime::format_duration(alarm_duration)
                 );
                 info!("{}", all_clear_msg);
-                notifier.send(&format!(
-                    "solana-watchtower{}: {}",
-                    config.name_suffix, all_clear_msg
-                ));
+                notifier.send(
+                    &format!("solana-watchtower{}: {}", config.name_suffix, all_clear_msg),
+                    &NotificationType::Resolve { incident },
+                );
             }
             last_notification_msg = "".into();
             last_success = Instant::now();
             num_consecutive_failures = 0;
+            incident = Hash::new_unique();
         }
         sleep(config.interval);
     }