Skip to content

Commit eedb92a

Browse files
refactor container status check (solana-labs#30998)
* refactor container status check
* remove blank line at EOF
* add pagerduty integration

Co-authored-by: axleiro <[email protected]>

* fix discord webhook reference
* remove webhook references

---------

Co-authored-by: axleiro <[email protected]>
1 parent a1149ec commit eedb92a

File tree

2 files changed

+41
-33
lines changed

2 files changed

+41
-33
lines changed

metrics/metrics-main/alertmanager.sh

100644 → 100755
+1 -1
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,6 @@ sudo docker run -it -d \
39 | 39 |   --user root:root \
40 | 40 |   --publish 9093:9093 \
41 | 41 |   --name=alertmanager \
42 |    | - --volume "PWD"/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
   | 42 | + --volume "$PWD"/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
43 | 43 |   --volume /etc/hosts:/etc/hosts \
44 | 44 |   $ALERTMANAGER_IMAGE

metrics/metrics-main/status.sh

100644 → 100755
+40 -32
Original file line number | Diff line number | Diff line change
@@ -1,32 +1,40 @@
 1 |    | - #!/bin/bash -ex
 2 |    | - #
 3 |    | - # Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers
 4 |    | - #
 5 |    | - cd "$(dirname "$0")"
 6 |    | -
 7 |    | - if [[ -z $HOST ]]; then
 8 |    | - HOST=metrics.solana.com
 9 |    | - fi
10 |    | - echo "HOST: $HOST"
11 |    | -
12 |    | - echo +++ status
13 |    | - (
14 |    | - set -x
15 |    | - pwd
16 |    | - sudo docker ps --no-trunc --size
17 |    | - df -h
18 |    | - free -h
19 |    | - uptime
20 |    | - )
21 |    | -
22 |    | - # If the container is not running state or exited state, then sent the notification on slack and redeploy the container again
23 |    | -
24 |    | - for container in chronograf_8889 grafana alertmanager alertmanager-discord prometheus chronograf kapacitor ; do
25 |    | - if [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" != "running" ] || [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" = "exited" ]; then
26 |    | - curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$SLACK_WEBHOOK"
27 |    | - curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$DISCORD_WEBHOOK"
28 |    | - echo "Starting up script"
29 |    | - sudo bash $container.sh
30 |    | - sleep 30
31 |    | - fi
32 |    | - done
   |  1 | + #!/bin/bash
   |  2 | +
   |  3 | + # List of containers
   |  4 | + containers=("chronograf_8889" "grafana" "alertmanager" "alertmanager-discord" "prometheus" "chronograf" "kapacitor")
   |  5 | +
   |  6 | + # Send a message to Discord
   |  7 | + send_discord_message() {
   |  8 | + local message="$1"
   |  9 | + curl -sS -H "Content-Type: application/json" -X POST -d "{\"content\": \"$message\"}" "$DISCORD_WEBHOOK"
   | 10 | + }
   | 11 | +
   | 12 | + # Send a critical alert to PagerDuty
   | 13 | + send_pagerduty_alert() {
   | 14 | + local description="$1"
   | 15 | + curl -sS -H "Content-Type: application/json" -X POST -d "{\"event_action\": \"trigger\", \"payload\": {\"summary\": \"$description\", \"source\": \"Docker Monitor\", \"severity\": \"critical\"}}" "$PAGERDUTY_WEBHOOK"
   | 16 | + }
   | 17 | +
   | 18 | + # Iterate over the containers and check their status
   | 19 | + for container in "${containers[@]}"; do
   | 20 | + container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)
   | 21 | +
   | 22 | + if [ "$container_status" != "running" ]; then
   | 23 | + send_discord_message "$container is down and it's being redeployed..."
   | 24 | +
   | 25 | + # Run the container.sh script to redeploy the container
   | 26 | + chmod +x "$container.sh"
   | 27 | + ./"$container.sh"
   | 28 | + sleep 10
   | 29 | +
   | 30 | + # Check the container status again
   | 31 | + container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)
   | 32 | +
   | 33 | + if [ "$container_status" != "running" ]; then
   | 34 | + send_discord_message "$container failed to redeploy and manual intervention is required"
   | 35 | + send_pagerduty_alert "$container failed to redeploy and manual intervention is required."
   | 36 | + else
   | 37 | + send_discord_message "$container has been redeployed successfully"
   | 38 | + fi
   | 39 | + fi
   | 40 | + done

0 commit comments

Comments
 (0)