|
1 |
| -#!/bin/bash -ex |
2 |
| -# |
3 |
| -# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers |
4 |
| -# |
5 |
| -cd "$(dirname "$0")" |
6 |
| - |
7 |
| -if [[ -z $HOST ]]; then |
8 |
| - HOST=metrics.solana.com |
9 |
| -fi |
10 |
| -echo "HOST: $HOST" |
11 |
| - |
12 |
| -echo +++ status |
13 |
| -( |
14 |
| - set -x |
15 |
| - pwd |
16 |
| - sudo docker ps --no-trunc --size |
17 |
| - df -h |
18 |
| - free -h |
19 |
| - uptime |
20 |
| -) |
21 |
| - |
22 |
| -# If the container is not in the running state or is in the exited state, then send a notification on Slack and redeploy the container again |
23 |
| - |
24 |
| -for container in chronograf_8889 grafana alertmanager alertmanager-discord prometheus chronograf kapacitor ; do |
25 |
| - if [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" != "running" ] || [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" = "exited" ]; then |
26 |
| - curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$SLACK_WEBHOOK" |
27 |
| - curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$DISCORD_WEBHOOK" |
28 |
| - echo "Starting up script" |
29 |
| - sudo bash $container.sh |
30 |
| - sleep 30 |
31 |
| - fi |
32 |
| - done |
| 1 | +#!/bin/bash |
| 2 | +# |
| 3 | +# Checks each monitored container and redeploys any that is not running. |
| 4 | +# Posts progress to Discord and sends a PagerDuty alert when the container |
| 5 | +# is still not running after the redeploy attempt. |
| 6 | +# |
| 7 | +# Environment: |
| 8 | +#   DISCORD_WEBHOOK   - URL that Discord messages are POSTed to |
| 9 | +#   PAGERDUTY_WEBHOOK - URL that PagerDuty alerts are POSTed to |
| 10 | +# |
| 11 | + |
| 12 | +# The <container>.sh redeploy scripts are invoked by relative path, so run from |
| 13 | +# this script's directory rather than whatever directory the caller was in. |
| 14 | +cd "$(dirname "$0")" || exit 1 |
| 15 | + |
| 16 | +# List of containers |
| 17 | +containers=("chronograf_8889" "grafana" "alertmanager" "alertmanager-discord" "prometheus" "chronograf" "kapacitor") |
| 18 | + |
| 19 | +# Escape backslashes and double quotes so that $1 can be embedded in a JSON string |
| 20 | +json_escape() { |
| 21 | +  local text="$1" |
| 22 | +  text=${text//\\/\\\\} |
| 23 | +  text=${text//\"/\\\"} |
| 24 | +  printf '%s' "$text" |
| 25 | +} |
| 26 | + |
| 27 | +# Send a message to Discord |
| 28 | +#   $1 - message text, sent as the "content" field |
| 29 | +send_discord_message() { |
| 30 | +  local message |
| 31 | +  message=$(json_escape "$1") |
| 32 | +  curl -sS -H "Content-Type: application/json" -X POST -d "{\"content\": \"$message\"}" "$DISCORD_WEBHOOK" |
| 33 | +} |
| 34 | + |
| 35 | +# Send a critical alert to PagerDuty |
| 36 | +#   $1 - alert summary |
| 37 | +# NOTE(review): the PagerDuty Events API v2 expects a routing_key in the request |
| 38 | +# body and none is sent here - confirm PAGERDUTY_WEBHOOK does not require one. |
| 39 | +send_pagerduty_alert() { |
| 40 | +  local description |
| 41 | +  description=$(json_escape "$1") |
| 42 | +  curl -sS -H "Content-Type: application/json" -X POST -d "{\"event_action\": \"trigger\", \"payload\": {\"summary\": \"$description\", \"source\": \"Docker Monitor\", \"severity\": \"critical\"}}" "$PAGERDUTY_WEBHOOK" |
| 43 | +} |
| 44 | + |
| 45 | +# Succeed only when docker reports the status of container $1 as "running". |
| 46 | +# docker's error output is discarded on purpose: a container that cannot be |
| 47 | +# inspected yields an empty status and is treated as not running. |
| 48 | +container_is_running() { |
| 49 | +  local status |
| 50 | +  status=$(docker inspect --format '{{.State.Status}}' "$1" 2>/dev/null) |
| 51 | +  [[ "$status" == "running" ]] |
| 52 | +} |
| 53 | + |
| 54 | +# Iterate over the containers and check their status |
| 55 | +for container in "${containers[@]}"; do |
| 56 | +  if container_is_running "$container"; then |
| 57 | +    continue |
| 58 | +  fi |
| 59 | + |
| 60 | +  send_discord_message "$container is down and it's being redeployed..." |
| 61 | + |
| 62 | +  # Run the container.sh script to redeploy the container |
| 63 | +  chmod +x "$container.sh" |
| 64 | +  ./"$container.sh" |
| 65 | +  # Wait before checking the container status again |
| 66 | +  sleep 10 |
| 67 | + |
| 68 | +  if container_is_running "$container"; then |
| 69 | +    send_discord_message "$container has been redeployed successfully" |
| 70 | +  else |
| 71 | +    send_discord_message "$container failed to redeploy and manual intervention is required" |
| 72 | +    send_pagerduty_alert "$container failed to redeploy and manual intervention is required." |
| 73 | +  fi |
| 74 | +done |
0 commit comments