Skip to content

Commit

Permalink
fix influx status check (solana-labs#31456)
Browse files (browse the repository at this point in the history)
  • Loading branch information
joeaba authored May 3, 2023
1 parent fa2a5a5 commit 74315d2
Showing 1 changed file with 19 additions and 34 deletions.
53 changes: 19 additions & 34 deletions metrics/influx-enterprise/status.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash -ex
#
# (Re)starts the InfluxDB/Chronograf containers
# (Re)starts the InfluxDB services
#

cd "$(dirname "$0")"
Expand All
@@ -18,53 +18,38 @@ check_service() {
local service=$1
shift
local servers=("$@")
local status="unknown"
local message=""

# Loop through the servers
for server in "${servers[@]}"; do
# Check if the service is running
if ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl is-active "$service" >/dev/null; then
# Service is running
status="running"
break
fi
done

# If the service is not running, send an alert to Discord and try to restart it
if [[ "$status" == "unknown" ]]; then
message="The $service service is not running on $server. Restarting..."
echo "$message"
curl -H "Content-Type: application/json" -d '{"content":"'"$message"'"}' "$DISCORD_WEBHOOK"
message="The $service service is running on $server."
echo "$message"
else
# Service is not running, try to restart it
message="The $service service is not running on $server. Restarting..."
echo "$message"
curl -H "Content-Type: application/json" -d '{"content":"'"$message"'"}' "$DISCORD_WEBHOOK"

for server in "${servers[@]}"; do
# Try to restart the service
ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl restart "$service"
sleep 10 # Wait for the service to start

if ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl is-active "$service" >/dev/null; then
# Service restarted successfully
status="restarted"
message="The $service service was restarted successfully on $server."
break
echo "$message"
curl -H "Content-Type: application/json" -d '{"content":"'"$message"'"}' "$DISCORD_WEBHOOK"
else
# Service failed to restart
message="ERROR: The $service service failed to restart on $server."
echo "$message"
curl -H "Content-Type: application/json" -d '{"content":"'"$message"', manual intervention is required."}' "$DISCORD_WEBHOOK"
curl -H "Content-Type: application/json" -d '{"routing_key":"<your-pagerduty-service-key>","event_action":"trigger","payload":{"summary":"The '"$service"' service failed to restart on '"$server"'.","severity":"critical"}}' "$PAGERDUTY_WEBHOOK"
fi
done
fi

# Send message to Discord and PagerDuty
case "$status" in
"running")
# No message is sent when the service is already running properly
;;
"restarted")
echo "$message"
curl -H "Content-Type: application/json" -d '{"content":"'"$message"'"}' "$DISCORD_WEBHOOK"
;;
*)
echo "ERROR: The '$service' service failed to restart on '$server'."
curl -H "Content-Type: application/json" -d '{"content":"ERROR: The '"$service"' service failed to restart on '"$server"', manual intervention is required."}' "$DISCORD_WEBHOOK"
curl -H "Content-Type: application/json" -d '{"routing_key":"<your-pagerduty-service-key>","event_action":"trigger","payload":{"summary":"The '"$service"' service failed to restart on '"$server"'.","severity":"critical"}}' "$PAGERDUTY_WEBHOOK"
;;
esac
fi
done
}

# Check the influxdb service
Expand Down

0 comments on commit 74315d2

Please sign in to comment.