From 30ae028774f389f21bdef0a7fa03c90e178cfa03 Mon Sep 17 00:00:00 2001
From: Robert Metzger
Date: Fri, 13 Nov 2020 07:50:57 +0100
Subject: [PATCH] [FLINK-17470] Send sigkill to hanging processes in standalone scripts

---
 .../src/main/flink-bin/bin/flink-daemon.sh    | 34 ++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/flink-dist/src/main/flink-bin/bin/flink-daemon.sh b/flink-dist/src/main/flink-bin/bin/flink-daemon.sh
index 50e51d33e5035..cbbc0ce3e6795 100644
--- a/flink-dist/src/main/flink-bin/bin/flink-daemon.sh
+++ b/flink-dist/src/main/flink-bin/bin/flink-daemon.sh
@@ -88,6 +88,36 @@ out="${FLINK_LOG_PREFIX}.out"
 
 log_setting=("-Dlog.file=${log}" "-Dlog4j.configuration=file:${FLINK_CONF_DIR}/log4j.properties" "-Dlog4j.configurationFile=file:${FLINK_CONF_DIR}/log4j.properties" "-Dlogback.configurationFile=file:${FLINK_CONF_DIR}/logback.xml")
 
+# Stops a process: sends SIGTERM first and escalates to SIGKILL if the process
+# is still alive after a grace period of 10 seconds.
+#
+# Arguments:
+#   $1 - pid of the process to stop
+#   $2 - daemon name, only used in the log message
+# Outputs:
+#   Writes a message to stdout when SIGKILL has to be sent.
+function guaranteed_kill {
+  local to_stop_pid=$1
+  local daemon=$2
+  local waited=0
+
+  # send sigterm for graceful shutdown
+  kill "$to_stop_pid"
+  # wait 10 seconds for process to stop. By default, Flink kills the JVM 5 seconds after sigterm.
+  # Poll with "kill -0" instead of "timeout ... tail --pid": both are GNU-only, and where they
+  # are missing the exit status is never 124, so the sigkill fallback was silently skipped.
+  while kill -0 "$to_stop_pid" > /dev/null 2>&1; do
+    if [ "$waited" -ge 10 ]; then
+      echo "Daemon $daemon didn't stop within 10 seconds. Killing it."
+      # send sigkill
+      kill -9 "$to_stop_pid"
+      return
+    fi
+    sleep 1
+    waited=$((waited + 1))
+  done
+}
+
 case $STARTSTOP in
 
     (start)
@@ -142,7 +172,7 @@ case $STARTSTOP in
 
                 if kill -0 $to_stop > /dev/null 2>&1; then
                     echo "Stopping $DAEMON daemon (pid: $to_stop) on host $HOSTNAME."
-                    kill $to_stop
+                    guaranteed_kill "$to_stop" "$DAEMON"
                 else
                     echo "No $DAEMON daemon (pid: $to_stop) is running anymore on $HOSTNAME."
                 fi
@@ -159,7 +189,7 @@ case $STARTSTOP in
             while read to_stop; do
                 if kill -0 $to_stop > /dev/null 2>&1; then
                     echo "Stopping $DAEMON daemon (pid: $to_stop) on host $HOSTNAME."
-                    kill $to_stop
+                    guaranteed_kill "$to_stop" "$DAEMON"
                 else
                     echo "Skipping $DAEMON daemon (pid: $to_stop), because it is not running anymore on $HOSTNAME."
                 fi