virt-launcher: fix exit pod race condition
This commit fixes a race condition that would cause a VMI compute pod to exit and end up in an erroneous 'Failed' state even though the pod actually terminated gracefully.

The race is caused by having two goroutines doing a wait() on the same PID. A fix for this already existed, but it did not cover the whole critical section, so the race persisted, just with a lower failure rate.

One example of the race is the following: the goroutine executing the SIGCHLD signal handler completes the Wait4() and is about to write the exit status to the channel.

goroutine 1
```
case syscall.SIGCHLD:
	var wstatus syscall.WaitStatus
	wpid, err := syscall.Wait4(-1, &wstatus, syscall.WNOHANG, nil)
	if err != nil {
		log.Log.Reason(err).Errorf("Failed to reap process %d", wpid)
	}

	// **EXECUTION HERE**
	if wpid == cmd.Process.Pid {
		exitStatus <- wstatus
	}
```

Meanwhile, a second goroutine gets unblocked from Wait() and continues its execution after finding nothing in the exitStatus channel.

goroutine 2
```
exitCode := 0
if err := cmd.Wait(); err != nil {
	select {
	case status := <-exitStatus:
		exitCode = int(status)
	default:
		// **EXECUTION HERE**
		exitCode = 1
		if exiterr, ok := err.(*exec.ExitError); ok {
```

This causes the VMI pod to report `exitCode=1` and thus end up in the `Failed` state: internally, the `Wait()` function first calls `waitid()` and then `wait4()`, and both of these functions can fail if the SIGCHLD handler reaps the child process first, leaving no waitable PID in the system.

The fix has been tested with the following script:
```
function drain {
	NODE=$1
	kubectl drain $NODE --ignore-daemonsets --delete-local-data &> /dev/null
	sleep 0.5
	kubectl uncordon $NODE
}

function check-failed-pod {
	pods=$(kubectl get pod --field-selector 'status.phase==Failed' -o json | jq '.items | length')
	if [[ $pods -gt 0 ]]; then
		return 0
	else
		return 1
	fi
}

nodes=(node01 node02)

i=0
while true; do
	for node in "${nodes[@]}"; do
		echo "iteration $i"
		drain $node
		if check-failed-pod; then
			echo "Failed pod detected!"
			exit 1
		fi
		i=$((i + 1))
	done
done
```

without this commit: 1 failure every ~7 node drains
with this commit: no failures in over 200 iterations

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1943164

Signed-off-by: Antonio Cardace <[email protected]>
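For illustration, below is a minimal, self-contained sketch of the locking pattern the message describes: a single mutex covers both the Wait4() reap and the channel send in the SIGCHLD handler, so that whenever cmd.Wait() fails because the child was already reaped, acquiring that same mutex guarantees the exit status is already queued in the (buffered) channel. This is an assumed reconstruction for demonstration, not the actual KubeVirt patch; the program structure, the `reapLock` name, and the `sleep` child process are all illustrative.

```
package main

import (
	"log"
	"os"
	"os/exec"
	"os/signal"
	"sync"
	"syscall"
)

func main() {
	cmd := exec.Command("sleep", "1")
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}

	sigs := make(chan os.Signal, 8)
	signal.Notify(sigs, syscall.SIGCHLD)

	// Buffered, so the handler can send without blocking while holding the lock.
	exitStatus := make(chan syscall.WaitStatus, 1)
	var reapLock sync.Mutex // covers the reap *and* the channel send

	go func() {
		for range sigs {
			reapLock.Lock()
			var wstatus syscall.WaitStatus
			wpid, err := syscall.Wait4(-1, &wstatus, syscall.WNOHANG, nil)
			// Holding the lock until after the send closes the window in which
			// the child has been reaped but its status is not yet in the channel.
			if err == nil && wpid == cmd.Process.Pid {
				exitStatus <- wstatus
			}
			reapLock.Unlock()
		}
	}()

	exitCode := 0
	if err := cmd.Wait(); err != nil {
		// If Wait() failed because the handler already reaped the child,
		// acquiring the lock guarantees the handler's send has completed.
		reapLock.Lock()
		select {
		case status := <-exitStatus:
			exitCode = status.ExitStatus()
		default:
			exitCode = 1 // a genuine failure, not the reaping race
		}
		reapLock.Unlock()
	}
	log.Printf("child exited with code %d", exitCode)
}
```

The channel must be buffered so the handler never blocks on the send while holding the lock; extending the critical section over the send is exactly what distinguishes this from the earlier partial fix, which only serialized the Wait4() calls.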