Skip to content

Commit

Permalink
Fix graceful shutdown hotlooping
Browse files Browse the repository at this point in the history
  • Loading branch information
rmohr committed Sep 24, 2018
1 parent 02fecde commit 0c8767c
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 64 deletions.
72 changes: 41 additions & 31 deletions cmd/virt-launcher/virt-launcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import (
"kubevirt.io/kubevirt/pkg/virt-launcher"
notifyclient "kubevirt.io/kubevirt/pkg/virt-launcher/notify-client"
"kubevirt.io/kubevirt/pkg/virt-launcher/virtwrap"
"kubevirt.io/kubevirt/pkg/virt-launcher/virtwrap/api"
virtcli "kubevirt.io/kubevirt/pkg/virt-launcher/virtwrap/cli"
"kubevirt.io/kubevirt/pkg/virt-launcher/virtwrap/cmd-server"
"kubevirt.io/kubevirt/pkg/virt-launcher/virtwrap/util"
Expand Down Expand Up @@ -207,29 +208,22 @@ func initializeDirs(virtShareDir string,
}
}

func waitForDomainUUID(timeout time.Duration, domainManager virtwrap.DomainManager) string {
start := time.Now()
func waitForDomainUUID(timeout time.Duration, events chan watch.Event, stop chan struct{}) *api.Domain {
ticker := time.NewTicker(timeout).C

for time.Since(start) < timeout {
time.Sleep(time.Second)
list, err := domainManager.ListAllDomains()
if err != nil {
log.Log.Reason(err).Error("failed to retrieve domains from libvirt")
continue
}

if len(list) == 0 {
continue
}

domain := list[0]
if domain.Spec.UUID != "" {
select {
case <-ticker:
panic(fmt.Errorf("timed out waiting for domain to be defined"))
case e := <-events:
if e.Object != nil {
domain := e.Object.(*api.Domain)
log.Log.Infof("Detected domain with UUID %s", domain.Spec.UUID)
return domain.Spec.UUID
return domain
}
case <-stop:
return nil
}

panic(fmt.Errorf("timed out waiting for domain to be defined"))
return nil
}

func waitForFinalNotify(deleteNotificationSent chan watch.Event,
Expand All @@ -246,7 +240,7 @@ func waitForFinalNotify(deleteNotificationSent chan watch.Event,
log.Log.Info("Waiting on final notifications to be sent to virt-handler.")

// We don't want to block here forever. If the delete does not occur, that could mean
// something is wrong with libvirt. In this situation, wirt-handler will detect that
// something is wrong with libvirt. In this situation, virt-handler will detect that
// the domain went away eventually, however the exit status will be unknown.
timeout := time.After(30 * time.Second)
select {
Expand Down Expand Up @@ -350,30 +344,46 @@ func main() {
syscall.Kill(pid, syscall.SIGTERM)
}
}
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt,
syscall.SIGHUP,
syscall.SIGINT,
syscall.SIGTERM,
syscall.SIGQUIT,
)

signalStopChan := make(chan struct{})
go func() {
s := <-c
log.Log.Infof("Received signal %s", s.String())
close(signalStopChan)
}()

deleteNotificationSent := make(chan watch.Event, 10)
events := make(chan watch.Event, 10)
// Send domain notifications to virt-handler
startDomainEventMonitoring(*virtShareDir, domainConn, deleteNotificationSent)
startDomainEventMonitoring(*virtShareDir, domainConn, events)

// Marking Ready allows the container's readiness check to pass.
// This informs virt-controller that virt-launcher is ready to handle
// managing virtual machines.
markReady(*readinessFile)

domainUUID := waitForDomainUUID(*qemuTimeout, domainManager)
mon := virtlauncher.NewProcessMonitor(domainUUID,
gracefulShutdownTriggerFile,
*gracePeriodSeconds,
shutdownCallback)
domain := waitForDomainUUID(*qemuTimeout, events, signalStopChan)
if domain != nil {
mon := virtlauncher.NewProcessMonitor(domain.Spec.UUID,
gracefulShutdownTriggerFile,
*gracePeriodSeconds,
shutdownCallback)

// This is a wait loop that monitors the qemu pid. When the pid
// exits, the wait loop breaks.
mon.RunForever(*qemuTimeout)
// This is a wait loop that monitors the qemu pid. When the pid
// exits, the wait loop breaks.
mon.RunForever(*qemuTimeout, signalStopChan)
}

// Now that the pid has exited, we wait for the final delete notification to be
// sent back to virt-handler. This delete notification contains the reason the
// domain exited.
waitForFinalNotify(deleteNotificationSent, domainManager, vm)
waitForFinalNotify(events, domainManager, vm)

log.Log.Info("Exiting...")
}
Expand Down
18 changes: 16 additions & 2 deletions pkg/virt-handler/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ type VirtualMachineController struct {
// If the grace period has started but not expired, timeLeft represents
// the time in seconds left until the period expires.
// If the grace period has not started, timeLeft will be set to -1.
func (d *VirtualMachineController) hasGracePeriodExpired(dom *api.Domain) (hasExpired bool, timeLeft int) {
func (d *VirtualMachineController) hasGracePeriodExpired(dom *api.Domain) (hasExpired bool, timeLeft int64) {

hasExpired = false
timeLeft = 0
Expand Down Expand Up @@ -165,7 +165,7 @@ func (d *VirtualMachineController) hasGracePeriodExpired(dom *api.Domain) (hasEx
return
}

timeLeft = int(gracePeriod - diff)
timeLeft = int64(gracePeriod - diff)
if timeLeft < 1 {
timeLeft = 1
}
Expand Down Expand Up @@ -578,6 +578,20 @@ func (d *VirtualMachineController) processVmShutdown(vmi *v1.VirtualMachineInsta
}

log.Log.Object(vmi).Infof("Signaled graceful shutdown for %s", vmi.GetObjectMeta().GetName())

// Make sure that we don't hot-loop in case we send the first domain notification
if timeLeft == -1 {
timeLeft = 5
if vmi.Spec.TerminationGracePeriodSeconds != nil && *vmi.Spec.TerminationGracePeriodSeconds < timeLeft {
timeLeft = *vmi.Spec.TerminationGracePeriodSeconds
}
}
// In case we have a long grace period, we want to resend the graceful shutdown every 5 seconds
// That's important since a booting OS can miss ACPI signals
if timeLeft > 5 {
timeLeft = 5
}

// pending graceful shutdown.
d.Queue.AddAfter(controller.VirtualMachineKey(vmi), time.Duration(timeLeft)*time.Second)
d.recorder.Event(vmi, k8sv1.EventTypeNormal, v1.ShuttingDown.String(), "Signaled Graceful Shutdown")
Expand Down
24 changes: 7 additions & 17 deletions pkg/virt-launcher/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,17 @@ import (
"fmt"
"io/ioutil"
"os"
"os/signal"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"

"kubevirt.io/kubevirt/pkg/api/v1"
diskutils "kubevirt.io/kubevirt/pkg/ephemeral-disk-utils"
"kubevirt.io/kubevirt/pkg/log"
"kubevirt.io/kubevirt/pkg/precond"
cmdclient "kubevirt.io/kubevirt/pkg/virt-handler/cmd-client"
watchdog "kubevirt.io/kubevirt/pkg/watchdog"
"kubevirt.io/kubevirt/pkg/virt-handler/cmd-client"
"kubevirt.io/kubevirt/pkg/watchdog"
)

type OnShutdownCallback func(pid int)
Expand All @@ -53,7 +51,7 @@ type monitor struct {
}

type ProcessMonitor interface {
RunForever(startTimeout time.Duration)
RunForever(startTimeout time.Duration, stopChan chan struct{})
}

func GracefulShutdownTriggerDir(baseDir string) string {
Expand Down Expand Up @@ -242,7 +240,7 @@ func (mon *monitor) refresh() {
return
}

func (mon *monitor) monitorLoop(startTimeout time.Duration, signalChan chan os.Signal) {
func (mon *monitor) monitorLoop(startTimeout time.Duration, stopChan chan struct{}) {
// random value, no real rationale
rate := 1 * time.Second

Expand All @@ -262,8 +260,7 @@ func (mon *monitor) monitorLoop(startTimeout time.Duration, signalChan chan os.S
select {
case <-ticker.C:
mon.refresh()
case s := <-signalChan:
log.Log.Infof("Received signal %d.", s)
case <-stopChan:

if mon.gracePeriodStartTime != 0 {
continue
Expand All @@ -280,16 +277,9 @@ func (mon *monitor) monitorLoop(startTimeout time.Duration, signalChan chan os.S
ticker.Stop()
}

func (mon *monitor) RunForever(startTimeout time.Duration) {
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt,
syscall.SIGHUP,
syscall.SIGINT,
syscall.SIGTERM,
syscall.SIGQUIT,
)
func (mon *monitor) RunForever(startTimeout time.Duration, stopChan chan struct{}) {

mon.monitorLoop(startTimeout, c)
mon.monitorLoop(startTimeout, stopChan)
}

func readProcCmdline(pathname string) ([]string, error) {
Expand Down
21 changes: 11 additions & 10 deletions pkg/virt-launcher/monitor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,11 @@ var _ = Describe("VirtLauncher", func() {
})

It("verify start timeout works", func() {
stopChan := make(chan struct{})
done := make(chan string)

go func() {
mon.RunForever(time.Second)
mon.RunForever(time.Second, stopChan)
done <- "exit"
}()
noExitCheck := time.After(3 * time.Second)
Expand All @@ -161,17 +162,17 @@ var _ = Describe("VirtLauncher", func() {
})

It("verify monitor loop exits when signal arrives and no pid is present", func() {
signalChannel := make(chan os.Signal, 1)
stopChan := make(chan struct{})
done := make(chan string)

go func() {
mon.monitorLoop(1*time.Second, signalChannel)
mon.monitorLoop(1*time.Second, stopChan)
done <- "exit"
}()

time.Sleep(time.Second)

signalChannel <- syscall.SIGQUIT
close(stopChan)
noExitCheck := time.After(5 * time.Second)
exited := false

Expand All @@ -185,15 +186,15 @@ var _ = Describe("VirtLauncher", func() {
})

It("verify graceful shutdown trigger works", func() {
signalChannel := make(chan os.Signal, 1)
stopChan := make(chan struct{})
done := make(chan string)

StartProcess()
VerifyProcessStarted()
go func() { CleanupProcess() }()

go func() {
mon.monitorLoop(1*time.Second, signalChannel)
mon.monitorLoop(1*time.Second, stopChan)
done <- "exit"
}()

Expand All @@ -203,7 +204,7 @@ var _ = Describe("VirtLauncher", func() {
Expect(err).ToNot(HaveOccurred())
Expect(exists).To(Equal(false))

signalChannel <- syscall.SIGQUIT
close(stopChan)

time.Sleep(time.Second)

Expand All @@ -213,19 +214,19 @@ var _ = Describe("VirtLauncher", func() {
})

It("verify grace period works", func() {
signalChannel := make(chan os.Signal, 1)
stopChan := make(chan struct{})
done := make(chan string)

StartProcess()
VerifyProcessStarted()
go func() { CleanupProcess() }()
go func() {
mon.gracePeriod = 1
mon.monitorLoop(1*time.Second, signalChannel)
mon.monitorLoop(1*time.Second, stopChan)
done <- "exit"
}()

signalChannel <- syscall.SIGTERM
close(stopChan)
noExitCheck := time.After(5 * time.Second)
exited := false

Expand Down
8 changes: 5 additions & 3 deletions pkg/virt-launcher/notify-client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func newWatchEventError(err error) watch.Event {
return watch.Event{Type: watch.Error, Object: &metav1.Status{Status: metav1.StatusFailure, Message: err.Error()}}
}

func libvirtEventCallback(d cli.VirDomain, event *libvirt.DomainEventLifecycle, client *DomainEventClient, deleteNotificationSent chan watch.Event) {
func libvirtEventCallback(d cli.VirDomain, event *libvirt.DomainEventLifecycle, client *DomainEventClient, events chan watch.Event) {

// check for reconnects, and emit an error to force a resync
if event == nil {
Expand Down Expand Up @@ -121,10 +121,12 @@ func libvirtEventCallback(d cli.VirDomain, event *libvirt.DomainEventLifecycle,
case api.ReasonNonExistent:
event := watch.Event{Type: watch.Deleted, Object: domain}
client.SendDomainEvent(event)
deleteNotificationSent <- event
events <- event
default:
if event.Event == libvirt.DOMAIN_EVENT_DEFINED && libvirt.DomainEventDefinedDetailType(event.Detail) == libvirt.DOMAIN_EVENT_DEFINED_ADDED {
client.SendDomainEvent(watch.Event{Type: watch.Added, Object: domain})
event := watch.Event{Type: watch.Added, Object: domain}
client.SendDomainEvent(event)
events <- event
} else {
client.SendDomainEvent(watch.Event{Type: watch.Modified, Object: domain})
}
Expand Down
3 changes: 2 additions & 1 deletion tests/vmi_lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1034,7 +1034,8 @@ var _ = Describe("VMIlifecycle", func() {
// Delete the VirtualMachineInstance and wait for the confirmation of the delete
By("Deleting the VirtualMachineInstance")
Expect(virtClient.VirtualMachineInstance(vmi.Namespace).Delete(obj.Name, &metav1.DeleteOptions{})).To(Succeed())
tests.NewObjectEventWatcher(obj).Timeout(60*time.Second).SinceWatchedObjectResourceVersion().WaitFor(tests.NormalEvent, v1.Deleted)
event := tests.NewObjectEventWatcher(obj).SinceWatchedObjectResourceVersion().Timeout(60*time.Second).WaitFor(tests.NormalEvent, v1.Deleted)
Expect(event).ToNot(BeNil())

// Check if the graceful shutdown was logged
By("Checking that virt-handler logs VirtualMachineInstance graceful shutdown")
Expand Down

0 comments on commit 0c8767c

Please sign in to comment.