Merge pull request kubevirt#526 from davidvossel/vm-launch-flow-v3
VM Grace Period and Shutdown Flow Rework
rmohr authored Dec 11, 2017
2 parents 129a90c + 499f0c3 commit b7af8a6
Showing 28 changed files with 866 additions and 138 deletions.
5 changes: 5 additions & 0 deletions api/openapi-spec/swagger.json
@@ -3197,6 +3197,11 @@
"nodeSelector": {
"description": "If labels are specified, only nodes marked with all of these labels are considered when scheduling the VM.",
"type": "object"
},
"terminationGracePeriodSeconds": {
"description": "Grace period observed after signalling a VM to stop after which the VM is force terminated.",
"type": "integer",
"format": "int64"
}
}
},
1 change: 1 addition & 0 deletions cluster/vm-atomic.yaml
@@ -3,6 +3,7 @@ metadata:
apiVersion: kubevirt.io/v1alpha1
kind: VirtualMachine
spec:
terminationGracePeriodSeconds: 0
domain:
devices:
disks:
1 change: 1 addition & 0 deletions cluster/vm-ephemeral.yaml
@@ -3,6 +3,7 @@ metadata:
apiVersion: kubevirt.io/v1alpha1
kind: VirtualMachine
spec:
terminationGracePeriodSeconds: 0
domain:
devices:
graphics:
1 change: 1 addition & 0 deletions cluster/vm-iscsi-auth.yaml
@@ -3,6 +3,7 @@ kind: VirtualMachine
metadata:
name: testvm
spec:
terminationGracePeriodSeconds: 0
domain:
devices:
graphics:
5 changes: 5 additions & 0 deletions cluster/vm-nocloud.yaml
@@ -3,8 +3,13 @@ metadata:
apiVersion: kubevirt.io/v1alpha1
kind: VirtualMachine
spec:
terminationGracePeriodSeconds: 5
domain:
devices:
graphics:
- type: spice
consoles:
- type: pty
disks:
- type: RegistryDisk:v1alpha
source:
1 change: 1 addition & 0 deletions cluster/vm.yaml
@@ -3,6 +3,7 @@ kind: VirtualMachine
metadata:
name: testvm
spec:
terminationGracePeriodSeconds: 0
domain:
devices:
graphics:
4 changes: 2 additions & 2 deletions cmd/fake-qemu-process/fake-qemu.go
@@ -30,7 +30,7 @@ import (
func main() {
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt,
syscall.SIGQUIT,
syscall.SIGTERM,
)

fmt.Printf("Started fake qemu process\n")
@@ -39,7 +39,7 @@ func main() {
select {
case <-timeout:
case <-c:
time.Sleep(2 * time.Second)
time.Sleep(1 * time.Second)
}

fmt.Printf("Exit fake qemu process\n")
12 changes: 12 additions & 0 deletions cmd/virt-handler/virt-handler.go
@@ -41,6 +41,7 @@ import (
cloudinit "kubevirt.io/kubevirt/pkg/cloud-init"
configdisk "kubevirt.io/kubevirt/pkg/config-disk"
"kubevirt.io/kubevirt/pkg/controller"
inotifyinformer "kubevirt.io/kubevirt/pkg/inotify-informer"
"kubevirt.io/kubevirt/pkg/kubecli"
"kubevirt.io/kubevirt/pkg/log"
registrydisk "kubevirt.io/kubevirt/pkg/registry-disk"
@@ -52,6 +53,7 @@ import (
virtcache "kubevirt.io/kubevirt/pkg/virt-handler/virtwrap/cache"
virtcli "kubevirt.io/kubevirt/pkg/virt-handler/virtwrap/cli"
"kubevirt.io/kubevirt/pkg/virt-handler/virtwrap/isolation"
virtlauncher "kubevirt.io/kubevirt/pkg/virt-launcher"
watchdog "kubevirt.io/kubevirt/pkg/watchdog"
)

@@ -161,6 +163,8 @@ func (app *virtHandlerApp) Run() {
cache.Indexers{},
)

virtlauncher.InitializeSharedDirectories(app.VirtShareDir)

watchdogInformer := cache.NewSharedIndexInformer(
watchdog.NewWatchdogListWatchFromClient(
app.VirtShareDir,
@@ -169,6 +173,13 @@
0,
cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})

gracefulShutdownInformer := cache.NewSharedIndexInformer(
inotifyinformer.NewFileListWatchFromClient(
virtlauncher.GracefulShutdownTriggerDir(app.VirtShareDir)),
&virt_api.Domain{},
0,
cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})

vmController := virthandler.NewController(
domainManager,
recorder,
@@ -180,6 +191,7 @@
vmSharedInformer,
domainSharedInformer,
watchdogInformer,
gracefulShutdownInformer,
)

// Bootstrapping. From here on the startup order matters
10 changes: 9 additions & 1 deletion cmd/virt-launcher/virt-launcher.go
@@ -77,6 +77,7 @@ func main() {
namespace := flag.String("namespace", "", "Namespace of the VM")
watchdogInterval := flag.Duration("watchdog-update-interval", defaultWatchdogInterval, "Interval at which watchdog file should be updated")
readinessFile := flag.String("readiness-file", "/tmp/health", "Pod looks for this file to determine when virt-launcher is initialized")
gracePeriodSeconds := flag.Int("grace-period-seconds", 30, "Grace period to observe before sending SIGTERM to vm process.")
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
pflag.Parse()

@@ -114,7 +115,14 @@ func main() {
}
}()

mon := virtlauncher.NewProcessMonitor("qemu")
gracefulShutdownTriggerFile := virtlauncher.GracefulShutdownTriggerFromNamespaceName(*virtShareDir, *namespace, *name)
err = virtlauncher.GracefulShutdownTriggerClear(gracefulShutdownTriggerFile)
if err != nil {
log.Log.Reason(err).Errorf("Error detected clearing graceful shutdown trigger file %s.", gracefulShutdownTriggerFile)
panic(err)
}

mon := virtlauncher.NewProcessMonitor("qemu", gracefulShutdownTriggerFile, *gracePeriodSeconds)

markReady(*readinessFile)
mon.RunForever(*qemuTimeout)
112 changes: 112 additions & 0 deletions docs/graceful-shutdown.md
@@ -0,0 +1,112 @@
# Virtual Machine Graceful Shutdown

## Overview

Virtual machine graceful shutdown is the process of signaling a virtual
machine to begin shutting down before forcing the virtual machine off. This
process gives the virtual machine a chance to react to a shutdown request
before the KubeVirt runtime does the equivalent of pulling the power plug on
the virtual machine.

The period between when a virtual machine is signaled to shut down and the
point at which KubeVirt will force the virtual machine off if it is still
active is called a **Grace Period**. The grace period is a configurable value
represented in the Virtual Machine's specification by the
**terminationGracePeriodSeconds** option.

## Usage Examples

### Default: No terminationGracePeriodSeconds specified

If the grace period option is not set, a small default grace period is
observed before the virtual machine is killed. At the moment this default is
30 seconds, which is consistent with the value Kubernetes uses for
containers.

### Immediate Force Shutdown: terminationGracePeriodSeconds = 0

A value of 0 for the grace period option means that no grace period should be
observed during shutdown. If a user specifies 0 for this value, the virtual
machine is immediately force killed on shutdown.

### Grace Period Values > 0

Any value > 0 specified for terminationGracePeriodSeconds represents the
number of seconds the KubeVirt runtime will wait between signaling a virtual
machine to shut down and killing the virtual machine if it is still active.

## Design and Implementation

At the moment, the only way to shut down a virtual machine is to remove the
cluster object from Kubernetes. Once the virtual machine object has been
removed, we no longer have access to the terminationGracePeriodSeconds value
stored on the Virtual Machine's spec.

In order to guarantee the value stored in the virtual machine's
terminationGracePeriodSeconds is observed after the cluster object is
deleted, that value is cached locally by virt-handler during the start flow.
When a deleted virtual machine cluster object is detected, the cached grace
period value is observed as virt-handler shuts the virtual machine down.

### Virt-Controller Involvement

The only change to virt-controller is that it now configures a custom
grace period for virt-launcher pods that matches the grace period set on
the corresponding virtual machine object. The virt-launcher grace period
is slightly padded to ensure that, under normal operation, virt-handler
has a chance to force a virtual machine off before the virt-launcher pod
terminates. If the virt-launcher pod terminates first, the virtual machine
is forced off as a result of the Kubernetes runtime killing all processes
in the virt-launcher cgroup.
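
The numbers work out as follows: with the default 30 second grace period,
virt-launcher is told to wait 45 seconds before force-killing the qemu
process, and the pod's own TerminationGracePeriodSeconds becomes 60, leaving
virt-handler a 15 second window at each stage (these are the values asserted
in template_test.go below). A condensed sketch of the arithmetic from the
template.go change:

```go
// Condensed from the RenderLaunchManifest change below.
gracePeriodSeconds := v1.DefaultGracePeriodSeconds // 30 unless set on the VM spec
if vm.Spec.TerminationGracePeriodSeconds != nil {
	gracePeriodSeconds = *vm.Spec.TerminationGracePeriodSeconds
}
gracePeriodSeconds = gracePeriodSeconds + int64(15)    // virt-launcher: 30 -> 45
gracePeriodKillAfter := gracePeriodSeconds + int64(15) // pod SIGKILL: 45 -> 60
```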

### Virt-Handler Involvement

Virt-handler is now responsible for both signaling the virtual machine to
shutdown and ensuring the virtual machine is forced off after the grace
period is observed.

The signal that begins the grace period can come from one of two sources.

1. The virt-handler virtual machine object informer can notify virt-handler
that the cluster object for a currently active virtual machine has been
removed.

2. A virtual machine's corresponding virt-launcher pod can signal shutdown
by writing to a graceful shutdown trigger file in a directory shared between
virt-launcher and virt-handler (see the sketch after this list).
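
A minimal sketch of the second path is below. The path helper appears
elsewhere in this diff; the plain file creation standing in for the actual
trigger write is an assumption:

```go
package main

import (
	"os"

	virtlauncher "kubevirt.io/kubevirt/pkg/virt-launcher"
)

// signalGracefulShutdown asks virt-handler to start the grace period
// by creating the per-VM trigger file in the shared directory.
func signalGracefulShutdown(virtShareDir, namespace, name string) error {
	triggerFile := virtlauncher.GracefulShutdownTriggerFromNamespaceName(
		virtShareDir, namespace, name)
	f, err := os.Create(triggerFile)
	if err != nil {
		return err
	}
	// virt-handler's inotify-based informer over
	// GracefulShutdownTriggerDir reacts to this file appearing.
	return f.Close()
}
```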

Once the grace period begins for a virtual machine, virt-handler maintains
the state associated with the grace period in a local cache file. This
allows the grace period to be observed even if the virt-handler process
restarts during this period.
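
The cache is easiest to picture as a small per-VM state file; the sketch
below is purely illustrative, and the field names and JSON encoding are
assumptions rather than the actual implementation:

```go
package main

import (
	"encoding/json"
	"io/ioutil"
	"time"
)

// gracePeriodInfo is a hypothetical illustration of the per-VM state
// virt-handler caches locally.
type gracePeriodInfo struct {
	GracePeriodSeconds int64 `json:"gracePeriodSeconds"`
	// When the shutdown signal was first delivered; a restarted
	// virt-handler resumes the countdown from here instead of
	// granting a fresh grace period.
	ShutdownStart *time.Time `json:"shutdownStart,omitempty"`
}

func persistGracePeriodInfo(path string, info *gracePeriodInfo) error {
	data, err := json.Marshal(info)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(path, data, 0644)
}
```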

### Virt-Launcher Involvement

Virt-launcher intercepts signals (such as SIGTERM) sent to it by the
Kubernetes runtime and notifies virt-handler to begin the graceful shutdown
process by writing to the graceful shutdown trigger file.

After writing to the graceful shutdown trigger file, virt-launcher continues
to watch the pid until either the pid exits (as a result of virt-handler
shutting it down) or the Kubernetes runtime kills the virt-launcher process
with SIGKILL.

A force kill of virt-launcher will result in the corresponding virtual machine
exiting.

### Shutdown Notification Race (Virt-Launcher vs. VM Object Informer)

When a Virtual Machine object is removed from the cluster, that sets off a
race between the two sources used to notify virt-handler that it should shut
down the virtual machine.

1. The virtual machine cluster informer.

2. virt-launcher graceful shutdown trigger.

It doesn't matter which of these comes first. Once it begins, the graceful
shutdown process is idempotent.

It is worth noting that this race condition is one of the reasons why
virt-launcher needs to signal virt-handler to perform the graceful shutdown
instead of acting directly on the process. By centralizing the shutdown flow
in virt-handler, we can guarantee a single grace period is observed
accurately.
17 changes: 11 additions & 6 deletions pkg/api/v1/types.go
@@ -49,6 +49,8 @@ import (
// GroupName is the group name use in this package
const GroupName = "kubevirt.io"

const DefaultGracePeriodSeconds int64 = 30

// GroupVersion is group version used to register these objects
var GroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1alpha1"}

@@ -180,6 +182,8 @@ type VMSpec struct {
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// If affinity is specifies, obey all the affinity rules
Affinity *Affinity `json:"affinity,omitempty"`
// Grace period observed after signalling a VM to stop after which the VM is force terminated.
TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds,omitempty"`
}

// Affinity groups all the affinity rules related to a VM
@@ -357,12 +361,13 @@ func (s MigrationEvent) String() string {
type SyncEvent string

const (
Created SyncEvent = "Created"
Deleted SyncEvent = "Deleted"
Started SyncEvent = "Started"
Stopped SyncEvent = "Stopped"
SyncFailed SyncEvent = "SyncFailed"
Resumed SyncEvent = "Resumed"
Created SyncEvent = "Created"
Deleted SyncEvent = "Deleted"
Started SyncEvent = "Started"
ShuttingDown SyncEvent = "ShuttingDown"
Stopped SyncEvent = "Stopped"
SyncFailed SyncEvent = "SyncFailed"
Resumed SyncEvent = "Resumed"
)

func (s SyncEvent) String() string {
9 changes: 5 additions & 4 deletions pkg/api/v1/types_swagger_generated.go
@@ -18,10 +18,11 @@ func (VirtualMachineList) SwaggerDoc() map[string]string {

func (VMSpec) SwaggerDoc() map[string]string {
return map[string]string{
"": "VMSpec is a description of a VM. Not to be confused with api.DomainSpec in virt-handler.\nIt is expected that v1.DomainSpec will be merged into this structure.",
"domain": "Domain is the actual libvirt domain.",
"nodeSelector": "If labels are specified, only nodes marked with all of these labels are considered when scheduling the VM.",
"affinity": "If affinity is specifies, obey all the affinity rules",
"": "VMSpec is a description of a VM. Not to be confused with api.DomainSpec in virt-handler.\nIt is expected that v1.DomainSpec will be merged into this structure.",
"domain": "Domain is the actual libvirt domain.",
"nodeSelector": "If labels are specified, only nodes marked with all of these labels are considered when scheduling the VM.",
"affinity": "If affinity is specifies, obey all the affinity rules",
"terminationGracePeriodSeconds": "Grace period observed after signalling a VM to stop after which the VM is force terminated.",
}
}

25 changes: 20 additions & 5 deletions pkg/virt-controller/services/template.go
@@ -21,6 +21,7 @@ package services

import (
"fmt"
"strconv"

kubev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -56,6 +57,18 @@ func (t *templateService) RenderLaunchManifest(vm *v1.VirtualMachine) (*kubev1.P
successThreshold := 1
failureThreshold := 5

gracePeriodSeconds := v1.DefaultGracePeriodSeconds
if vm.Spec.TerminationGracePeriodSeconds != nil {
gracePeriodSeconds = *vm.Spec.TerminationGracePeriodSeconds
}

// Pad the virt-launcher grace period.
// Ideally we want virt-handler to handle tearing down
// the vm without virt-launcher's termination forcing
// the vm down.
gracePeriodSeconds = gracePeriodSeconds + int64(15)
gracePeriodKillAfter := gracePeriodSeconds + int64(15)

// VM target container
container := kubev1.Container{
Name: "compute",
@@ -67,6 +80,7 @@
"--namespace", namespace,
"--kubevirt-share-dir", t.virtShareDir,
"--readiness-file", "/tmp/healthy",
"--grace-period-seconds", strconv.Itoa(int(gracePeriodSeconds)),
},
VolumeMounts: []kubev1.VolumeMount{
{
@@ -117,11 +131,12 @@ func (t *templateService) RenderLaunchManifest(vm *v1.VirtualMachine) (*kubev1.P
},
},
Spec: kubev1.PodSpec{
HostPID: true,
RestartPolicy: kubev1.RestartPolicyNever,
Containers: containers,
NodeSelector: vm.Spec.NodeSelector,
Volumes: volumes,
HostPID: true,
TerminationGracePeriodSeconds: &gracePeriodKillAfter,
RestartPolicy: kubev1.RestartPolicyNever,
Containers: containers,
NodeSelector: vm.Spec.NodeSelector,
Volumes: volumes,
},
}

8 changes: 6 additions & 2 deletions pkg/virt-controller/services/template_test.go
@@ -58,7 +58,9 @@ var _ = Describe("Template", func() {
"--name", "testvm",
"--namespace", "testns",
"--kubevirt-share-dir", "/var/run/kubevirt",
"--readiness-file", "/tmp/healthy"}))
"--readiness-file", "/tmp/healthy",
"--grace-period-seconds", "45"}))
Expect(*pod.Spec.TerminationGracePeriodSeconds).To(Equal(int64(60)))
})
})
Context("with node selectors", func() {
@@ -87,9 +89,11 @@
"--name", "testvm",
"--namespace", "default",
"--kubevirt-share-dir", "/var/run/kubevirt",
"--readiness-file", "/tmp/healthy"}))
"--readiness-file", "/tmp/healthy",
"--grace-period-seconds", "45"}))
Expect(pod.Spec.Volumes[0].HostPath.Path).To(Equal("/var/run/kubevirt"))
Expect(pod.Spec.Containers[0].VolumeMounts[0].MountPath).To(Equal("/var/run/kubevirt"))
Expect(*pod.Spec.TerminationGracePeriodSeconds).To(Equal(int64(60)))
})

It("should add node affinity to pod", func() {