monitoring: add prometheus metrics for migration phases
Signed-off-by: João Vilaça <[email protected]>
machadovilaca committed May 2, 2022
1 parent e82436f commit bfb2cc2
Showing 12 changed files with 1,043 additions and 0 deletions.
14 changes: 14 additions & 0 deletions pkg/monitoring/migration/BUILD.bazel
@@ -0,0 +1,14 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")

go_library(
name = "go_default_library",
srcs = ["prometheus.go"],
importpath = "kubevirt.io/kubevirt/pkg/monitoring/migration",
visibility = ["//visibility:public"],
deps = [
"//staging/src/kubevirt.io/api/core/v1:go_default_library",
"//staging/src/kubevirt.io/client-go/log:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
],
)
150 changes: 150 additions & 0 deletions pkg/monitoring/migration/prometheus.go
@@ -0,0 +1,150 @@
/*
* This file is part of the KubeVirt project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2022 Red Hat, Inc.
*
*/

package migration

import (
"github.com/prometheus/client_golang/prometheus"
k8sv1 "k8s.io/api/core/v1"

virtv1 "kubevirt.io/api/core/v1"
"kubevirt.io/client-go/log"
)

type gaugeAction bool

const increase gaugeAction = true
const decrease gaugeAction = false

var (
migrationsLabels = []string{"vmi", "source", "target"}

CurrentPendingMigrations = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "kubevirt_migrate_vmi_pending_count",
Help: "Number of current pending migrations.",
},
migrationsLabels,
)

CurrentSchedulingMigrations = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "kubevirt_migrate_vmi_scheduling_count",
Help: "Number of current scheduling migrations.",
},
migrationsLabels,
)

CurrentRunningMigrations = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "kubevirt_migrate_vmi_running_count",
Help: "Number of current running migrations.",
},
migrationsLabels,
)

MigrationsSucceededTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "kubevirt_migrate_vmi_succeeded_total",
Help: "Number of migrations successfully executed.",
},
migrationsLabels,
)

MigrationsFailedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "kubevirt_migrate_vmi_failed_total",
Help: "Number of failed migrations.",
},
migrationsLabels,
)
)

func RegisterMigrationMetrics() {
log.Log.Infof("Starting migration metrics")
prometheus.MustRegister(CurrentPendingMigrations)
prometheus.MustRegister(CurrentSchedulingMigrations)
prometheus.MustRegister(CurrentRunningMigrations)
prometheus.MustRegister(MigrationsSucceededTotal)
prometheus.MustRegister(MigrationsFailedTotal)
}

func IncPendingMigrations(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod) {
updateMigrationGauge(vmi, targetPod, CurrentPendingMigrations, increase)
}

func IncSchedulingMigrations(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod) {
updateMigrationGauge(vmi, targetPod, CurrentSchedulingMigrations, increase)
}

func IncRunningMigrations(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod) {
updateMigrationGauge(vmi, targetPod, CurrentRunningMigrations, increase)
}

func DecPendingMigrations(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod) {
updateMigrationGauge(vmi, targetPod, CurrentPendingMigrations, decrease)
}

func DecSchedulingMigrations(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod) {
updateMigrationGauge(vmi, targetPod, CurrentSchedulingMigrations, decrease)
}

func DecRunningMigrations(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod) {
updateMigrationGauge(vmi, targetPod, CurrentRunningMigrations, decrease)
}

func IncSucceededMigrations(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod) {
incMigrationCounter(vmi, targetPod, MigrationsSucceededTotal)
}

func IncFailedMigrations(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod) {
incMigrationCounter(vmi, targetPod, MigrationsFailedTotal)
}

func getMigrationSourceAndTarget(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod) (source, target string) {
source = vmi.Status.NodeName
if targetPod != nil {
target = targetPod.Spec.NodeName
}

if vmi.Status.MigrationState != nil {
source = vmi.Status.MigrationState.SourceNode
target = vmi.Status.MigrationState.TargetNode
}

return
}

func updateMigrationGauge(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod, gauge *prometheus.GaugeVec, action gaugeAction) {
source, target := getMigrationSourceAndTarget(vmi, targetPod)
labelValues := []string{vmi.Name, source, target}

if action == increase {
gauge.WithLabelValues(labelValues...).Inc()
} else {
gauge.WithLabelValues(labelValues...).Dec()
}
}

func incMigrationCounter(vmi *virtv1.VirtualMachineInstance, targetPod *k8sv1.Pod, counter *prometheus.CounterVec) {
source, target := getMigrationSourceAndTarget(vmi, targetPod)
labelValues := []string{vmi.Name, source, target}

counter.WithLabelValues(labelValues...).Inc()
}
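
For orientation, a minimal standalone sketch of how these helpers are intended to be driven (not part of the commit; the VMI name, pod, and node names below are made-up examples, and in virt-controller the objects come from informers):

package main

import (
	k8sv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	virtv1 "kubevirt.io/api/core/v1"

	"kubevirt.io/kubevirt/pkg/monitoring/migration"
)

func main() {
	// Register the gauges and counters once, before the first update.
	migration.RegisterMigrationMetrics()

	// Illustrative objects only.
	vmi := &virtv1.VirtualMachineInstance{
		ObjectMeta: metav1.ObjectMeta{Name: "vmi-example"},
		Status:     virtv1.VirtualMachineInstanceStatus{NodeName: "node-a"},
	}
	targetPod := &k8sv1.Pod{Spec: k8sv1.PodSpec{NodeName: "node-b"}}

	// A migration entering the Pending phase bumps the pending gauge...
	migration.IncPendingMigrations(vmi, targetPod)

	// ...and the transition to Scheduling decrements it and bumps the next
	// one, so each gauge counts the migrations currently in that phase.
	migration.DecPendingMigrations(vmi, targetPod)
	migration.IncSchedulingMigrations(vmi, targetPod)
}

Every series carries the same three labels: the VMI name plus source and target node, taken from vmi.Status.MigrationState when it is set, otherwise from vmi.Status.NodeName and targetPod.Spec.NodeName.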
4 changes: 4 additions & 0 deletions pkg/virt-controller/watch/BUILD.bazel
@@ -21,6 +21,7 @@ go_library(
"//pkg/controller:go_default_library",
"//pkg/flavor:go_default_library",
"//pkg/healthz:go_default_library",
"//pkg/monitoring/migration:go_default_library",
"//pkg/monitoring/perfscale:go_default_library",
"//pkg/monitoring/profiler:go_default_library",
"//pkg/monitoring/vmistats:go_default_library",
@@ -102,6 +103,7 @@ go_test(
deps = [
"//pkg/controller:go_default_library",
"//pkg/flavor:go_default_library",
"//pkg/monitoring/migration:go_default_library",
"//pkg/rest:go_default_library",
"//pkg/testutils:go_default_library",
"//pkg/util/types:go_default_library",
@@ -133,6 +135,8 @@ go_test(
"//vendor/github.com/onsi/gomega/gstruct:go_default_library",
"//vendor/github.com/onsi/gomega/types:go_default_library",
"//vendor/github.com/pborman/uuid:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus/testutil:go_default_library",
"//vendor/github.com/prometheus/client_model/go:go_default_library",
"//vendor/k8s.io/api/apps/v1:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
3 changes: 3 additions & 0 deletions pkg/virt-controller/watch/application.go
@@ -28,6 +28,8 @@ import (
"runtime"
"time"

"kubevirt.io/kubevirt/pkg/monitoring/migration"

"kubevirt.io/kubevirt/pkg/flavor"

"github.com/emicklei/go-restful"
@@ -449,6 +451,7 @@ func (vca *VirtControllerApp) onStartedLeading() func(ctx context.Context) {

vmiprom.SetupVMICollector(vca.vmiInformer, vca.clusterConfig)
perfscale.RegisterPerfScaleMetrics(vca.vmiInformer)
migration.RegisterMigrationMetrics()

go vca.evacuationController.Run(vca.evacuationControllerThreads, stop)
go vca.disruptionBudgetController.Run(vca.disruptionBudgetControllerThreads, stop)
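
RegisterMigrationMetrics registers everything with prometheus.MustRegister, i.e. the default registerer, so the new series surface on whatever handler serves that registry; virt-controller already exposes such an endpoint, and this commit does not touch it. Purely as an illustration of that mechanism (the port and the use of promhttp here are assumptions for a standalone sketch, not virt-controller's actual wiring):

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"

	"kubevirt.io/kubevirt/pkg/monitoring/migration"
)

func main() {
	// The migration metrics land in the default registry, which
	// promhttp.Handler serves.
	migration.RegisterMigrationMetrics()

	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil)) // illustrative port only
}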
33 changes: 33 additions & 0 deletions pkg/virt-controller/watch/migration.go
@@ -56,6 +56,7 @@ import (
"kubevirt.io/client-go/kubecli"
"kubevirt.io/client-go/log"
"kubevirt.io/kubevirt/pkg/controller"
metrics "kubevirt.io/kubevirt/pkg/monitoring/migration"
kubevirttypes "kubevirt.io/kubevirt/pkg/util/types"
"kubevirt.io/kubevirt/pkg/virt-controller/services"
)
@@ -369,6 +370,19 @@ func (c *MigrationController) canMigrateVMI(migration *virtv1.VirtualMachineInst

}

func handleFailedMigrationMetrics(migration *virtv1.VirtualMachineInstanceMigration, vmi *virtv1.VirtualMachineInstance, pod *k8sv1.Pod) {
switch migration.Status.Phase {
case virtv1.MigrationPending:
metrics.DecPendingMigrations(vmi, pod)
case virtv1.MigrationScheduling:
metrics.DecSchedulingMigrations(vmi, pod)
case virtv1.MigrationRunning:
metrics.DecRunningMigrations(vmi, pod)
}

metrics.IncFailedMigrations(vmi, pod)
}

func (c *MigrationController) updateStatus(migration *virtv1.VirtualMachineInstanceMigration, vmi *virtv1.VirtualMachineInstance, pods []*k8sv1.Pod) error {

var pod *k8sv1.Pod = nil
@@ -402,36 +416,43 @@ func (c *MigrationController) updateStatus(migration *virtv1.VirtualMachineInsta
} else if vmi == nil {
migrationCopy.Status.Phase = virtv1.MigrationFailed
c.recorder.Eventf(migration, k8sv1.EventTypeWarning, FailedMigrationReason, "Migration failed because vmi does not exist.")
handleFailedMigrationMetrics(migration, vmi, pod)
log.Log.Object(migration).Error("vmi does not exist")
} else if vmi.IsFinal() {
migrationCopy.Status.Phase = virtv1.MigrationFailed
c.recorder.Eventf(migration, k8sv1.EventTypeWarning, FailedMigrationReason, "Migration failed vmi shutdown during migration.")
handleFailedMigrationMetrics(migration, vmi, pod)
log.Log.Object(migration).Error("Unable to migrate vmi because vmi is shutdown.")
} else if podExists && podIsDown(pod) {
migrationCopy.Status.Phase = virtv1.MigrationFailed
c.recorder.Eventf(migration, k8sv1.EventTypeWarning, FailedMigrationReason, "Migration failed because target pod shutdown during migration")
handleFailedMigrationMetrics(migration, vmi, pod)
log.Log.Object(migration).Errorf("target pod %s/%s shutdown during migration", pod.Namespace, pod.Name)
} else if migration.TargetIsCreated() && !podExists {
migrationCopy.Status.Phase = virtv1.MigrationFailed
c.recorder.Eventf(migration, k8sv1.EventTypeWarning, FailedMigrationReason, "Migration target pod was removed during active migration.")
handleFailedMigrationMetrics(migration, vmi, pod)
log.Log.Object(migration).Error("target pod disappeared during migration")
} else if migration.TargetIsHandedOff() && vmi.Status.MigrationState == nil {
migrationCopy.Status.Phase = virtv1.MigrationFailed
c.recorder.Eventf(migration, k8sv1.EventTypeWarning, FailedMigrationReason, "VMI's migration state was cleared during the active migration.")
handleFailedMigrationMetrics(migration, vmi, pod)
log.Log.Object(migration).Error("vmi migration state cleared during migration")
} else if migration.TargetIsHandedOff() &&
vmi.Status.MigrationState != nil &&
vmi.Status.MigrationState.MigrationUID != migration.UID {

migrationCopy.Status.Phase = virtv1.MigrationFailed
c.recorder.Eventf(migration, k8sv1.EventTypeWarning, FailedMigrationReason, "VMI's migration state was taken over by another migration job during active migration.")
handleFailedMigrationMetrics(migration, vmi, pod)
log.Log.Object(migration).Error("vmi's migration state was taken over by another migration object")
} else if vmi.Status.MigrationState != nil &&
vmi.Status.MigrationState.MigrationUID == migration.UID &&
vmi.Status.MigrationState.Failed {

migrationCopy.Status.Phase = virtv1.MigrationFailed
c.recorder.Eventf(migration, k8sv1.EventTypeWarning, FailedMigrationReason, "Source node reported migration failed")
handleFailedMigrationMetrics(migration, vmi, pod)
log.Log.Object(migration).Errorf("VMI %s/%s reported migration failed.", vmi.Namespace, vmi.Name)
} else if migration.DeletionTimestamp != nil && !migration.IsFinal() &&
!conditionManager.HasCondition(migration, virtv1.VirtualMachineInstanceMigrationAbortRequested) {
@@ -444,6 +465,7 @@ func (c *MigrationController) updateStatus(migration *virtv1.VirtualMachineInsta
} else if attachmentPodExists && podIsDown(attachmentPod) {
migrationCopy.Status.Phase = virtv1.MigrationFailed
c.recorder.Eventf(migration, k8sv1.EventTypeWarning, FailedMigrationReason, "Migration failed because target attachment pod shutdown during migration")
handleFailedMigrationMetrics(migration, vmi, pod)
log.Log.Object(migration).Errorf("target attachment pod %s/%s shutdown during migration", attachmentPod.Namespace, attachmentPod.Name)
} else {

Expand All @@ -456,21 +478,27 @@ func (c *MigrationController) updateStatus(migration *virtv1.VirtualMachineInsta

if canMigrate {
migrationCopy.Status.Phase = virtv1.MigrationPending
metrics.IncPendingMigrations(vmi, pod)
} else {
// can not migrate because there is an active migration already
// in progress for this VMI.
migrationCopy.Status.Phase = virtv1.MigrationFailed
c.recorder.Eventf(migration, k8sv1.EventTypeWarning, FailedMigrationReason, "VMI is not eligible for migration because another migration job is in progress.")
metrics.IncFailedMigrations(vmi, pod)
log.Log.Object(migration).Error("Migration object not eligible for migration because another job is in progress")
}
case virtv1.MigrationPending:
if podExists {
if controller.VMIHasHotplugVolumes(vmi) {
if attachmentPodExists {
migrationCopy.Status.Phase = virtv1.MigrationScheduling
metrics.DecPendingMigrations(vmi, pod)
metrics.IncSchedulingMigrations(vmi, pod)
}
} else {
migrationCopy.Status.Phase = virtv1.MigrationScheduling
metrics.DecPendingMigrations(vmi, pod)
metrics.IncSchedulingMigrations(vmi, pod)
}
}
case virtv1.MigrationScheduling:
@@ -479,9 +507,11 @@ func (c *MigrationController) updateStatus(migration *virtv1.VirtualMachineInsta
if attachmentPodExists && isPodReady(attachmentPod) {
log.Log.Object(migration).Infof("Attachment pod %s for vmi %s/%s is ready", attachmentPod.Name, vmi.Namespace, vmi.Name)
migrationCopy.Status.Phase = virtv1.MigrationScheduled
metrics.DecSchedulingMigrations(vmi, pod)
}
} else {
migrationCopy.Status.Phase = virtv1.MigrationScheduled
metrics.DecSchedulingMigrations(vmi, pod)
}
}
case virtv1.MigrationScheduled:
@@ -497,11 +527,14 @@ func (c *MigrationController) updateStatus(migration *virtv1.VirtualMachineInsta
case virtv1.MigrationTargetReady:
if vmi.Status.MigrationState.StartTimestamp != nil {
migrationCopy.Status.Phase = virtv1.MigrationRunning
metrics.IncRunningMigrations(vmi, pod)
}
case virtv1.MigrationRunning:
if vmi.Status.MigrationState.Completed {
migrationCopy.Status.Phase = virtv1.MigrationSucceeded
c.recorder.Eventf(migration, k8sv1.EventTypeNormal, SuccessfulMigrationReason, "Source node reported migration succeeded")
metrics.DecRunningMigrations(vmi, pod)
metrics.IncSucceededMigrations(vmi, pod)
log.Log.Object(migration).Infof("VMI reported migration succeeded.")
}
}
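
The prometheus/testutil dependency added to the watch package's go_test earlier in this commit is the usual way to assert on these series; a minimal sketch of that pattern, exercised directly against the metrics package with made-up VMI and node names:

package migration_test

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
	k8sv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	virtv1 "kubevirt.io/api/core/v1"

	metrics "kubevirt.io/kubevirt/pkg/monitoring/migration"
)

func TestPendingGaugeFollowsPhase(t *testing.T) {
	vmi := &virtv1.VirtualMachineInstance{
		ObjectMeta: metav1.ObjectMeta{Name: "testvmi"},
		Status:     virtv1.VirtualMachineInstanceStatus{NodeName: "source-node"},
	}
	pod := &k8sv1.Pod{Spec: k8sv1.PodSpec{NodeName: "target-node"}}

	// Entering Pending increments the gauge for this vmi/source/target label set.
	metrics.IncPendingMigrations(vmi, pod)
	gauge := metrics.CurrentPendingMigrations.WithLabelValues("testvmi", "source-node", "target-node")
	if got := testutil.ToFloat64(gauge); got != 1 {
		t.Fatalf("expected pending gauge of 1, got %v", got)
	}

	// Leaving Pending (on the transition to Scheduling) decrements it again.
	metrics.DecPendingMigrations(vmi, pod)
	if got := testutil.ToFloat64(gauge); got != 0 {
		t.Fatalf("expected pending gauge of 0, got %v", got)
	}
}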
