Skip to content

Commit

Permalink
Merge pull request kubevirt#10700 from machadovilaca/refactor-monitor…
Browse files Browse the repository at this point in the history
…ing-alerts

Refactor monitoring alerts
  • Loading branch information
kubevirt-bot authored Dec 19, 2023
2 parents e53b124 + 7e3a5fb commit ef91d5a
Show file tree
Hide file tree
Showing 11 changed files with 671 additions and 466 deletions.
2 changes: 1 addition & 1 deletion pkg/monitoring/rules/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ go_library(
importpath = "kubevirt.io/kubevirt/pkg/monitoring/rules",
visibility = ["//visibility:public"],
deps = [
"//pkg/monitoring/rules/alerts:go_default_library",
"//pkg/monitoring/rules/recordingrules:go_default_library",
"//pkg/virt-operator/resource/generate/components:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
"//vendor/github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1:go_default_library",
],
Expand Down
23 changes: 23 additions & 0 deletions pkg/monitoring/rules/alerts/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")

# Go library target for the kubevirt.io/kubevirt/pkg/monitoring/rules/alerts
# package. srcs lists one Go file per alert group plus alerts.go; deps lists
# the vendored packages those files import.
go_library(
name = "go_default_library",
srcs = [
"alerts.go",
"system.go",
"virt-api.go",
"virt-controller.go",
"virt-handler.go",
"virt-operator.go",
"vms.go",
],
importpath = "kubevirt.io/kubevirt/pkg/monitoring/rules/alerts",
visibility = ["//visibility:public"],
deps = [
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
"//vendor/github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
"//vendor/k8s.io/utils/ptr:go_default_library",
],
)
89 changes: 89 additions & 0 deletions pkg/monitoring/rules/alerts/alerts.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
Copyright 2023 The KubeVirt Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package alerts

import (
"errors"
"fmt"
"os"
"strings"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"

"github.com/machadovilaca/operator-observability/pkg/operatorrules"
)

const (
// prometheusRunbookAnnotationKey is the annotation key under which Register
// stores each alert's runbook URL.
prometheusRunbookAnnotationKey = "runbook_url"
// defaultRunbookURLTemplate is the runbook URL format used when the
// RUNBOOK_URL_TEMPLATE environment variable is not set. Register formats it
// with the alert name as the single %s argument.
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
// runbookURLTemplateEnv names the environment variable that overrides
// defaultRunbookURLTemplate.
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"

// severityAlertLabelKey and operatorHealthImpactLabelKey are the label keys
// the alert definitions in this package use for the alert severity and for
// its impact on operator health.
severityAlertLabelKey = "severity"
operatorHealthImpactLabelKey = "operator_health_impact"

// partOfAlertLabelKey and componentAlertLabelKey are label keys that Register
// sets to kubevirtLabelValue on every alert it registers.
partOfAlertLabelKey = "kubernetes_operator_part_of"
componentAlertLabelKey = "kubernetes_operator_component"
kubevirtLabelValue = "kubevirt"

// durationFiveMinutes is the human-readable duration passed to
// getRestCallsFailedWarning when building alert summaries.
durationFiveMinutes = "5 minutes"
)

// Register builds every alert group of this package for the given namespace,
// stamps each alert with the common part-of/component labels and a runbook URL
// annotation, and hands the groups to operatorrules.RegisterAlerts.
//
// The runbook URL is produced by formatting the template returned by
// getRunbookURLTemplate with the alert name. The returned error is the one
// reported by operatorrules.RegisterAlerts.
func Register(namespace string) error {
	alerts := [][]promv1.Rule{
		systemAlerts(namespace),
		virtApiAlerts(namespace),
		virtControllerAlerts(namespace),
		virtHandlerAlerts(namespace),
		virtOperatorAlerts(namespace),
		vmsAlerts,
	}

	runbookURLTemplate := getRunbookURLTemplate()
	for _, alertGroup := range alerts {
		for i := range alertGroup {
			// Work on the slice element itself so that a map allocated below
			// ends up on the rule that is registered, not on a loop-local copy.
			alert := &alertGroup[i]

			// Assigning into a nil map panics, so a rule declared without
			// Labels or Annotations gets an empty map before being stamped.
			if alert.Labels == nil {
				alert.Labels = make(map[string]string, 2)
			}
			if alert.Annotations == nil {
				alert.Annotations = make(map[string]string, 1)
			}

			alert.Labels[partOfAlertLabelKey] = kubevirtLabelValue
			alert.Labels[componentAlertLabelKey] = kubevirtLabelValue

			alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert)
		}
	}

	return operatorrules.RegisterAlerts(alerts...)
}

func getRunbookURLTemplate() string {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}

if strings.Count(runbookURLTemplate, "%s") != 1 {
panic(errors.New("runbook URL template must have exactly 1 %s substring"))
}

return runbookURLTemplate
}

// getErrorRatio returns a PromQL expression for the fraction of
// rest_client_requests_total, over the last durationInMinutes minutes, whose
// response code matches errorCodeRegex. Only series in namespace ns whose pod
// label matches "<podName>-.*" are considered.
func getErrorRatio(ns string, podName string, errorCodeRegex string, durationInMinutes int) string {
	// Both sides of the ratio select the same pods over the same window; only
	// the numerator narrows the selection to the matching status codes.
	podSelector := fmt.Sprintf(`namespace="%s",pod=~"%s-.*"`, ns, podName)
	window := fmt.Sprintf("[%dm]", durationInMinutes)

	matchingRate := fmt.Sprintf(`sum ( rate ( rest_client_requests_total{%s,code=~"%s"} %s ) )`, podSelector, errorCodeRegex, window)
	totalRate := fmt.Sprintf(`sum ( rate ( rest_client_requests_total{%s} %s ) )`, podSelector, window)

	return matchingRate + " / " + totalRate
}

// getRestCallsFailedWarning returns the alert summary sentence stating that
// more than failingCallsPercentage percent of the REST calls failed in
// component during the last duration (a human-readable span such as "hour").
func getRestCallsFailedWarning(failingCallsPercentage int, component, duration string) string {
	percentage := fmt.Sprintf("%d%%", failingCallsPercentage)
	return fmt.Sprintf("More than %s of the rest calls failed in %s for the last %s", percentage, component, duration)
}
86 changes: 86 additions & 0 deletions pkg/monitoring/rules/alerts/system.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/*
Copyright 2023 The KubeVirt Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package alerts

import (
"fmt"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

// systemAlerts returns the alert rules that are not tied to a single KubeVirt
// component: the number of nodes with KVM, KubeVirt containers using more
// memory or CPU than requested, and the absence of nodes able to run VMs.
// namespace is substituted into the selectors of the resource-usage rules.
func systemAlerts(namespace string) []promv1.Rule {
	// Each rule receives its own duration pointer and label map, exactly as
	// independent literals would.
	fiveMinutes := func() *promv1.Duration { return ptr.To(promv1.Duration("5m")) }
	labels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:        severity,
			operatorHealthImpactLabelKey: healthImpact,
		}
	}

	// In 'container_memory_working_set_bytes', 'container=""' filters the accumulated metric for the pod slice to measure total Memory usage for all containers within the pod
	memoryOverRequest := fmt.Sprintf(`((kube_pod_container_resource_requests{namespace="%s",container=~"virt-controller|virt-api|virt-handler|virt-operator",resource="memory"}) - on(pod) group_left(node) container_memory_working_set_bytes{container="",namespace="%s"}) < 0`, namespace, namespace)

	// In 'container_cpu_usage_seconds_total', 'container=""' filters the accumulated metric for the pod slice to measure total CPU usage for all containers within the pod
	cpuOverRequest := fmt.Sprintf(`((kube_pod_container_resource_requests{namespace="%s",container=~"virt-controller|virt-api|virt-handler|virt-operator",resource="cpu"}) - on(pod) sum(rate(container_cpu_usage_seconds_total{container="",namespace="%s"}[5m])) by (pod)) < 0`, namespace, namespace)

	const noSchedulableNodes = "((sum(kube_node_status_allocatable{resource='devices_kubevirt_io_kvm'}) or on() vector(0)) == 0 and (sum(kubevirt_configuration_emulation_enabled) or on() vector(0)) == 0) or (sum(kube_node_labels{label_kubevirt_io_schedulable='true'}) or on() vector(0)) == 0"

	return []promv1.Rule{
		{
			Alert: "LowKVMNodesCount",
			Expr:  intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_nodes_with_kvm < 2)"),
			For:   fiveMinutes(),
			Annotations: map[string]string{
				"description": "Low number of nodes with KVM resource available.",
				"summary":     "At least two nodes with kvm resource required for VM live migration.",
			},
			Labels: labels("warning", "warning"),
		},
		{
			Alert: "KubeVirtComponentExceedsRequestedMemory",
			Expr:  intstr.FromString(memoryOverRequest),
			For:   fiveMinutes(),
			Annotations: map[string]string{
				"description": "Container {{ $labels.container }} in pod {{ $labels.pod }} memory usage exceeds the memory requested",
				"summary":     "The container is using more memory than what is defined in the containers resource requests",
			},
			Labels: labels("warning", "none"),
		},
		{
			Alert: "KubeVirtComponentExceedsRequestedCPU",
			Expr:  intstr.FromString(cpuOverRequest),
			For:   fiveMinutes(),
			Annotations: map[string]string{
				"description": "Pod {{ $labels.pod }} cpu usage exceeds the CPU requested",
				"summary":     "The containers in the pod are using more CPU than what is defined in the containers resource requests",
			},
			Labels: labels("warning", "none"),
		},
		{
			Alert: "KubeVirtNoAvailableNodesToRunVMs",
			Expr:  intstr.FromString(noSchedulableNodes),
			For:   fiveMinutes(),
			Annotations: map[string]string{
				"summary": "There are no available nodes in the cluster to run VMs.",
			},
			Labels: labels("warning", "critical"),
		},
	}
}
87 changes: 87 additions & 0 deletions pkg/monitoring/rules/alerts/virt-api.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
Copyright 2023 The KubeVirt Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package alerts

import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

// virtApiAlerts returns the alert rules for virt-api: availability and replica
// count, REST client error ratios for virt-api pods in namespace, and requests
// to deprecated APIs.
func virtApiAlerts(namespace string) []promv1.Rule {
	const (
		component      = "virt-api"
		restErrorCodes = "(4|5)[0-9][0-9]"

		deprecatedAPIRequests = `sum by (resource,group,version) ((round(increase(kubevirt_api_request_deprecated_total{verb!~"LIST|WATCH"}[10m])) > 0 and kubevirt_api_request_deprecated_total{verb!~"LIST|WATCH"} offset 10m) or (kubevirt_api_request_deprecated_total{verb!~"LIST|WATCH"} != 0 unless kubevirt_api_request_deprecated_total{verb!~"LIST|WATCH"} offset 10m))`
	)

	// Each rule receives its own duration pointer and label map, exactly as
	// independent literals would.
	holdFor := func(d promv1.Duration) *promv1.Duration { return ptr.To(d) }
	labels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:        severity,
			operatorHealthImpactLabelKey: healthImpact,
		}
	}

	// Error ratio over the last hour and over the last five minutes.
	hourlyErrors := getErrorRatio(namespace, component, restErrorCodes, 60) + " >= 0.05"
	burstErrors := getErrorRatio(namespace, component, restErrorCodes, 5) + " >= 0.8"

	return []promv1.Rule{
		{
			Alert: "VirtAPIDown",
			Expr:  intstr.FromString("kubevirt_virt_api_up == 0"),
			For:   holdFor("10m"),
			Annotations: map[string]string{
				"summary": "All virt-api servers are down.",
			},
			Labels: labels("critical", "critical"),
		},
		{
			Alert: "LowVirtAPICount",
			Expr:  intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_virt_api_up < 2)"),
			For:   holdFor("60m"),
			Annotations: map[string]string{
				"summary": "More than one virt-api should be running if more than one worker nodes exist.",
			},
			Labels: labels("warning", "warning"),
		},
		{
			// No For: this rule is defined without a hold period.
			Alert: "VirtApiRESTErrorsHigh",
			Expr:  intstr.FromString(hourlyErrors),
			Annotations: map[string]string{
				"summary": getRestCallsFailedWarning(5, component, "hour"),
			},
			Labels: labels("warning", "warning"),
		},
		{
			Alert: "VirtApiRESTErrorsBurst",
			Expr:  intstr.FromString(burstErrors),
			For:   holdFor("5m"),
			Annotations: map[string]string{
				"summary": getRestCallsFailedWarning(80, component, durationFiveMinutes),
			},
			Labels: labels("critical", "critical"),
		},
		{
			// No For: this rule is defined without a hold period.
			Alert: "KubeVirtDeprecatedAPIRequested",
			Expr:  intstr.FromString(deprecatedAPIRequests),
			Annotations: map[string]string{
				"description": "Detected requests to the deprecated {{ $labels.resource }}.{{ $labels.group }}/{{ $labels.version }} API.",
				"summary":     "Detected {{ $value }} requests in the last 10 minutes.",
			},
			Labels: labels("info", "none"),
		},
	}
}
99 changes: 99 additions & 0 deletions pkg/monitoring/rules/alerts/virt-controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
Copyright 2023 The KubeVirt Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package alerts

import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

// virtControllerAlerts returns the alert rules for virt-controller: readiness
// and availability of its replicas, and REST client error ratios for
// virt-controller pods in namespace.
func virtControllerAlerts(namespace string) []promv1.Rule {
	const (
		component      = "virt-controller"
		restErrorCodes = "(4|5)[0-9][0-9]"
	)

	// Each rule receives its own duration pointer and label map, exactly as
	// independent literals would.
	holdFor := func(d promv1.Duration) *promv1.Duration { return ptr.To(d) }
	labels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:        severity,
			operatorHealthImpactLabelKey: healthImpact,
		}
	}

	// Error ratio over the last hour and over the last five minutes.
	hourlyErrors := getErrorRatio(namespace, component, restErrorCodes, 60) + " >= 0.05"
	burstErrors := getErrorRatio(namespace, component, restErrorCodes, 5) + " >= 0.8"

	return []promv1.Rule{
		{
			Alert: "LowReadyVirtControllersCount",
			Expr:  intstr.FromString("kubevirt_virt_controller_ready < kubevirt_virt_controller_up"),
			For:   holdFor("10m"),
			Annotations: map[string]string{
				"summary": "Some virt controllers are running but not ready.",
			},
			Labels: labels("warning", "warning"),
		},
		{
			Alert: "NoReadyVirtController",
			Expr:  intstr.FromString("kubevirt_virt_controller_ready == 0"),
			For:   holdFor("10m"),
			Annotations: map[string]string{
				"summary": "No ready virt-controller was detected for the last 10 min.",
			},
			Labels: labels("critical", "critical"),
		},
		{
			Alert: "VirtControllerDown",
			Expr:  intstr.FromString("kubevirt_virt_controller_up == 0"),
			For:   holdFor("10m"),
			Annotations: map[string]string{
				"summary": "No running virt-controller was detected for the last 10 min.",
			},
			Labels: labels("critical", "critical"),
		},
		{
			Alert: "LowVirtControllersCount",
			Expr:  intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_virt_controller_ready < 2)"),
			For:   holdFor("10m"),
			Annotations: map[string]string{
				"summary": "More than one virt-controller should be ready if more than one worker node.",
			},
			Labels: labels("warning", "warning"),
		},
		{
			// No For: this rule is defined without a hold period.
			Alert: "VirtControllerRESTErrorsHigh",
			Expr:  intstr.FromString(hourlyErrors),
			Annotations: map[string]string{
				"summary": getRestCallsFailedWarning(5, component, "hour"),
			},
			Labels: labels("warning", "warning"),
		},
		{
			Alert: "VirtControllerRESTErrorsBurst",
			Expr:  intstr.FromString(burstErrors),
			For:   holdFor("5m"),
			Annotations: map[string]string{
				"summary": getRestCallsFailedWarning(80, component, durationFiveMinutes),
			},
			Labels: labels("critical", "critical"),
		},
	}
}
Loading

0 comments on commit ef91d5a

Please sign in to comment.