forked from kubevirt/kubevirt
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request kubevirt#10700 from machadovilaca/refactor-monitoring-alerts
Refactor monitoring alerts
- Loading branch information
Showing
11 changed files
with
671 additions
and
466 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Bazel build definition for the Go package imported as
# kubevirt.io/kubevirt/pkg/monitoring/rules/alerts.
load("@io_bazel_rules_go//go:def.bzl", "go_library")

# Library target covering every alert source file of the package; it is
# publicly visible and depends only on vendored packages.
go_library(
    name = "go_default_library",
    srcs = [
        "alerts.go",
        "system.go",
        "virt-api.go",
        "virt-controller.go",
        "virt-handler.go",
        "virt-operator.go",
        "vms.go",
    ],
    importpath = "kubevirt.io/kubevirt/pkg/monitoring/rules/alerts",
    visibility = ["//visibility:public"],
    deps = [
        "//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
        "//vendor/github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
        "//vendor/k8s.io/utils/ptr:go_default_library",
    ],
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
/* | ||
Copyright 2023 The KubeVirt Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package alerts | ||
|
||
import ( | ||
"errors" | ||
"fmt" | ||
"os" | ||
"strings" | ||
|
||
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" | ||
|
||
"github.com/machadovilaca/operator-observability/pkg/operatorrules" | ||
) | ||
|
||
// Runbook linking: the annotation key written on every alert, the URL template
// used when the environment does not override it, and the name of the
// environment variable that carries the override.
const (
	prometheusRunbookAnnotationKey = "runbook_url"
	defaultRunbookURLTemplate      = "https://kubevirt.io/monitoring/runbooks/%s"
	runbookURLTemplateEnv          = "RUNBOOK_URL_TEMPLATE"
)

// Label keys that each alert rule sets individually.
const (
	severityAlertLabelKey        = "severity"
	operatorHealthImpactLabelKey = "operator_health_impact"
)

// Label keys, and their shared value, that Register stamps on every alert.
const (
	partOfAlertLabelKey    = "kubernetes_operator_part_of"
	componentAlertLabelKey = "kubernetes_operator_component"
	kubevirtLabelValue     = "kubevirt"
)

// durationFiveMinutes is the duration wording passed to
// getRestCallsFailedWarning by the REST error burst alerts.
const durationFiveMinutes = "5 minutes"
|
||
func Register(namespace string) error { | ||
alerts := [][]promv1.Rule{ | ||
systemAlerts(namespace), | ||
virtApiAlerts(namespace), | ||
virtControllerAlerts(namespace), | ||
virtHandlerAlerts(namespace), | ||
virtOperatorAlerts(namespace), | ||
vmsAlerts, | ||
} | ||
|
||
runbookURLTemplate := getRunbookURLTemplate() | ||
for _, alertGroup := range alerts { | ||
for _, alert := range alertGroup { | ||
alert.Labels[partOfAlertLabelKey] = kubevirtLabelValue | ||
alert.Labels[componentAlertLabelKey] = kubevirtLabelValue | ||
|
||
alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert) | ||
} | ||
} | ||
|
||
return operatorrules.RegisterAlerts(alerts...) | ||
} | ||
|
||
func getRunbookURLTemplate() string { | ||
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv) | ||
if !exists { | ||
runbookURLTemplate = defaultRunbookURLTemplate | ||
} | ||
|
||
if strings.Count(runbookURLTemplate, "%s") != 1 { | ||
panic(errors.New("runbook URL template must have exactly 1 %s substring")) | ||
} | ||
|
||
return runbookURLTemplate | ||
} | ||
|
||
// getErrorRatio builds a PromQL expression for the share of REST client
// requests whose response code matches errorCodeRegex, among all REST client
// requests, for pods named "<podName>-..." in namespace ns. Both rates are
// taken over a window of durationInMinutes minutes.
func getErrorRatio(ns string, podName string, errorCodeRegex string, durationInMinutes int) string {
	const ratioTemplate = `sum ( rate ( rest_client_requests_total{namespace="%s",pod=~"%s-.*",code=~"%s"} [%dm] ) ) / sum ( rate ( rest_client_requests_total{namespace="%s",pod=~"%s-.*"} [%dm] ) )`

	// The first four arguments fill the numerator, the last three the denominator.
	return fmt.Sprintf(
		ratioTemplate,
		ns, podName, errorCodeRegex, durationInMinutes,
		ns, podName, durationInMinutes,
	)
}
|
||
// getRestCallsFailedWarning renders the summary sentence used by the REST
// error alerts: which percentage of calls failed, in which component, and
// over which period (duration is inserted verbatim, e.g. "hour").
func getRestCallsFailedWarning(failingCallsPercentage int, component, duration string) string {
	return fmt.Sprintf(
		"More than %d%% of the rest calls failed in %s for the last %s",
		failingCallsPercentage,
		component,
		duration,
	)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
/* | ||
Copyright 2023 The KubeVirt Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package alerts | ||
|
||
import ( | ||
"fmt" | ||
|
||
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" | ||
"k8s.io/apimachinery/pkg/util/intstr" | ||
"k8s.io/utils/ptr" | ||
) | ||
|
||
func systemAlerts(namespace string) []promv1.Rule { | ||
return []promv1.Rule{ | ||
{ | ||
Alert: "LowKVMNodesCount", | ||
Expr: intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_nodes_with_kvm < 2)"), | ||
For: ptr.To(promv1.Duration("5m")), | ||
Annotations: map[string]string{ | ||
"description": "Low number of nodes with KVM resource available.", | ||
"summary": "At least two nodes with kvm resource required for VM live migration.", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "warning", | ||
operatorHealthImpactLabelKey: "warning", | ||
}, | ||
}, | ||
{ | ||
Alert: "KubeVirtComponentExceedsRequestedMemory", | ||
Expr: intstr.FromString( | ||
// In 'container_memory_working_set_bytes', 'container=""' filters the accumulated metric for the pod slice to measure total Memory usage for all containers within the pod | ||
fmt.Sprintf(`((kube_pod_container_resource_requests{namespace="%s",container=~"virt-controller|virt-api|virt-handler|virt-operator",resource="memory"}) - on(pod) group_left(node) container_memory_working_set_bytes{container="",namespace="%s"}) < 0`, namespace, namespace)), | ||
For: ptr.To(promv1.Duration("5m")), | ||
Annotations: map[string]string{ | ||
"description": "Container {{ $labels.container }} in pod {{ $labels.pod }} memory usage exceeds the memory requested", | ||
"summary": "The container is using more memory than what is defined in the containers resource requests", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "warning", | ||
operatorHealthImpactLabelKey: "none", | ||
}, | ||
}, | ||
{ | ||
Alert: "KubeVirtComponentExceedsRequestedCPU", | ||
Expr: intstr.FromString( | ||
// In 'container_cpu_usage_seconds_total', 'container=""' filters the accumulated metric for the pod slice to measure total CPU usage for all containers within the pod | ||
fmt.Sprintf(`((kube_pod_container_resource_requests{namespace="%s",container=~"virt-controller|virt-api|virt-handler|virt-operator",resource="cpu"}) - on(pod) sum(rate(container_cpu_usage_seconds_total{container="",namespace="%s"}[5m])) by (pod)) < 0`, namespace, namespace), | ||
), | ||
For: ptr.To(promv1.Duration("5m")), | ||
Annotations: map[string]string{ | ||
"description": "Pod {{ $labels.pod }} cpu usage exceeds the CPU requested", | ||
"summary": "The containers in the pod are using more CPU than what is defined in the containers resource requests", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "warning", | ||
operatorHealthImpactLabelKey: "none", | ||
}, | ||
}, | ||
{ | ||
Alert: "KubeVirtNoAvailableNodesToRunVMs", | ||
Expr: intstr.FromString("((sum(kube_node_status_allocatable{resource='devices_kubevirt_io_kvm'}) or on() vector(0)) == 0 and (sum(kubevirt_configuration_emulation_enabled) or on() vector(0)) == 0) or (sum(kube_node_labels{label_kubevirt_io_schedulable='true'}) or on() vector(0)) == 0"), | ||
For: ptr.To(promv1.Duration("5m")), | ||
Annotations: map[string]string{ | ||
"summary": "There are no available nodes in the cluster to run VMs.", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "warning", | ||
operatorHealthImpactLabelKey: "critical", | ||
}, | ||
}, | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
/* | ||
Copyright 2023 The KubeVirt Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package alerts | ||
|
||
import ( | ||
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" | ||
"k8s.io/apimachinery/pkg/util/intstr" | ||
"k8s.io/utils/ptr" | ||
) | ||
|
||
func virtApiAlerts(namespace string) []promv1.Rule { | ||
return []promv1.Rule{ | ||
{ | ||
Alert: "VirtAPIDown", | ||
Expr: intstr.FromString("kubevirt_virt_api_up == 0"), | ||
For: ptr.To(promv1.Duration("10m")), | ||
Annotations: map[string]string{ | ||
"summary": "All virt-api servers are down.", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "critical", | ||
operatorHealthImpactLabelKey: "critical", | ||
}, | ||
}, | ||
{ | ||
Alert: "LowVirtAPICount", | ||
Expr: intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_virt_api_up < 2)"), | ||
For: ptr.To(promv1.Duration("60m")), | ||
Annotations: map[string]string{ | ||
"summary": "More than one virt-api should be running if more than one worker nodes exist.", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "warning", | ||
operatorHealthImpactLabelKey: "warning", | ||
}, | ||
}, | ||
{ | ||
Alert: "VirtApiRESTErrorsHigh", | ||
Expr: intstr.FromString(getErrorRatio(namespace, "virt-api", "(4|5)[0-9][0-9]", 60) + " >= 0.05"), | ||
Annotations: map[string]string{ | ||
"summary": getRestCallsFailedWarning(5, "virt-api", "hour"), | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "warning", | ||
operatorHealthImpactLabelKey: "warning", | ||
}, | ||
}, | ||
{ | ||
Alert: "VirtApiRESTErrorsBurst", | ||
Expr: intstr.FromString(getErrorRatio(namespace, "virt-api", "(4|5)[0-9][0-9]", 5) + " >= 0.8"), | ||
For: ptr.To(promv1.Duration("5m")), | ||
Annotations: map[string]string{ | ||
"summary": getRestCallsFailedWarning(80, "virt-api", durationFiveMinutes), | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "critical", | ||
operatorHealthImpactLabelKey: "critical", | ||
}, | ||
}, | ||
{ | ||
Alert: "KubeVirtDeprecatedAPIRequested", | ||
Expr: intstr.FromString("sum by (resource,group,version) ((round(increase(kubevirt_api_request_deprecated_total{verb!~\"LIST|WATCH\"}[10m])) > 0 and kubevirt_api_request_deprecated_total{verb!~\"LIST|WATCH\"} offset 10m) or (kubevirt_api_request_deprecated_total{verb!~\"LIST|WATCH\"} != 0 unless kubevirt_api_request_deprecated_total{verb!~\"LIST|WATCH\"} offset 10m))"), | ||
Annotations: map[string]string{ | ||
"description": "Detected requests to the deprecated {{ $labels.resource }}.{{ $labels.group }}/{{ $labels.version }} API.", | ||
"summary": "Detected {{ $value }} requests in the last 10 minutes.", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "info", | ||
operatorHealthImpactLabelKey: "none", | ||
}, | ||
}, | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
/* | ||
Copyright 2023 The KubeVirt Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package alerts | ||
|
||
import ( | ||
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" | ||
"k8s.io/apimachinery/pkg/util/intstr" | ||
"k8s.io/utils/ptr" | ||
) | ||
|
||
func virtControllerAlerts(namespace string) []promv1.Rule { | ||
return []promv1.Rule{ | ||
{ | ||
Alert: "LowReadyVirtControllersCount", | ||
Expr: intstr.FromString("kubevirt_virt_controller_ready < kubevirt_virt_controller_up"), | ||
For: ptr.To(promv1.Duration("10m")), | ||
Annotations: map[string]string{ | ||
"summary": "Some virt controllers are running but not ready.", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "warning", | ||
operatorHealthImpactLabelKey: "warning", | ||
}, | ||
}, | ||
{ | ||
Alert: "NoReadyVirtController", | ||
Expr: intstr.FromString("kubevirt_virt_controller_ready == 0"), | ||
For: ptr.To(promv1.Duration("10m")), | ||
Annotations: map[string]string{ | ||
"summary": "No ready virt-controller was detected for the last 10 min.", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "critical", | ||
operatorHealthImpactLabelKey: "critical", | ||
}, | ||
}, | ||
{ | ||
Alert: "VirtControllerDown", | ||
Expr: intstr.FromString("kubevirt_virt_controller_up == 0"), | ||
For: ptr.To(promv1.Duration("10m")), | ||
Annotations: map[string]string{ | ||
"summary": "No running virt-controller was detected for the last 10 min.", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "critical", | ||
operatorHealthImpactLabelKey: "critical", | ||
}, | ||
}, | ||
{ | ||
Alert: "LowVirtControllersCount", | ||
Expr: intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_virt_controller_ready < 2)"), | ||
For: ptr.To(promv1.Duration("10m")), | ||
Annotations: map[string]string{ | ||
"summary": "More than one virt-controller should be ready if more than one worker node.", | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "warning", | ||
operatorHealthImpactLabelKey: "warning", | ||
}, | ||
}, | ||
{ | ||
Alert: "VirtControllerRESTErrorsHigh", | ||
Expr: intstr.FromString(getErrorRatio(namespace, "virt-controller", "(4|5)[0-9][0-9]", 60) + " >= 0.05"), | ||
Annotations: map[string]string{ | ||
"summary": getRestCallsFailedWarning(5, "virt-controller", "hour"), | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "warning", | ||
operatorHealthImpactLabelKey: "warning", | ||
}, | ||
}, | ||
{ | ||
Alert: "VirtControllerRESTErrorsBurst", | ||
Expr: intstr.FromString(getErrorRatio(namespace, "virt-controller", "(4|5)[0-9][0-9]", 5) + " >= 0.8"), | ||
For: ptr.To(promv1.Duration("5m")), | ||
Annotations: map[string]string{ | ||
"summary": getRestCallsFailedWarning(80, "virt-controller", durationFiveMinutes), | ||
}, | ||
Labels: map[string]string{ | ||
severityAlertLabelKey: "critical", | ||
operatorHealthImpactLabelKey: "critical", | ||
}, | ||
}, | ||
} | ||
} |
Oops, something went wrong.