From dfdd479649b7cf630aa43981730c07fab92c0239 Mon Sep 17 00:00:00 2001
From: Shirly Radco
Date: Sun, 13 Aug 2023 15:38:46 +0300
Subject: [PATCH] Update metric names to fit metrics naming conventions

Signed-off-by: Shirly Radco
---
 docs/metrics.md                               | 96 ++++++++++---
 hack/prom-rule-ci/prom-rules-tests.yaml       | 14 +--
 .../domainstats/prometheus/prometheus.go      | 36 +++----
 .../domainstats/prometheus/prometheus_test.go | 22 ++---
 pkg/monitoring/migrationstats/collector.go    | 10 +-
 pkg/virt-controller/watch/application.go      |  2 +-
 .../workload-updater/workload-updater.go      |  2 +-
 pkg/virt-operator/application.go              |  4 +-
 .../generate/components/prometheus.go         | 56 +++++------
 tests/infrastructure/prometheus.go            | 26 ++---
 tests/monitoring/vm_monitoring.go             | 16 ++--
 tools/doc-generator/doc-generator.go          | 10 ++
 .../metrics_collector.go                      | 33 +------
 13 files changed, 159 insertions(+), 168 deletions(-)

diff --git a/docs/metrics.md b/docs/metrics.md
index 38b89b8b7f05..09b1b74cc420 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -12,7 +12,7 @@ All metrics documented here are auto-generated by the utility tool `tools/doc-ge
 ### kubevirt_info
 Version information.
 
-### kubevirt_allocatable_nodes_count
+### kubevirt_allocatable_nodes
 The number of nodes in the cluster that have the devices.kubevirt.io/kvm resource available. Type: Gauge.
 
 ### kubevirt_api_request_deprecated_total
@@ -21,9 +21,6 @@ The total number of requests to deprecated KubeVirt APIs. Type: Counter.
 ### kubevirt_configuration_emulation_enabled
 Indicates whether the Software Emulation is enabled in the configuration. Type: Gauge.
 
-### kubevirt_kvm_available_nodes_count
-The number of nodes in the cluster that have the devices.kubevirt.io/kvm resource available. Type: Gauge.
-
 ### kubevirt_migrate_vmi_data_processed_bytes
 The total Guest OS data processed and migrated to the new VM. Type: Gauge.
 
@@ -36,52 +33,46 @@ The rate of memory being dirty in the Guest OS. Type: Gauge.
 ### kubevirt_migrate_vmi_disk_transfer_rate_bytes
 The rate at which the disk is being transferred. Type: Gauge.
 
-### kubevirt_migrate_vmi_failed
-Number of failed migrations. Type: Gauge.
-
 ### kubevirt_migrate_vmi_memory_transfer_rate_bytes
 The rate at which the memory is being transferred. Type: Gauge.
 
-### kubevirt_migrate_vmi_pending_count
-Number of current pending migrations. Type: Gauge.
-
-### kubevirt_migrate_vmi_running_count
-Number of current running migrations. Type: Gauge.
-
-### kubevirt_migrate_vmi_scheduling_count
-Number of current scheduling migrations. Type: Gauge.
-
-### kubevirt_migrate_vmi_succeeded
-Number of migrations successfully executed. Type: Gauge.
+### kubevirt_nodes_with_kvm
+The number of nodes in the cluster that have the devices.kubevirt.io/kvm resource available. Type: Gauge.
 
 ### kubevirt_number_of_vms
 The number of VMs in the cluster by namespace. Type: Gauge.
 
-### kubevirt_virt_api_up_total
+### kubevirt_virt_api_up
 The number of virt-api pods that are up. Type: Gauge.
 
 ### kubevirt_virt_controller_leading
 Indication for an operating virt-controller. Type: Gauge.
 
 ### kubevirt_virt_controller_ready
-Indication for a virt-controller that is ready to take the lead. Type: Gauge.
-
-### kubevirt_virt_controller_ready_total
 The number of virt-controller pods that are ready. Type: Gauge.
 
-### kubevirt_virt_controller_up_total
+### kubevirt_virt_controller_ready_status
+Indication for a virt-controller that is ready to take the lead. Type: Gauge.
+
+### kubevirt_virt_controller_up
 The number of virt-controller pods that are up. Type: Gauge.
 
-### kubevirt_virt_handler_up_total
+### kubevirt_virt_handler_up
 The number of virt-handler pods that are up. Type: Gauge.
 
-### kubevirt_virt_operator_leading_total
+### kubevirt_virt_operator_leading
 The number of virt-operator pods that are leading. Type: Gauge.
 
-### kubevirt_virt_operator_ready_total
+### kubevirt_virt_operator_leading_status
+Indication for an operating virt-operator. Type: Gauge.
+
+### kubevirt_virt_operator_ready
 The number of virt-operator pods that are ready. Type: Gauge.
 
-### kubevirt_virt_operator_up_total
+### kubevirt_virt_operator_ready_status
+Indication for a virt-operator that is ready to take the lead. Type: Gauge.
+
+### kubevirt_virt_operator_up
 The number of virt-operator pods that are up. Type: Gauge.
 
 ### kubevirt_vm_container_free_memory_bytes_based_on_rss
@@ -114,7 +105,7 @@ Total CPU time spent in all modes (sum of both vcpu and hypervisor usage). Type:
 ### kubevirt_vmi_cpu_user_usage_seconds_total
 Total CPU time spent in user mode. Type: Counter.
 
-### kubevirt_vmi_filesystem_capacity_bytes_total
+### kubevirt_vmi_filesystem_capacity_bytes
 Total VM filesystem capacity in bytes. Type: Gauge.
 
 ### kubevirt_vmi_filesystem_used_bytes
@@ -129,22 +120,22 @@ Amount of usable memory as seen by the domain. This value may not be accurate if
 ### kubevirt_vmi_memory_cached_bytes
 The amount of memory that is being used to cache I/O and is available to be reclaimed, corresponds to the sum of `Buffers` + `Cached` + `SwapCached` in `/proc/meminfo`. Type: Gauge.
 
-### kubevirt_vmi_memory_domain_bytes_total
+### kubevirt_vmi_memory_domain_bytes
 The amount of memory in bytes allocated to the domain. The `memory` value in domain xml file. Type: Gauge.
 
-### kubevirt_vmi_memory_pgmajfault
+### kubevirt_vmi_memory_pgmajfault_total
 The number of page faults when disk IO was required. Page faults occur when a process makes a valid access to virtual memory that is not available. When servicing the page fault, if disk IO is required, it is considered as major fault. Type: Counter.
 
-### kubevirt_vmi_memory_pgminfault
+### kubevirt_vmi_memory_pgminfault_total
 The number of other page faults, when disk IO was not required. Page faults occur when a process makes a valid access to virtual memory that is not available. When servicing the page fault, if disk IO is NOT required, it is considered as minor fault. Type: Counter.
 
 ### kubevirt_vmi_memory_resident_bytes
 Resident set size of the process running the domain. Type: Gauge.
 
-### kubevirt_vmi_memory_swap_in_traffic_bytes_total
+### kubevirt_vmi_memory_swap_in_traffic_bytes
 The total amount of data read from swap space of the guest in bytes. Type: Gauge.
 
-### kubevirt_vmi_memory_swap_out_traffic_bytes_total
+### kubevirt_vmi_memory_swap_out_traffic_bytes
 The total amount of memory written out to swap space of the guest in bytes. Type: Gauge.
 
 ### kubevirt_vmi_memory_unused_bytes
@@ -159,6 +150,21 @@ Amount of `used` memory as seen by the domain. Type: Gauge.
 ### kubevirt_vmi_migration_phase_transition_time_from_creation_seconds
 Histogram of VM migration phase transitions duration from creation time in seconds. Type: Histogram.
 
+### kubevirt_vmi_migrations_failed
+Number of failed migrations. Type: Gauge.
+
+### kubevirt_vmi_migrations_pending
+Number of current pending migrations. Type: Gauge.
+
+### kubevirt_vmi_migrations_running
+Number of current running migrations. Type: Gauge.
+
+### kubevirt_vmi_migrations_scheduling
+Number of current scheduling migrations. Type: Gauge.
+
+### kubevirt_vmi_migrations_succeeded
+Number of migrations successfully executed. Type: Gauge.
+
 ### kubevirt_vmi_network_receive_bytes_total
 Total network traffic received in bytes. Type: Counter.
 
@@ -192,7 +198,7 @@ Number of VMI CPU affinities to node physical cores. Type: Gauge.
 ### kubevirt_vmi_non_evictable
 Indication for a VirtualMachine that its eviction strategy is set to Live Migration but is not migratable. Type: Gauge.
 
-### kubevirt_vmi_outdated_count
+### kubevirt_vmi_number_of_outdated
 Indication for the total number of VirtualMachineInstance workloads that are not running within the most up-to-date version of the virt-launcher environment. Type: Gauge.
 
 ### kubevirt_vmi_phase_count
@@ -210,8 +216,8 @@ Histogram of VM phase transitions duration between different phases in seconds. Type: Histogram.
 ### kubevirt_vmi_storage_flush_requests_total
 Total storage flush requests. Type: Counter.
 
-### kubevirt_vmi_storage_flush_times_ms_total
-Total time (ms) spent on cache flushing. Type: Counter.
+### kubevirt_vmi_storage_flush_times_seconds_total
+Total time spent on cache flushing. Type: Counter.
 
 ### kubevirt_vmi_storage_iops_read_total
 Total number of I/O read operations. Type: Counter.
 
 ### kubevirt_vmi_storage_iops_write_total
 Total number of I/O write operations. Type: Counter.
 
-### kubevirt_vmi_storage_read_times_ms_total
-Total time (ms) spent on read operations. Type: Counter.
+### kubevirt_vmi_storage_read_times_seconds_total
+Total time spent on read operations. Type: Counter.
 
 ### kubevirt_vmi_storage_read_traffic_bytes_total
 Total number of bytes read from storage. Type: Counter.
 
-### kubevirt_vmi_storage_write_times_ms_total
-Total time (ms) spent on write operations. Type: Counter.
+### kubevirt_vmi_storage_write_times_seconds_total
+Total time spent on write operations. Type: Counter.
 
 ### kubevirt_vmi_storage_write_traffic_bytes_total
 Total number of written bytes. Type: Counter.
 
 ### kubevirt_vmi_vcpu_delay_seconds_total
 Amount of time spent by each vcpu waiting in the queue instead of running. Type: Counter.
 
-### kubevirt_vmi_vcpu_seconds
+### kubevirt_vmi_vcpu_seconds_total
 Total amount of time spent in each state by each vcpu (cpu_time excluding hypervisor time). Where `id` is the vcpu identifier and `state` can be one of the following: [`OFFLINE`, `RUNNING`, `BLOCKED`]. Type: Counter.
 
-### kubevirt_vmi_vcpu_wait_seconds
+### kubevirt_vmi_vcpu_wait_seconds_total
 Amount of time spent by each vcpu while waiting on I/O. Type: Counter.
 
+### kubevirt_vmsnapshot_disks_restored_from_source
+Returns the total number of virtual machine disks restored from the source virtual machine. Type: Gauge.
+
 ### kubevirt_vmsnapshot_disks_restored_from_source_bytes
 Returns the amount of space in bytes restored from the source virtual machine. Type: Gauge.
 
-### kubevirt_vmsnapshot_disks_restored_from_source_total
-Returns the total number of virtual machine disks restored from the source virtual machine. Type: Gauge.
-
 ### kubevirt_vmsnapshot_persistentvolumeclaim_labels
 Returns the labels of the persistent volume claims that are used for restoring virtual machines. Type: Info.
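The renames in docs/metrics.md above follow one pattern from the metrics naming guidelines: counters keep the `_total` suffix and report base units (seconds rather than milliseconds), while gauges drop the `_total`/`_count` suffixes. A minimal client_golang sketch of the two cases, reusing names and help texts taken from this patch (the snippet itself is illustrative only and is not part of the change; the label set on the counter is assumed):

    package example

    import "github.com/prometheus/client_golang/prometheus"

    var (
        // Counter: cumulative value, base unit is seconds, name ends in "_total".
        flushTimesSeconds = prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "kubevirt_vmi_storage_flush_times_seconds_total",
                Help: "Total time spent on cache flushing.",
            },
            []string{"drive"}, // label set assumed for illustration
        )

        // Gauge: point-in-time value, no "_total"/"_count" suffix.
        nodesWithKVM = prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "kubevirt_nodes_with_kvm",
            Help: "The number of nodes in the cluster that have the devices.kubevirt.io/kvm resource available.",
        })
    )

    func init() {
        prometheus.MustRegister(flushTimesSeconds, nodesWithKVM)
    }

Dashboards and alert rules written against the old names need the same translation; while a fleet still scrapes both old and new KubeVirt versions, a bridging query such as `kubevirt_vmi_migrations_succeeded or kubevirt_migrate_vmi_succeeded` can keep panels populated during the transition.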
diff --git a/hack/prom-rule-ci/prom-rules-tests.yaml b/hack/prom-rule-ci/prom-rules-tests.yaml
index 9dd0e92b65a4..f969eadcb2ca 100644
--- a/hack/prom-rule-ci/prom-rules-tests.yaml
+++ b/hack/prom-rule-ci/prom-rules-tests.yaml
@@ -242,9 +242,9 @@ tests:
 # Some virt controllers are not ready
 - interval: 1m
   input_series:
-  - series: 'kubevirt_virt_controller_ready{namespace="ci", pod="virt-controller-1"}'
+  - series: 'kubevirt_virt_controller_ready_status{namespace="ci", pod="virt-controller-1"}'
     values: '1+0x11'
-  - series: 'kubevirt_virt_controller_ready{namespace="ci", pod="virt-controller-2"}'
+  - series: 'kubevirt_virt_controller_ready_status{namespace="ci", pod="virt-controller-2"}'
     values: '0+0x11'
   - series: 'up{namespace="ci", pod="virt-controller-1"}'
     values: '1+0x11'
@@ -267,7 +267,7 @@ tests:
 # All virt controllers are not ready
 - interval: 1m
   input_series:
-  - series: 'kubevirt_virt_controller_ready{namespace="ci", pod="virt-controller-1"}'
+  - series: 'kubevirt_virt_controller_ready_status{namespace="ci", pod="virt-controller-1"}'
     values: "0 0 0 0 0 0 0 0 0 0 0"
 
   alert_rule_test:
@@ -289,7 +289,7 @@ tests:
 # All virt controllers are not ready (ImagePullBackOff)
 - interval: 1m
   input_series:
-  - series: 'kubevirt_virt_controller_ready{namespace="ci", pod="virt-controller-1"}'
+  - series: 'kubevirt_virt_controller_ready_status{namespace="ci", pod="virt-controller-1"}'
     values: "stale stale stale stale stale stale stale stale stale stale"
 
   alert_rule_test:
@@ -312,7 +312,7 @@ tests:
 # All virt operators are not ready (ImagePullBackOff)
 - interval: 1m
   input_series:
-  - series: 'kubevirt_virt_operator_ready{namespace="ci", pod="virt-operator-1"}'
+  - series: 'kubevirt_virt_operator_ready_status{namespace="ci", pod="virt-operator-1"}'
     values: "stale stale stale stale stale stale stale stale stale stale"
 
   alert_rule_test:
@@ -335,7 +335,7 @@ tests:
 # All virt operators are not ready
 - interval: 1m
   input_series:
-  - series: 'kubevirt_virt_operator_ready{namespace="ci", pod="virt-operator-1"}'
+  - series: 'kubevirt_virt_operator_ready_status{namespace="ci", pod="virt-operator-1"}'
    values: "0 0 0 0 0 0 0 0 0 0 0"
 
   alert_rule_test:
@@ -725,7 +725,7 @@ tests:
 # Excessive VMI Migrations in a period of time
 - interval: 1h
   input_series:
-  - series: 'kubevirt_migrate_vmi_succeeded{vmi="vmi-example-1"}'
+  - series: 'kubevirt_vmi_migrations_succeeded{vmi="vmi-example-1"}'
     # time:  0 1 2 3 4 5
    values: "_ _ _ 1 7 13"
diff --git a/pkg/monitoring/domainstats/prometheus/prometheus.go b/pkg/monitoring/domainstats/prometheus/prometheus.go
index 3548e170fe13..60b236d38928 100644
--- a/pkg/monitoring/domainstats/prometheus/prometheus.go
+++ b/pkg/monitoring/domainstats/prometheus/prometheus.go
@@ -161,7 +161,7 @@ func (metrics *vmiMetrics) updateMemory(mem *stats.DomainStatsMemory) {
 
 	if mem.SwapInSet {
 		metrics.pushCommonMetric(
-			"kubevirt_vmi_memory_swap_in_traffic_bytes_total",
+			"kubevirt_vmi_memory_swap_in_traffic_bytes",
 			"The total amount of data read from swap space of the guest in bytes.",
 			prometheus.GaugeValue,
 			float64(mem.SwapIn)*1024,
@@ -170,7 +170,7 @@ func (metrics *vmiMetrics) updateMemory(mem *stats.DomainStatsMemory) {
 
 	if mem.SwapOutSet {
 		metrics.pushCommonMetric(
-			"kubevirt_vmi_memory_swap_out_traffic_bytes_total",
+			"kubevirt_vmi_memory_swap_out_traffic_bytes",
 			"The total amount of memory written out to swap space of the guest in bytes.",
 			prometheus.GaugeValue,
 			float64(mem.SwapOut)*1024,
@@ -179,7 +179,7 @@ func (metrics *vmiMetrics) updateMemory(mem *stats.DomainStatsMemory) {
 
 	if mem.MajorFaultSet {
 		metrics.pushCommonMetric(
-			"kubevirt_vmi_memory_pgmajfault",
+			"kubevirt_vmi_memory_pgmajfault_total",
 			"The number of page faults when disk IO was required. Page faults occur when a process makes a valid access to virtual memory that is not available. When servicing the page fault, if disk IO is required, it is considered as major fault.",
 			prometheus.CounterValue,
 			float64(mem.MajorFault),
@@ -188,7 +188,7 @@ func (metrics *vmiMetrics) updateMemory(mem *stats.DomainStatsMemory) {
 
 	if mem.MinorFaultSet {
 		metrics.pushCommonMetric(
-			"kubevirt_vmi_memory_pgminfault",
+			"kubevirt_vmi_memory_pgminfault_total",
 			"The number of other page faults, when disk IO was not required. Page faults occur when a process makes a valid access to virtual memory that is not available. When servicing the page fault, if disk IO is NOT required, it is considered as minor fault.",
 			prometheus.CounterValue,
 			float64(mem.MinorFault),
@@ -215,7 +215,7 @@ func (metrics *vmiMetrics) updateMemory(mem *stats.DomainStatsMemory) {
 
 	if mem.TotalSet {
 		metrics.pushCommonMetric(
-			"kubevirt_vmi_memory_domain_bytes_total",
+			"kubevirt_vmi_memory_domain_bytes",
 			"The amount of memory in bytes allocated to the domain. The `memory` value in domain xml file.",
 			prometheus.GaugeValue,
 			float64(mem.Total)*1024,
@@ -285,7 +285,7 @@ func (metrics *vmiMetrics) updateVcpu(vcpuStats []stats.DomainStatsVcpu) {
 
 		if vcpu.StateSet && vcpu.TimeSet {
 			metrics.pushCustomMetric(
-				"kubevirt_vmi_vcpu_seconds",
+				"kubevirt_vmi_vcpu_seconds_total",
 				"Total amount of time spent in each state by each vcpu (cpu_time excluding hypervisor time). Where `id` is the vcpu identifier and `state` can be one of the following: [`OFFLINE`, `RUNNING`, `BLOCKED`].",
 				prometheus.CounterValue,
 				float64(vcpu.Time/1000000000),
@@ -296,10 +296,10 @@ func (metrics *vmiMetrics) updateVcpu(vcpuStats []stats.DomainStatsVcpu) {
 
 		if vcpu.WaitSet {
 			metrics.pushCustomMetric(
-				"kubevirt_vmi_vcpu_wait_seconds",
+				"kubevirt_vmi_vcpu_wait_seconds_total",
 				"Amount of time spent by each vcpu while waiting on I/O.",
 				prometheus.CounterValue,
-				float64(vcpu.Wait)/float64(1000000),
+				float64(vcpu.Wait)/float64(1000000000),
 				[]string{"id"},
 				[]string{stringVcpuIdx},
 			)
@@ -377,10 +377,10 @@ func (metrics *vmiMetrics) updateBlock(blkStats []stats.DomainStatsBlock) {
 
 		if block.RdTimesSet {
 			metrics.pushCustomMetric(
-				"kubevirt_vmi_storage_read_times_ms_total",
-				"Total time (ms) spent on read operations.",
+				"kubevirt_vmi_storage_read_times_seconds_total",
+				"Total time spent on read operations.",
 				prometheus.CounterValue,
-				float64(block.RdTimes)/1000000,
+				float64(block.RdTimes)/1000000000,
 				blkLabels,
 				blkLabelValues,
 			)
@@ -388,10 +388,10 @@ func (metrics *vmiMetrics) updateBlock(blkStats []stats.DomainStatsBlock) {
 
 		if block.WrTimesSet {
 			metrics.pushCustomMetric(
-				"kubevirt_vmi_storage_write_times_ms_total",
-				"Total time (ms) spent on write operations.",
+				"kubevirt_vmi_storage_write_times_seconds_total",
+				"Total time spent on write operations.",
 				prometheus.CounterValue,
-				float64(block.WrTimes)/1000000,
+				float64(block.WrTimes)/1000000000,
 				blkLabels,
 				blkLabelValues,
 			)
@@ -410,10 +410,10 @@ func (metrics *vmiMetrics) updateBlock(blkStats []stats.DomainStatsBlock) {
 
 		if block.FlTimesSet {
 			metrics.pushCustomMetric(
-				"kubevirt_vmi_storage_flush_times_ms_total",
-				"Total time (ms) spent on cache flushing.",
+				"kubevirt_vmi_storage_flush_times_seconds_total",
+				"Total time spent on cache flushing.",
 				prometheus.CounterValue,
-				float64(block.FlTimes)/1000000,
+				float64(block.FlTimes)/1000000000,
 				blkLabels,
 				blkLabelValues,
 			)
@@ -546,7 +546,7 @@ func (metrics *vmiMetrics) updateFilesystem(vmFSStats k6tv1.VirtualMachineInstan
 		fsLabelValues := []string{fsStat.DiskName, fsStat.MountPoint, fsStat.FileSystemType}
 
 		metrics.pushCustomMetric(
-			"kubevirt_vmi_filesystem_capacity_bytes_total",
+			"kubevirt_vmi_filesystem_capacity_bytes",
 			"Total VM filesystem capacity in bytes.",
 			prometheus.GaugeValue,
 			float64(fsStat.TotalBytes),
diff --git a/pkg/monitoring/domainstats/prometheus/prometheus_test.go b/pkg/monitoring/domainstats/prometheus/prometheus_test.go
index 78d2f9721f4a..3365d4692a67 100644
--- a/pkg/monitoring/domainstats/prometheus/prometheus_test.go
+++ b/pkg/monitoring/domainstats/prometheus/prometheus_test.go
@@ -195,7 +195,7 @@ var _ = Describe("Prometheus", func() {
 			result.Write(dto)
 
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_swap_in_traffic_bytes_total"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_swap_in_traffic_bytes"))
 			Expect(dto.Gauge.GetValue()).To(BeEquivalentTo(float64(1024)))
 		})
 
@@ -220,7 +220,7 @@ var _ = Describe("Prometheus", func() {
 			result.Write(dto)
 
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_swap_out_traffic_bytes_total"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_swap_out_traffic_bytes"))
 			Expect(dto.Gauge.GetValue()).To(BeEquivalentTo(float64(1024)))
 		})
 
@@ -245,7 +245,7 @@ var _ = Describe("Prometheus", func() {
 			result.Write(dto)
 
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_pgmajfault"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_pgmajfault_total"))
 			Expect(dto.Counter.GetValue()).To(BeEquivalentTo(float64(1024)))
 		})
 
@@ -270,7 +270,7 @@ var _ = Describe("Prometheus", func() {
 			result.Write(dto)
 
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_pgminfault"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_pgminfault_total"))
 			Expect(dto.Counter.GetValue()).To(BeEquivalentTo(float64(1024)))
 		})
 
@@ -345,7 +345,7 @@ var _ = Describe("Prometheus", func() {
 			result.Write(dto)
 
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_domain_bytes_total"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_memory_domain_bytes"))
 			Expect(dto.Gauge.GetValue()).To(BeEquivalentTo(float64(1024)))
 		})
 
@@ -428,7 +428,7 @@ var _ = Describe("Prometheus", func() {
 
 			result := <-ch
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_vcpu_seconds"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_vcpu_seconds_total"))
 		})
 
 		It("should not expose vcpu metrics for invalid DomainStats", func() {
@@ -674,7 +674,7 @@ var _ = Describe("Prometheus", func() {
 
 			result := <-ch
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_storage_read_times_ms_total"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_storage_read_times_seconds_total"))
 		})
 
 		It("should handle block write time metrics", func() {
@@ -701,7 +701,7 @@ var _ = Describe("Prometheus", func() {
 
 			result := <-ch
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_storage_write_times_ms_total"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_storage_write_times_seconds_total"))
 		})
 
 		It("should handle block flush requests metrics", func() {
@@ -755,7 +755,7 @@ var _ = Describe("Prometheus", func() {
 
 			result := <-ch
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_storage_flush_times_ms_total"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_storage_flush_times_seconds_total"))
 		})
 
 		It("should use alias when alias is not empty", func() {
@@ -1151,7 +1151,7 @@ var _ = Describe("Prometheus", func() {
 
 			result := <-ch
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_vcpu_wait_seconds"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_vcpu_wait_seconds_total"))
 		})
 
 		It("should expose vcpu delay metric", func() {
@@ -1234,7 +1234,7 @@ var _ = Describe("Prometheus", func() {
 			ps.Report("test", &vmi, newVmStats(domainStats, fsStats))
 			result := <-ch
 			Expect(result).ToNot(BeNil())
-			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_filesystem_capacity_bytes_total"))
+			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_filesystem_capacity_bytes"))
 			result = <-ch
 			Expect(result).ToNot(BeNil())
 			Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_filesystem_used_bytes"))
diff --git a/pkg/monitoring/migrationstats/collector.go b/pkg/monitoring/migrationstats/collector.go
index 41d4f79f9e4b..6ccb3de948ef 100644
--- a/pkg/monitoring/migrationstats/collector.go
+++ b/pkg/monitoring/migrationstats/collector.go
@@ -28,11 +28,11 @@ import (
 )
 
 const (
-	PendingMigrations    = "kubevirt_migrate_vmi_pending_count"
-	SchedulingMigrations = "kubevirt_migrate_vmi_scheduling_count"
-	RunningMigrations    = "kubevirt_migrate_vmi_running_count"
-	SucceededMigrations  = "kubevirt_migrate_vmi_succeeded"
-	FailedMigrations     = "kubevirt_migrate_vmi_failed"
+	PendingMigrations    = "kubevirt_vmi_migrations_pending"
+	SchedulingMigrations = "kubevirt_vmi_migrations_scheduling"
+	RunningMigrations    = "kubevirt_vmi_migrations_running"
+	SucceededMigrations  = "kubevirt_vmi_migrations_succeeded"
+	FailedMigrations     = "kubevirt_vmi_migrations_failed"
 )
 
 var (
diff --git a/pkg/virt-controller/watch/application.go b/pkg/virt-controller/watch/application.go
index b37deb996002..1b37ac0adc39 100644
--- a/pkg/virt-controller/watch/application.go
+++ b/pkg/virt-controller/watch/application.go
@@ -128,7 +128,7 @@ var (
 
 	readyGauge = prometheus.NewGauge(
 		prometheus.GaugeOpts{
-			Name: "kubevirt_virt_controller_ready",
+			Name: "kubevirt_virt_controller_ready_status",
 			Help: "Indication for a virt-controller that is ready to take the lead.",
 		},
 	)
diff --git a/pkg/virt-controller/watch/workload-updater/workload-updater.go b/pkg/virt-controller/watch/workload-updater/workload-updater.go
index 71be34b570eb..8b41c40a40f2 100644
--- a/pkg/virt-controller/watch/workload-updater/workload-updater.go
+++ b/pkg/virt-controller/watch/workload-updater/workload-updater.go
@@ -47,7 +47,7 @@ const (
 var (
 	outdatedVirtualMachineInstanceWorkloads = prometheus.NewGauge(
 		prometheus.GaugeOpts{
-			Name: "kubevirt_vmi_outdated_count",
+			Name: "kubevirt_vmi_number_of_outdated",
 			Help: "Indication for the total number of VirtualMachineInstance workloads that are not running within the most up-to-date version of the virt-launcher environment.",
 		},
 	)
diff --git a/pkg/virt-operator/application.go b/pkg/virt-operator/application.go
index 8fe5710541d2..f7da62feeb2b 100644
--- a/pkg/virt-operator/application.go
+++ b/pkg/virt-operator/application.go
@@ -116,14 +116,14 @@ var (
 
 	leaderGauge = prometheus.NewGauge(
 		prometheus.GaugeOpts{
-			Name: "kubevirt_virt_operator_leading",
+			Name: "kubevirt_virt_operator_leading_status",
 			Help: "Indication for an operating virt-operator.",
 		},
 	)
 
 	readyGauge = prometheus.NewGauge(
 		prometheus.GaugeOpts{
-			Name: "kubevirt_virt_operator_ready",
+			Name: "kubevirt_virt_operator_ready_status",
 			Help: "Indication for a virt-operator that is ready to take the lead.",
 		},
 	)
diff --git a/pkg/virt-operator/resource/generate/components/prometheus.go b/pkg/virt-operator/resource/generate/components/prometheus.go
index 3bd4663af01b..09a900546067 100644
--- a/pkg/virt-operator/resource/generate/components/prometheus.go
+++ b/pkg/virt-operator/resource/generate/components/prometheus.go
@@ -110,7 +110,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 	kubevirtRules = append(kubevirtRules, []v1.Rule{
 		{
 			Alert: "VirtAPIDown",
-			Expr:  intstr.FromString("kubevirt_virt_api_up_total == 0"),
+			Expr:  intstr.FromString("kubevirt_virt_api_up == 0"),
 			For:   "10m",
 			Annotations: map[string]string{
 				"summary": "All virt-api servers are down.",
@@ -123,7 +123,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "LowVirtAPICount",
-			Expr:  intstr.FromString("(kubevirt_allocatable_nodes_count > 1) and (kubevirt_virt_api_up_total < 2)"),
+			Expr:  intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_virt_api_up < 2)"),
 			For:   "60m",
 			Annotations: map[string]string{
 				"summary": "More than one virt-api should be running if more than one worker nodes exist.",
@@ -136,7 +136,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "LowKVMNodesCount",
-			Expr:  intstr.FromString("(kubevirt_allocatable_nodes_count > 1) and (kubevirt_kvm_available_nodes_count < 2)"),
+			Expr:  intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_nodes_with_kvm < 2)"),
 			For:   "5m",
 			Annotations: map[string]string{
 				"description": "Low number of nodes with KVM resource available.",
@@ -150,7 +150,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "LowReadyVirtControllersCount",
-			Expr:  intstr.FromString("kubevirt_virt_controller_ready_total < kubevirt_virt_controller_up_total"),
+			Expr:  intstr.FromString("kubevirt_virt_controller_ready < kubevirt_virt_controller_up"),
 			For:   "10m",
 			Annotations: map[string]string{
 				"summary": "Some virt controllers are running but not ready.",
@@ -163,7 +163,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "NoReadyVirtController",
-			Expr:  intstr.FromString("kubevirt_virt_controller_ready_total == 0"),
+			Expr:  intstr.FromString("kubevirt_virt_controller_ready == 0"),
 			For:   "10m",
 			Annotations: map[string]string{
 				"summary": "No ready virt-controller was detected for the last 10 min.",
@@ -176,7 +176,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "VirtControllerDown",
-			Expr:  intstr.FromString("kubevirt_virt_controller_up_total == 0"),
+			Expr:  intstr.FromString("kubevirt_virt_controller_up == 0"),
 			For:   "10m",
 			Annotations: map[string]string{
 				"summary": "No running virt-controller was detected for the last 10 min.",
@@ -189,7 +189,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "LowVirtControllersCount",
-			Expr:  intstr.FromString("(kubevirt_allocatable_nodes_count > 1) and (kubevirt_virt_controller_ready_total < 2)"),
+			Expr:  intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_virt_controller_ready < 2)"),
 			For:   "10m",
 			Annotations: map[string]string{
 				"summary": "More than one virt-controller should be ready if more than one worker node.",
@@ -227,7 +227,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "VirtOperatorDown",
-			Expr:  intstr.FromString("kubevirt_virt_operator_up_total == 0"),
+			Expr:  intstr.FromString("kubevirt_virt_operator_up == 0"),
 			For:   "10m",
 			Annotations: map[string]string{
 				"summary": "All virt-operator servers are down.",
@@ -240,7 +240,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "LowVirtOperatorCount",
-			Expr:  intstr.FromString("(kubevirt_allocatable_nodes_count > 1) and (kubevirt_virt_operator_up_total < 2)"),
+			Expr:  intstr.FromString("(kubevirt_allocatable_nodes > 1) and (kubevirt_virt_operator_up < 2)"),
 			For:   "60m",
 			Annotations: map[string]string{
 				"summary": "More than one virt-operator should be running if more than one worker nodes exist.",
@@ -278,7 +278,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "LowReadyVirtOperatorsCount",
-			Expr:  intstr.FromString("kubevirt_virt_operator_ready_total < kubevirt_virt_operator_up_total"),
+			Expr:  intstr.FromString("kubevirt_virt_operator_ready < kubevirt_virt_operator_up"),
 			For:   "10m",
 			Annotations: map[string]string{
 				"summary": "Some virt-operators are running but not ready.",
@@ -291,7 +291,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "NoReadyVirtOperator",
-			Expr:  intstr.FromString("kubevirt_virt_operator_ready_total == 0"),
+			Expr:  intstr.FromString("kubevirt_virt_operator_ready == 0"),
 			For:   "10m",
 			Annotations: map[string]string{
 				"summary": "No ready virt-operator was detected for the last 10 min.",
@@ -304,7 +304,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "NoLeadingVirtOperator",
-			Expr:  intstr.FromString("kubevirt_virt_operator_leading_total == 0"),
+			Expr:  intstr.FromString("kubevirt_virt_operator_leading == 0"),
 			For:   "10m",
 			Annotations: map[string]string{
 				"summary": "No leading virt-operator was detected for the last 10 min.",
@@ -457,7 +457,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "KubeVirtVMIExcessiveMigrations",
-			Expr:  intstr.FromString("sum by (vmi) (max_over_time(kubevirt_migrate_vmi_succeeded[1d])) >= 12"),
+			Expr:  intstr.FromString("sum by (vmi) (max_over_time(kubevirt_vmi_migrations_succeeded[1d])) >= 12"),
 			Annotations: map[string]string{
 				"description": "VirtualMachineInstance {{ $labels.vmi }} has been migrated more than 12 times during the last 24 hours",
 				"summary":     "An excessive amount of migrations have been detected on a VirtualMachineInstance in the last 24 hours.",
@@ -496,7 +496,7 @@ func NewPrometheusRuleSpec(ns string) *v1.PrometheusRuleSpec {
 		},
 		{
 			Alert: "OutdatedVirtualMachineInstanceWorkloads",
-			Expr:  intstr.FromString("kubevirt_vmi_outdated_count != 0"),
+			Expr:  intstr.FromString("kubevirt_vmi_number_of_outdated != 0"),
 			For:   "1440m",
 			Annotations: map[string]string{
 				"summary": "Some running VMIs are still active in outdated pods after KubeVirt control plane update has completed.",
@@ -541,7 +541,7 @@ func GetRecordingRules(namespace string) []KubevirtRecordingRule {
 	return []KubevirtRecordingRule{
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_virt_api_up_total",
+				Record: "kubevirt_virt_api_up",
 				Expr: intstr.FromString(
 					fmt.Sprintf("sum(up{namespace='%s', pod=~'virt-api-.*'}) or vector(0)", namespace),
 				),
@@ -551,7 +551,7 @@ func GetRecordingRules(namespace string) []KubevirtRecordingRule {
 		},
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_allocatable_nodes_count",
+				Record: "kubevirt_allocatable_nodes",
 				Expr:   intstr.FromString("count(count (kube_node_status_allocatable) by (node))"),
 			},
 			MType: prometheusv1.MetricTypeGauge,
 		},
@@ -559,15 +559,15 @@ func GetRecordingRules(namespace string) []KubevirtRecordingRule {
 		},
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_kvm_available_nodes_count",
-				Expr:   intstr.FromString("kubevirt_allocatable_nodes_count - count(kube_node_status_allocatable{resource=\"devices_kubevirt_io_kvm\"} == 0)"),
+				Record: "kubevirt_nodes_with_kvm",
+				Expr:   intstr.FromString("kubevirt_allocatable_nodes - count(kube_node_status_allocatable{resource=\"devices_kubevirt_io_kvm\"} == 0)"),
 			},
 			MType:       prometheusv1.MetricTypeGauge,
 			Description: "The number of nodes in the cluster that have the devices.kubevirt.io/kvm resource available.",
 		},
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_virt_controller_up_total",
+				Record: "kubevirt_virt_controller_up",
 				Expr: intstr.FromString(
 					fmt.Sprintf("sum(up{pod=~'virt-controller-.*', namespace='%s'}) or vector(0)", namespace),
 				),
@@ -577,9 +577,9 @@ func GetRecordingRules(namespace string) []KubevirtRecordingRule {
 		},
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_virt_controller_ready_total",
+				Record: "kubevirt_virt_controller_ready",
 				Expr: intstr.FromString(
-					fmt.Sprintf("sum(kubevirt_virt_controller_ready{namespace='%s'}) or vector(0)", namespace),
+					fmt.Sprintf("sum(kubevirt_virt_controller_ready_status{namespace='%s'}) or vector(0)", namespace),
 				),
 			},
 			MType: prometheusv1.MetricTypeGauge,
@@ -587,7 +587,7 @@ func GetRecordingRules(namespace string) []KubevirtRecordingRule {
 		},
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_virt_operator_up_total",
+				Record: "kubevirt_virt_operator_up",
 				Expr: intstr.FromString(
 					fmt.Sprintf("sum(up{namespace='%s', pod=~'virt-operator-.*'}) or vector(0)", namespace),
 				),
@@ -597,9 +597,9 @@ func GetRecordingRules(namespace string) []KubevirtRecordingRule {
 		},
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_virt_operator_ready_total",
+				Record: "kubevirt_virt_operator_ready",
 				Expr: intstr.FromString(
-					fmt.Sprintf("sum(kubevirt_virt_operator_ready{namespace='%s'}) or vector(0)", namespace),
+					fmt.Sprintf("sum(kubevirt_virt_operator_ready_status{namespace='%s'}) or vector(0)", namespace),
 				),
 			},
 			MType: prometheusv1.MetricTypeGauge,
@@ -607,9 +607,9 @@ func GetRecordingRules(namespace string) []KubevirtRecordingRule {
 		},
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_virt_operator_leading_total",
+				Record: "kubevirt_virt_operator_leading",
 				Expr: intstr.FromString(
-					fmt.Sprintf("sum(kubevirt_virt_operator_leading{namespace='%s'})", namespace),
+					fmt.Sprintf("sum(kubevirt_virt_operator_leading_status{namespace='%s'})", namespace),
 				),
 			},
 			MType: prometheusv1.MetricTypeGauge,
@@ -617,7 +617,7 @@ func GetRecordingRules(namespace string) []KubevirtRecordingRule {
 		},
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_virt_handler_up_total",
+				Record: "kubevirt_virt_handler_up",
 				Expr:   intstr.FromString(fmt.Sprintf("sum(up{pod=~'virt-handler-.*', namespace='%s'}) or vector(0)", namespace)),
 			},
 			MType: prometheusv1.MetricTypeGauge,
@@ -657,7 +657,7 @@ func GetRecordingRules(namespace string) []KubevirtRecordingRule {
 		},
 		{
 			Rule: v1.Rule{
-				Record: "kubevirt_vmsnapshot_disks_restored_from_source_total",
+				Record: "kubevirt_vmsnapshot_disks_restored_from_source",
 				Expr:   intstr.FromString("sum by(vm_name, vm_namespace) (kubevirt_vmsnapshot_persistentvolumeclaim_labels)"),
 			},
 			MType: prometheusv1.MetricTypeGauge,
diff --git a/tests/infrastructure/prometheus.go b/tests/infrastructure/prometheus.go
index 87a46241cbe0..03b0bd936337 100644
--- a/tests/infrastructure/prometheus.go
+++ b/tests/infrastructure/prometheus.go
@@ -349,9 +349,9 @@ var _ = DescribeInfra("[rfe_id:3187][crit:medium][vendor:cnv-qe@redhat.com][leve
 				continue
 			}
 			switch data {
-			case "kubevirt_virt_controller_leading 1":
+			case "kubevirt_virt_controller_leading_status 1":
 				foundMetrics["leading"]++
-			case "kubevirt_virt_controller_ready 1":
+			case "kubevirt_virt_controller_ready_status 1":
 				foundMetrics["ready"]++
 			}
 		}
@@ -384,9 +384,9 @@ var _ = DescribeInfra("[rfe_id:3187][crit:medium][vendor:cnv-qe@redhat.com][leve
 				continue
 			}
 			switch data {
-			case "kubevirt_virt_operator_leading 1":
+			case "kubevirt_virt_operator_leading_status 1":
 				foundMetrics["leading"]++
-			case "kubevirt_virt_operator_ready 1":
+			case "kubevirt_virt_operator_ready_status 1":
 				foundMetrics["ready"]++
 			}
 		}
@@ -519,18 +519,18 @@ var _ = DescribeInfra("[rfe_id:3187][crit:medium][vendor:cnv-qe@redhat.com][leve
 		},
 			Entry("[test_id:4142] storage flush requests metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_flush_requests_total", ">="),
 			Entry("[test_id:6228] storage flush requests metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_flush_requests_total", ">="),
-			Entry("[test_id:4142] time (ms) spent on cache flushing metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_flush_times_ms_total", ">="),
-			Entry("[test_id:6229] time (ms) spent on cache flushing metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_flush_times_ms_total", ">="),
+			Entry("[test_id:4142] time spent on cache flushing metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_flush_times_seconds_total", ">="),
+			Entry("[test_id:6229] time spent on cache flushing metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_flush_times_seconds_total", ">="),
 			Entry("[test_id:4142] I/O read operations metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_iops_read_total", ">="),
 			Entry("[test_id:6230] I/O read operations metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_iops_read_total", ">="),
 			Entry("[test_id:4142] I/O write operations metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_iops_write_total", ">="),
 			Entry("[test_id:6231] I/O write operations metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_iops_write_total", ">="),
-			Entry("[test_id:4142] storage read operation time metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_read_times_ms_total", ">="),
-			Entry("[test_id:6232] storage read operation time metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_read_times_ms_total", ">="),
+			Entry("[test_id:4142] storage read operation time metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_read_times_seconds_total", ">="),
+			Entry("[test_id:6232] storage read operation time metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_read_times_seconds_total", ">="),
 			Entry("[test_id:4142] storage read traffic in bytes metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_read_traffic_bytes_total", ">="),
 			Entry("[test_id:6233] storage read traffic in bytes metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_read_traffic_bytes_total", ">="),
-			Entry("[test_id:4142] storage write operation time metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_write_times_ms_total", ">="),
-			Entry("[test_id:6234] storage write operation time metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_write_times_ms_total", ">="),
+			Entry("[test_id:4142] storage write operation time metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_write_times_seconds_total", ">="),
+			Entry("[test_id:6234] storage write operation time metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_write_times_seconds_total", ">="),
 			Entry("[test_id:4142] storage write traffic in bytes metric by using IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_storage_write_traffic_bytes_total", ">="),
 			Entry("[test_id:6235] storage write traffic in bytes metric by using IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_storage_write_traffic_bytes_total", ">="),
 		)
@@ -556,8 +556,8 @@ var _ = DescribeInfra("[rfe_id:3187][crit:medium][vendor:cnv-qe@redhat.com][leve
 			Entry("[test_id:6237] memory metrics by IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_memory", ">="),
 			Entry("[test_id:4553] vcpu wait by IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_vcpu_wait", "=="),
 			Entry("[test_id:6238] vcpu wait by IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_vcpu_wait", "=="),
-			Entry("[test_id:4554] vcpu seconds by IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_vcpu_seconds", ">="),
-			Entry("[test_id:6239] vcpu seconds by IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_vcpu_seconds", ">="),
+			Entry("[test_id:4554] vcpu seconds by IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_vcpu_seconds_total", ">="),
+			Entry("[test_id:6239] vcpu seconds by IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_vcpu_seconds_total", ">="),
 			Entry("[test_id:4556] vmi unused memory by IPv4", k8sv1.IPv4Protocol, "kubevirt_vmi_memory_unused_bytes", ">="),
 			Entry("[test_id:6240] vmi unused memory by IPv6", k8sv1.IPv6Protocol, "kubevirt_vmi_memory_unused_bytes", ">="),
 		)
@@ -644,7 +644,7 @@ var _ = DescribeInfra("[rfe_id:3187][crit:medium][vendor:cnv-qe@redhat.com][leve
 
 		// Every VMI is labeled with kubevirt.io/nodeName, so just creating a VMI should
 		// be enough to its metrics to contain a kubernetes label
-		metrics := collectMetrics(ip, "kubevirt_vmi_vcpu_seconds")
+		metrics := collectMetrics(ip, "kubevirt_vmi_vcpu_seconds_total")
 		By("Checking collected metrics")
 		keys := libinfra.GetKeysFromMetrics(metrics)
 		containK8sLabel := false
diff --git a/tests/monitoring/vm_monitoring.go b/tests/monitoring/vm_monitoring.go
index 12981bc944ab..885badc59079 100644
--- a/tests/monitoring/vm_monitoring.go
+++ b/tests/monitoring/vm_monitoring.go
@@ -159,14 +159,14 @@ var _ = Describe("[Serial][sig-monitoring]VM Monitoring", Serial, decorators.Sig
 		migration := tests.NewRandomMigration(vmi.Name, vmi.Namespace)
 		tests.RunMigrationAndExpectCompletion(virtClient, migration, tests.MigrationWaitTime)
 
-		waitForMetricValue(virtClient, "kubevirt_migrate_vmi_pending_count", 0)
-		waitForMetricValue(virtClient, "kubevirt_migrate_vmi_scheduling_count", 0)
-		waitForMetricValue(virtClient, "kubevirt_migrate_vmi_running_count", 0)
+		waitForMetricValue(virtClient, "kubevirt_vmi_migrations_pending", 0)
+		waitForMetricValue(virtClient, "kubevirt_vmi_migrations_scheduling", 0)
+		waitForMetricValue(virtClient, "kubevirt_vmi_migrations_running", 0)
 
 		labels := map[string]string{
 			"vmi": vmi.Name,
 		}
-		waitForMetricValueWithLabels(virtClient, "kubevirt_migrate_vmi_succeeded", 1, labels)
+		waitForMetricValueWithLabels(virtClient, "kubevirt_vmi_migrations_succeeded", 1, labels)
 
 		By("Delete VMIs")
 		Expect(virtClient.VirtualMachineInstance(vmi.Namespace).Delete(context.Background(), vmi.Name, &metav1.DeleteOptions{})).To(Succeed())
@@ -190,12 +190,12 @@ var _ = Describe("[Serial][sig-monitoring]VM Monitoring", Serial, decorators.Sig
 		migration.Annotations = map[string]string{v1.MigrationUnschedulablePodTimeoutSecondsAnnotation: "60"}
 		migration = tests.RunMigration(virtClient, migration)
 
-		waitForMetricValue(virtClient, "kubevirt_migrate_vmi_scheduling_count", 1)
+		waitForMetricValue(virtClient, "kubevirt_vmi_migrations_scheduling", 1)
 
 		Eventually(matcher.ThisMigration(migration), 2*time.Minute, 5*time.Second).Should(matcher.BeInPhase(v1.MigrationFailed), "migration creation should fail")
 
-		waitForMetricValue(virtClient, "kubevirt_migrate_vmi_scheduling_count", 0)
-		waitForMetricValueWithLabels(virtClient, "kubevirt_migrate_vmi_failed", 1, labels)
+		waitForMetricValue(virtClient, "kubevirt_vmi_migrations_scheduling", 0)
+		waitForMetricValueWithLabels(virtClient, "kubevirt_vmi_migrations_failed", 1, labels)
 
 		By("Deleting the VMI")
 		Expect(virtClient.VirtualMachineInstance(vmi.Namespace).Delete(context.Background(), vmi.Name, &metav1.DeleteOptions{})).To(Succeed())
@@ -228,7 +228,7 @@ var _ = Describe("[Serial][sig-monitoring]VM Monitoring", Serial, decorators.Sig
 	}
 
 	It("[test_id:8639]Number of disks restored and total restored bytes metric values should be correct", func() {
-		totalMetric := fmt.Sprintf("kubevirt_vmsnapshot_disks_restored_from_source_total{vm_name='simple-vm',vm_namespace='%s'}", util.NamespaceTestDefault)
+		totalMetric := fmt.Sprintf("kubevirt_vmsnapshot_disks_restored_from_source{vm_name='simple-vm',vm_namespace='%s'}", util.NamespaceTestDefault)
 		bytesMetric := fmt.Sprintf("kubevirt_vmsnapshot_disks_restored_from_source_bytes{vm_name='simple-vm',vm_namespace='%s'}", util.NamespaceTestDefault)
 		numPVCs := 2.0
diff --git a/tools/doc-generator/doc-generator.go b/tools/doc-generator/doc-generator.go
index ff4a758e1930..2915d72d51af 100644
--- a/tools/doc-generator/doc-generator.go
+++ b/tools/doc-generator/doc-generator.go
@@ -174,6 +174,16 @@ func getMetricsNotIncludeInEndpointByDefault() metricList {
 			description: "Histogram of VM phase transitions duration from deletion time in seconds.",
 			mType:       "Histogram",
 		},
+		{
+			name:        "kubevirt_virt_operator_leading_status",
+			description: "Indication for an operating virt-operator.",
+			mType:       "Gauge",
+		},
+		{
+			name:        "kubevirt_virt_operator_ready_status",
+			description: "Indication for a virt-operator that is ready to take the lead.",
+			mType:       "Gauge",
+		},
 	}
 
 	for _, rule := range components.GetRecordingRules("") {
diff --git a/tools/prom-metrics-collector/metrics_collector.go b/tools/prom-metrics-collector/metrics_collector.go
index 3cc741dcd9d5..2987de02b7e6 100644
--- a/tools/prom-metrics-collector/metrics_collector.go
+++ b/tools/prom-metrics-collector/metrics_collector.go
@@ -29,36 +29,11 @@ import (
 	dto "github.com/prometheus/client_model/go"
 )
 
-// excludedMetrics defines the metrics to ignore, open issue:https://github.com/kubevirt/kubevirt/issues/9714
-// Follow the Metrics Naming Guidelines: https://sdk.operatorframework.io/docs/best-practices/observability-best-practices/#metrics-guidelines
-// Do not add metrics to this list!
+// This list should be used only in very rare cases where the naming conventions explained in the best practices,
+// https://sdk.operatorframework.io/docs/best-practices/observability-best-practices/#metrics-guidelines,
+// have to be ignored.
var excludedMetrics = map[string]struct{}{ - "kubevirt_allocatable_nodes_count": struct{}{}, - "kubevirt_kvm_available_nodes_count": struct{}{}, - "kubevirt_migrate_vmi_pending_count": struct{}{}, - "kubevirt_migrate_vmi_running_count": struct{}{}, - "kubevirt_migrate_vmi_scheduling_count": struct{}{}, - "kubevirt_virt_api_up_total": struct{}{}, - "kubevirt_virt_controller_ready_total": struct{}{}, - "kubevirt_virt_controller_up_total": struct{}{}, - "kubevirt_virt_handler_up_total": struct{}{}, - "kubevirt_virt_operator_leading_total": struct{}{}, - "kubevirt_virt_operator_ready_total": struct{}{}, - "kubevirt_virt_operator_up_total": struct{}{}, - "kubevirt_vmi_filesystem_capacity_bytes_total": struct{}{}, - "kubevirt_vmi_memory_domain_bytes_total": struct{}{}, - "kubevirt_vmi_memory_pgmajfault": struct{}{}, - "kubevirt_vmi_memory_pgminfault": struct{}{}, - "kubevirt_vmi_memory_swap_in_traffic_bytes_total": struct{}{}, - "kubevirt_vmi_memory_swap_out_traffic_bytes_total": struct{}{}, - "kubevirt_vmi_outdated_count": struct{}{}, - "kubevirt_vmi_phase_count": struct{}{}, - "kubevirt_vmi_storage_flush_times_ms_total": struct{}{}, - "kubevirt_vmi_storage_read_times_ms_total": struct{}{}, - "kubevirt_vmi_storage_write_times_ms_total": struct{}{}, - "kubevirt_vmi_vcpu_seconds": struct{}{}, - "kubevirt_vmi_vcpu_wait_seconds": struct{}{}, - "kubevirt_vmsnapshot_disks_restored_from_source_total": struct{}{}, + "kubevirt_vmi_phase_count": struct{}{}, } // Extract the name, help, and type from the metrics doc file
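With the exclusion list above trimmed to a single grandfathered metric, every newly added metric is expected to satisfy the conventions. A hypothetical sketch of how a lint check can consult such a list (violatesNamingConventions is an assumed helper, not code from this patch):

    // reportProblematicNames returns the metric names that break the naming
    // conventions and are not explicitly exempted in excludedMetrics.
    // violatesNamingConventions is an assumed helper, not part of this patch.
    func reportProblematicNames(names []string) []string {
        var problematic []string
        for _, name := range names {
            if _, exempt := excludedMetrics[name]; exempt {
                continue // grandfathered, e.g. kubevirt_vmi_phase_count
            }
            if violatesNamingConventions(name) {
                problematic = append(problematic, name)
            }
        }
        return problematic
    }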