Skip to content

Commit

Permalink
feature(ArtifactsTest): IOTune parameters validation
Browse files Browse the repository at this point in the history
This commit adds a new subtest inside the Artifacts test that checks
the machine image's io_properties.yaml by comparing its values to the
actual machine values and showing the deviation.

Fixes scylladb/qa-tasks#1787
  • Loading branch information
k0machi committed Feb 25, 2025
1 parent b11ba22 commit 4e35601
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 1 deletion.
17 changes: 17 additions & 0 deletions artifacts_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import yaml
import requests

from sdcm.argus_results import send_iotune_results_to_argus
from sdcm.sct_events import Severity
from sdcm.sct_events.database import ScyllaHousekeepingServiceEvent
from sdcm.provision.helpers.certificate import c_s_transport_str
Expand All @@ -30,6 +31,7 @@
from sdcm.utils.decorators import retrying
from sdcm.utils.issues import SkipPerIssues
from sdcm.utils.perftune_validator import PerftuneOutputChecker
from sdcm.utils.validators.iotune import IOTuneValidator
from utils.scylla_doctor import ScyllaDoctor

STRESS_CMD: str = "/usr/bin/cassandra-stress"
Expand Down Expand Up @@ -325,6 +327,21 @@ def test_scylla_service(self):
with self.subTest("check ENA support"):
assert self.node.ena_support, "ENA support is not enabled"

if backend in ["gce", "aws", "azure"] and self.params.get("use_preinstalled_scylla", False):
with self.subTest("check Scylla IO Params"):
try:
validator = IOTuneValidator(self.node)
validator.validate()
send_iotune_results_to_argus(
self.test_config.argus_client(),
validator.preset_io_properties,
validator.node_io_properties,
self.node,
self.params
)
except Exception: # pylint: disable=broad-except # noqa: BLE001
self.log.warning("IOTuneValidator failed", exc_info=True)

with self.subTest("verify write cache for NVMe devices"):
self.verify_nvme_write_cache()

Expand Down
84 changes: 83 additions & 1 deletion sdcm/argus_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,23 @@
#
# Copyright (c) 2024 ScyllaDB
import json
import logging
import time
from datetime import timezone, datetime

from argus.client import ArgusClient
from argus.client.base import ArgusClientError
from argus.client.generic_result import GenericResultTable, ColumnMetadata, ResultType, Status, ValidationRule

from sdcm.cluster import BaseNode
from sdcm.sct_config import SCTConfiguration
from sdcm.sct_events import Severity
from sdcm.sct_events.event_counter import STALL_INTERVALS
from sdcm.sct_events.system import FailedResultEvent
from sdcm.sct_events.system import FailedResultEvent, TestFrameworkEvent
from sdcm.utils.validators.iotune import IOProperties, IOTuneDiskParams


LOGGER = logging.getLogger(__name__)

LATENCY_ERROR_THRESHOLDS = {
"replace_node": {
Expand Down Expand Up @@ -255,3 +263,77 @@ def send_manager_benchmark_results_to_argus(argus_client: ArgusClient, result: d
for key, value in result.items():
result_table.add_result(column=key, row=row_name, value=value, status=Status.UNSET)
submit_results_to_argus(argus_client, result_table)


def send_iotune_results_to_argus(argus_client: ArgusClient, preset_io_props: IOProperties, io_properties: IOProperties, node: BaseNode, params: SCTConfiguration):
    """Submit iotune disk measurements and their deviation from the image preset to Argus.

    Builds two result tables - one with the raw metrics measured on the node,
    one with the absolute deviation of each metric from the values shipped in
    the image's io_properties.yaml - and submits both to Argus.

    :param argus_client: initialized Argus client; nothing is submitted when falsy
    :param preset_io_props: io_properties.yaml content shipped with the machine image
    :param io_properties: io_properties.yaml content measured by iotune on the node
    :param node: node the measurements were taken on (used for the table name)
    :param params: test configuration (used for the cluster backend name)
    """
    def _bottom_limit(metric_val, threshold=0.15):
        """
        Determine disk metric deviation threshold and
        use that as fixed limit
        """
        return metric_val * threshold

    if not argus_client:
        LOGGER.warning("Will not submit to argus - no client initialized")
        return
    preset_disk: IOTuneDiskParams = next(iter(preset_io_props.get("disks", [])), None)
    if not preset_disk:
        LOGGER.warning("Unable to continue - node should have io_properties.yaml, but it doesn't.")
        TestFrameworkEvent(source="send_iotune_results_to_argus",
                           message="Unable to continue - node should have io_properties.yaml, but it doesn't.",
                           severity=Severity.ERROR).publish()
        return
    # Guard the measured side too: a bare next(iter(...)) would raise
    # StopIteration when "disks" is empty, unlike the guarded preset path above.
    tested_disk: IOTuneDiskParams = next(iter(io_properties.get("disks", [])), None)
    if not tested_disk:
        LOGGER.warning("Measured io_properties contains no disks - will not submit iotune results")
        return
    # Work on a shallow copy so pop() below does not mutate the caller's
    # io_properties structure.
    tested_disk = dict(tested_disk)
    tested_mountpoint = tested_disk.pop("mountpoint")
    if tested_mountpoint != preset_disk["mountpoint"]:
        LOGGER.warning("Disks differ - probably a mistake: %s vs %s, will not submit iotune results",
                       tested_mountpoint, preset_disk["mountpoint"])
        return

    class IOPropertiesResultsTable(GenericResultTable):
        class Meta:
            name = f"{params.get('cluster_backend')} - {node.db_node_instance_type} Disk Performance"
            description = "io_properties.yaml comparison with live data"
            Columns = [
                ColumnMetadata(name="read_iops", unit="iops", type=ResultType.INTEGER, higher_is_better=True),
                ColumnMetadata(name="read_bandwidth", unit="bps", type=ResultType.INTEGER, higher_is_better=True),
                ColumnMetadata(name="write_iops", unit="iops", type=ResultType.INTEGER, higher_is_better=True),
                ColumnMetadata(name="write_bandwidth", unit="bps", type=ResultType.INTEGER, higher_is_better=True),
            ]
            # Raw measurements are informational only - no limits enforced.
            ValidationRules = {}

    class IOPropertiesDeviationResultsTable(GenericResultTable):
        class Meta:
            name = f"{params.get('cluster_backend')} - {node.db_node_instance_type} Disk Performance Absolute deviation"
            description = "io_properties.yaml absolute deviation from preset disk"
            Columns = [
                ColumnMetadata(name="read_iops_abs_deviation", unit="iops",
                               type=ResultType.INTEGER, higher_is_better=False),
                ColumnMetadata(name="read_bandwidth_abs_deviation", unit="bps",
                               type=ResultType.INTEGER, higher_is_better=False),
                ColumnMetadata(name="write_iops_abs_deviation", unit="iops",
                               type=ResultType.INTEGER, higher_is_better=False),
                ColumnMetadata(name="write_bandwidth_abs_deviation", unit="bps",
                               type=ResultType.INTEGER, higher_is_better=False),
            ]

            # NOTE(review): assumes the preset disk always carries all four
            # metrics - a missing key would make _bottom_limit fail on None.
            ValidationRules = {
                "read_iops_abs_deviation": ValidationRule(fixed_limit=_bottom_limit(preset_disk.get("read_iops"))),
                "read_bandwidth_abs_deviation": ValidationRule(fixed_limit=_bottom_limit(preset_disk.get("read_bandwidth"))),
                "write_iops_abs_deviation": ValidationRule(fixed_limit=_bottom_limit(preset_disk.get("write_iops"))),
                "write_bandwidth_abs_deviation": ValidationRule(fixed_limit=_bottom_limit(preset_disk.get("write_bandwidth"))),
            }

    table = IOPropertiesResultsTable()
    for key, value in tested_disk.items():
        table.add_result(column=key, row="#1", value=value, status=Status.PASS)
    submit_results_to_argus(argus_client, table)

    table = IOPropertiesDeviationResultsTable()
    for key, value in tested_disk.items():
        preset_val = preset_disk.get(key)
        if preset_val is None:
            # Metric measured on the node but absent from the preset - nothing to deviate from.
            LOGGER.warning("Preset io_properties has no %s value, skipping its deviation", key)
            continue
        table.add_result(column=f"{key}_abs_deviation", row="#1", value=abs(preset_val - value), status=Status.UNSET)

    submit_results_to_argus(argus_client, table)
55 changes: 55 additions & 0 deletions sdcm/utils/validators/iotune.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from copy import deepcopy
import logging
from typing import TypedDict
import yaml
from sdcm.cluster import BaseNode
from sdcm.remote.remote_file import remote_file

LOGGER = logging.getLogger(__name__)


# Per-disk iotune metrics, matching one entry under the "disks" key of io_properties.yaml.
IOTuneDiskParams = TypedDict(
    "IOTuneDiskParams",
    {
        "mountpoint": str,
        "read_iops": int,
        "read_bandwidth": int,
        "write_iops": int,
        "write_bandwidth": int,
    },
)


# Top-level structure of an io_properties.yaml file.
IOProperties = TypedDict("IOProperties", {"disks": list[IOTuneDiskParams]})


class IOTuneValidator:
    """Runs iotune on a node and compares the measured disk parameters against
    the io_properties.yaml preset shipped with the machine image, logging the
    relative deviation of each metric."""

    def __init__(self, node: BaseNode):
        self.node = node
        # io_properties measured by running iotune on the node (filled by validate()).
        self.node_io_properties: IOProperties = {}
        # io_properties shipped with the machine image (filled by validate()).
        self.preset_io_properties: IOProperties = {}

    def validate(self):
        """Run iotune on the node and log each metric's deviation from the preset."""
        self._run_io_tune()
        self._format_results_to_console()

    def _read_io_properties(self, io_props_path="/etc/scylla.d/io_properties.yaml") -> IOProperties:
        """Fetch and parse an io_properties.yaml file from the node."""
        with remote_file(self.node.remoter, io_props_path) as f:
            return yaml.safe_load(f)

    def _run_io_tune(self, temp_props_path="/tmp/io_properties.yaml") -> IOProperties:
        """Run iotune against /var/lib/scylla and load both the freshly measured
        and the preset io_properties into instance attributes."""
        self.node.remoter.sudo(f"iotune --evaluation-directory /var/lib/scylla --properties-file {temp_props_path}")
        self.node_io_properties = self._read_io_properties(temp_props_path)
        self.preset_io_properties = self._read_io_properties()

        return self.node_io_properties

    def _format_results_to_console(self):
        """Log each measured disk metric alongside its percentage deviation
        from the preset disk's value."""
        # The preset is only read, so no copy is needed; guard against a
        # missing or empty "disks" list instead of raising StopIteration.
        preset_disk = next(iter(self.preset_io_properties.get("disks") or []), None)
        if not preset_disk:
            LOGGER.warning("Preset io_properties contains no disks - nothing to compare against")
            return

        for disk in self.node_io_properties.get("disks") or []:
            disk_copy = {**disk}
            mountpoint = disk_copy.pop("mountpoint", "<unknown>")
            LOGGER.info("Disk performance values validation - testing %s", mountpoint)
            for key, val in disk_copy.items():
                preset_val = preset_disk.get(key)
                # Skip missing/zero preset values: the original code raised
                # TypeError (None) or ZeroDivisionError (0) here.
                if not preset_val:
                    LOGGER.warning("[%s] %s: %s (no usable preset value to compare against)", mountpoint, key, val)
                    continue
                deviation_pct = (val / preset_val - 1) * 100
                LOGGER.info("[%s] %s: %s (%.0f%%)", mountpoint, key, val, deviation_pct)

0 comments on commit 4e35601

Please sign in to comment.