Remove CPU sampling
With this commit we remove the ability to sample CPU usage. For coarse-grained analysis we suggest using system monitoring tools (e.g. Metricbeat) instead; for fine-grained analysis a profiler is a better choice.

Relates elastic#696
danielmitterdorfer authored May 23, 2019
1 parent b93cb00 commit 42778f4
Showing 7 changed files with 9 additions and 76 deletions.
1 change: 0 additions & 1 deletion docs/metrics.rst
@@ -132,7 +132,6 @@ Rally stores the following metrics:
 * ``merge_parts_total_docs_*``: See ``merge_parts_total_time_*``
 * ``disk_io_write_bytes``: number of bytes that have been written to disk during the benchmark. On Linux this metric reports only the bytes that have been written by Elasticsearch, on Mac OS X it reports the number of bytes written by all processes.
 * ``disk_io_read_bytes``: number of bytes that have been read from disk during the benchmark. The same caveats apply on Mac OS X as for ``disk_io_write_bytes``.
-* ``cpu_utilization_1s``: CPU usage in percent of the Elasticsearch process based on a one second sample period. The maximum value is N * 100% where N is the number of CPU cores available.
 * ``node_startup_time``: The time in seconds it took from process start until the node is up.
 * ``node_total_old_gen_gc_time``: The total runtime of the old generation garbage collector across the whole cluster as reported by the node stats API.
 * ``node_total_young_gen_gc_time``: The total runtime of the young generation garbage collector across the whole cluster as reported by the node stats API.
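For context on the metric removed here: the measurement was based on psutil (see the sysstats.py change below), whose per-process value can exceed 100% for a multi-threaded process. A minimal sketch of the same one-second sample; the PID is a placeholder, not part of this commit:

import psutil

ES_PID = 12345  # placeholder; substitute the PID of the running Elasticsearch process

process = psutil.Process(ES_PID)
# cpu_percent(interval=1.0) blocks for one second and returns the CPU usage
# over that window; for a multi-threaded process the value can exceed 100%,
# up to N * 100% for N available cores.
print("cpu_utilization_1s: {}%".format(process.cpu_percent(interval=1.0)))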
9 changes: 9 additions & 0 deletions docs/migrate.rst
@@ -1,6 +1,15 @@
 Migration Guide
 ===============
 
+Migrating to Rally 1.2.0
+------------------------
+
+CPU usage is not measured anymore
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+With Rally 1.2.0, CPU usage will neither be measured nor reported. We suggest using system monitoring tools such as ``mpstat``, ``sar``, or `Metricbeat <https://www.elastic.co/downloads/beats/metricbeat>`_ instead.
+
+
 Migrating to Rally 1.1.0
 ------------------------
 
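As an illustration of the suggested alternative (not part of this commit), system-wide CPU usage can be sampled once per second with psutil, similar in spirit to what mpstat -P ALL 1 reports; the duration below is an arbitrary example value:

import psutil

def sample_system_cpu(duration_seconds):
    """Collect one system-wide, per-core CPU sample per second."""
    samples = []
    for _ in range(duration_seconds):
        # interval=1 blocks for the one-second sampling window;
        # percpu=True returns one utilization value per core.
        samples.append(psutil.cpu_percent(interval=1, percpu=True))
    return samples

for per_core in sample_system_cpu(duration_seconds=5):
    print(" ".join("{:5.1f}%".format(core) for core in per_core))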
7 changes: 0 additions & 7 deletions docs/summary_report.rst
@@ -117,13 +117,6 @@ ML processing time
 * **Corresponding metrics key**: ``ml_processing_time``
 
 
-Median CPU usage
-----------------
-
-* **Definition**: Median CPU usage in percent of the Elasticsearch process during the whole race based on a one second sample period. The maximum value is N * 100% where N is the number of CPU cores available
-* **Corresponding metrics key**: ``cpu_utilization_1s``
-
-
 Total Young Gen GC
 ------------------
 
2 changes: 0 additions & 2 deletions esrally/mechanic/launcher.py
@@ -226,7 +226,6 @@ def start(self, node_configurations):
         # only support a subset of telemetry for Docker hosts (specifically, we do not allow users to enable any devices)
         node_telemetry = [
             telemetry.DiskIo(self.metrics_store, len(node_configurations)),
-            telemetry.CpuUsage(self.metrics_store),
             telemetry.NodeEnvironmentInfo(self.metrics_store)
         ]
         t = telemetry.Telemetry(devices=node_telemetry)
@@ -328,7 +327,6 @@ def _start_node(self, node_configuration, node_count_on_host):
             telemetry.Gc(node_telemetry_dir, java_major_version),
             telemetry.PerfStat(node_telemetry_dir),
             telemetry.DiskIo(self.metrics_store, node_count_on_host),
-            telemetry.CpuUsage(self.metrics_store),
             telemetry.NodeEnvironmentInfo(self.metrics_store),
             telemetry.IndexSize(data_paths, self.metrics_store),
             telemetry.MergeParts(self.metrics_store, node_configuration.log_path),
46 changes: 0 additions & 46 deletions esrally/mechanic/telemetry.py
@@ -876,52 +876,6 @@ def on_benchmark_stop(self):
             self.logger.exception("Could not determine I/O stats at benchmark end.")
 
 
-class CpuUsage(InternalTelemetryDevice):
-    """
-    Gathers CPU usage statistics.
-    """
-    def __init__(self, metrics_store):
-        super().__init__()
-        self.metrics_store = metrics_store
-        self.sampler = None
-        self.node = None
-
-    def attach_to_node(self, node):
-        self.node = node
-
-    def on_benchmark_start(self):
-        if self.node:
-            recorder = CpuUsageRecorder(self.node, self.metrics_store)
-            self.sampler = SamplerThread(recorder)
-            self.sampler.setDaemon(True)
-            self.sampler.start()
-
-    def on_benchmark_stop(self):
-        if self.sampler:
-            self.sampler.finish()
-
-
-class CpuUsageRecorder:
-    def __init__(self, node, metrics_store):
-        self.node = node
-        self.process = sysstats.setup_process_stats(node.process.pid)
-        self.metrics_store = metrics_store
-        # the call is blocking already; there is no need for additional waiting in the sampler thread.
-        self.sample_interval = 0
-
-    def record(self):
-        import psutil
-        try:
-            self.metrics_store.put_value_node_level(node_name=self.node.node_name, name="cpu_utilization_1s",
-                                                    value=sysstats.cpu_utilization(self.process), unit="%")
-        # this can happen when the Elasticsearch process has been terminated already and we were not quick enough to stop.
-        except psutil.NoSuchProcess:
-            pass
-
-    def __str__(self):
-        return "cpu utilization"
-
-
 def store_node_attribute_metadata(metrics_store, nodes_info):
     # push up all node level attributes to cluster level iff the values are identical for all nodes
     pseudo_cluster_attributes = {}
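Users who still need per-process sampling can reproduce the removed recorder outside Rally. The sketch below replaces Rally's internal SamplerThread and metrics store with a plain daemon thread and an in-memory list; all class and attribute names are illustrative, only psutil's API is real:

import threading

import psutil

class CpuUsageSampler:
    """Samples a process's CPU utilization once per second on a daemon thread."""

    def __init__(self, pid):
        self.process = psutil.Process(pid)
        self.samples = []
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)

    def start(self):
        self._thread.start()

    def finish(self):
        self._stop.set()
        self._thread.join()

    def _run(self):
        while not self._stop.is_set():
            try:
                # blocks for one second, mirroring the one-second sample
                # period of the removed cpu_utilization_1s metric
                self.samples.append(self.process.cpu_percent(interval=1.0))
            except psutil.NoSuchProcess:
                # the monitored process terminated before sampling stopped;
                # the removed CpuUsageRecorder.record above has the same guard
                break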
11 changes: 0 additions & 11 deletions esrally/reporter.py
@@ -198,9 +198,6 @@ def __call__(self):
self.logger.debug("Gathering ML max processing times.")
result.ml_processing_time = self.ml_processing_time_stats()

self.logger.debug("Gathering CPU usage metrics.")
result.median_cpu_usage = self.median("cpu_utilization_1s", sample_type=metrics.SampleType.Normal)

self.logger.debug("Gathering garbage collection metrics.")
result.young_gc_time = self.sum("node_total_young_gen_gc_time")
result.old_gc_time = self.sum("node_total_old_gen_gc_time")
@@ -345,8 +342,6 @@ def __init__(self, d=None):
         self.merge_part_time_vectors = self.v(d, "merge_part_time_vectors")
         self.merge_part_time_points = self.v(d, "merge_part_time_points")
 
-        self.median_cpu_usage = self.v(d, "median_cpu_usage")
-
         self.young_gc_time = self.v(d, "young_gc_time")
         self.old_gc_time = self.v(d, "old_gc_time")
 
@@ -484,7 +479,6 @@ def report(self):
         metrics_table.extend(self.report_merge_part_times(stats))
         metrics_table.extend(self.report_ml_processing_times(stats))
 
-        metrics_table.extend(self.report_cpu_usage(stats))
         metrics_table.extend(self.report_gc_times(stats))
 
         metrics_table.extend(self.report_disk_usage(stats))
@@ -611,11 +605,6 @@ def report_ml_processing_times(self, stats):
lines.append(self.line("Max ML processing time", job_name, processing_time["max"], unit))
return lines

def report_cpu_usage(self, stats):
return self.join(
self.line("Median CPU usage", "", stats.median_cpu_usage, "%")
)

def report_gc_times(self, stats):
return self.join(
self.line("Total Young Gen GC", "", stats.young_gc_time, "s", convert.ms_to_seconds),
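The removed reporting line computed a median over all cpu_utilization_1s samples gathered during the race. In plain Python that reduces to the following; the sample values are hypothetical:

import statistics

# hypothetical per-second samples, in percent (values above 100% are
# possible on multi-core machines)
cpu_utilization_samples = [183.2, 201.7, 195.4, 188.9, 210.3]

print("Median CPU usage: {:.1f}%".format(statistics.median(cpu_utilization_samples)))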
9 changes: 0 additions & 9 deletions esrally/utils/sysstats.py
@@ -91,12 +91,3 @@ def setup_process_stats(pid):
     :return: An opaque handle that has to be provided for all subsequent calls to process stats APIs.
     """
     return psutil.Process(pid)
-
-
-def cpu_utilization(handle, interval=1.0):
-    """
-    :param handle: handle retrieved by calling setup_process_stats(pid).
-    :param interval: The measurement interval in seconds. Optional. Defaults to 1 second.
-    :return: The CPU usage in percent.
-    """
-    return handle.cpu_percent(interval=interval)
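For reference, the helper removed above was a thin wrapper around psutil's Process.cpu_percent; before this commit a caller would have used it roughly as follows (the PID is a placeholder):

from esrally.utils import sysstats

handle = sysstats.setup_process_stats(12345)  # a psutil.Process under the hood
# blocks for the default one-second interval and returns the usage in percent
print(sysstats.cpu_utilization(handle))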
