Remove CPU sampling
With this commit we remove the ability to sample CPU usage. For coarse-grained analysis we suggest using system monitoring tools (e.g. Metricbeat) instead; for fine-grained analysis a profiler is a better choice.

Relates elastic#696
danielmitterdorfer authored May 23, 2019
1 parent b93cb00 commit 42778f4
Showing 7 changed files with 9 additions and 76 deletions.
1 change: 0 additions & 1 deletion docs/metrics.rst
@@ -132,7 +132,6 @@ Rally stores the following metrics:
 * ``merge_parts_total_docs_*``: See ``merge_parts_total_time_*``
 * ``disk_io_write_bytes``: number of bytes that have been written to disk during the benchmark. On Linux this metric reports only the bytes that have been written by Elasticsearch, on Mac OS X it reports the number of bytes written by all processes.
 * ``disk_io_read_bytes``: number of bytes that have been read from disk during the benchmark. The same caveats apply on Mac OS X as for ``disk_io_write_bytes``.
-* ``cpu_utilization_1s``: CPU usage in percent of the Elasticsearch process based on a one second sample period. The maximum value is N * 100% where N is the number of CPU cores available.
 * ``node_startup_time``: The time in seconds it took from process start until the node is up.
 * ``node_total_old_gen_gc_time``: The total runtime of the old generation garbage collector across the whole cluster as reported by the node stats API.
 * ``node_total_young_gen_gc_time``: The total runtime of the young generation garbage collector across the whole cluster as reported by the node stats API.
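For context on the metric removed here: the measurement was based on psutil (see the sysstats.py change below), whose per-process value can exceed 100% for a multi-threaded process. A minimal sketch of the same one-second sample; the PID is a placeholder, not part of this commit:

import psutil

ES_PID = 12345  # placeholder; substitute the PID of the running Elasticsearch process

process = psutil.Process(ES_PID)
# cpu_percent(interval=1.0) blocks for one second and returns the CPU usage
# over that window; for a multi-threaded process the value can exceed 100%,
# up to N * 100% for N available cores.
print("cpu_utilization_1s: {}%".format(process.cpu_percent(interval=1.0)))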
9 changes: 9 additions & 0 deletions docs/migrate.rst
@@ -1,6 +1,15 @@
 Migration Guide
 ===============
 
+Migrating to Rally 1.2.0
+------------------------
+
+CPU usage is not measured anymore
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+With Rally 1.2.0, CPU usage will neither be measured nor reported. We suggest using system monitoring tools such as ``mpstat``, ``sar``, or `Metricbeat <https://www.elastic.co/downloads/beats/metricbeat>`_ instead.
+
+
 Migrating to Rally 1.1.0
 ------------------------
 
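As an illustration of the suggested alternative (not part of this commit), system-wide CPU usage can be sampled once per second with psutil, similar in spirit to what mpstat -P ALL 1 reports; the duration below is an arbitrary example value:

import psutil

def sample_system_cpu(duration_seconds):
    """Collect one system-wide, per-core CPU sample per second."""
    samples = []
    for _ in range(duration_seconds):
        # interval=1 blocks for the one-second sampling window;
        # percpu=True returns one utilization value per core.
        samples.append(psutil.cpu_percent(interval=1, percpu=True))
    return samples

for per_core in sample_system_cpu(duration_seconds=5):
    print(" ".join("{:5.1f}%".format(core) for core in per_core))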
7 changes: 0 additions & 7 deletions docs/summary_report.rst
@@ -117,13 +117,6 @@ ML processing time
 * **Corresponding metrics key**: ``ml_processing_time``
 
 
-Median CPU usage
-----------------
-
-* **Definition**: Median CPU usage in percent of the Elasticsearch process during the whole race based on a one second sample period. The maximum value is N * 100% where N is the number of CPU cores available
-* **Corresponding metrics key**: ``cpu_utilization_1s``
-
-
 Total Young Gen GC
 ------------------
 
2 changes: 0 additions & 2 deletions esrally/mechanic/launcher.py
@@ -226,7 +226,6 @@ def start(self, node_configurations):
         # only support a subset of telemetry for Docker hosts (specifically, we do not allow users to enable any devices)
         node_telemetry = [
             telemetry.DiskIo(self.metrics_store, len(node_configurations)),
-            telemetry.CpuUsage(self.metrics_store),
             telemetry.NodeEnvironmentInfo(self.metrics_store)
         ]
         t = telemetry.Telemetry(devices=node_telemetry)
@@ -328,7 +327,6 @@ def _start_node(self, node_configuration, node_count_on_host):
             telemetry.Gc(node_telemetry_dir, java_major_version),
             telemetry.PerfStat(node_telemetry_dir),
             telemetry.DiskIo(self.metrics_store, node_count_on_host),
-            telemetry.CpuUsage(self.metrics_store),
             telemetry.NodeEnvironmentInfo(self.metrics_store),
             telemetry.IndexSize(data_paths, self.metrics_store),
             telemetry.MergeParts(self.metrics_store, node_configuration.log_path),
46 changes: 0 additions & 46 deletions esrally/mechanic/telemetry.py
@@ -876,52 +876,6 @@ def on_benchmark_stop(self):
             self.logger.exception("Could not determine I/O stats at benchmark end.")
 
 
-class CpuUsage(InternalTelemetryDevice):
-    """
-    Gathers CPU usage statistics.
-    """
-    def __init__(self, metrics_store):
-        super().__init__()
-        self.metrics_store = metrics_store
-        self.sampler = None
-        self.node = None
-
-    def attach_to_node(self, node):
-        self.node = node
-
-    def on_benchmark_start(self):
-        if self.node:
-            recorder = CpuUsageRecorder(self.node, self.metrics_store)
-            self.sampler = SamplerThread(recorder)
-            self.sampler.setDaemon(True)
-            self.sampler.start()
-
-    def on_benchmark_stop(self):
-        if self.sampler:
-            self.sampler.finish()
-
-
-class CpuUsageRecorder:
-    def __init__(self, node, metrics_store):
-        self.node = node
-        self.process = sysstats.setup_process_stats(node.process.pid)
-        self.metrics_store = metrics_store
-        # the call is blocking already; there is no need for additional waiting in the sampler thread.
-        self.sample_interval = 0
-
-    def record(self):
-        import psutil
-        try:
-            self.metrics_store.put_value_node_level(node_name=self.node.node_name, name="cpu_utilization_1s",
-                                                    value=sysstats.cpu_utilization(self.process), unit="%")
-        # this can happen when the Elasticsearch process has been terminated already and we were not quick enough to stop.
-        except psutil.NoSuchProcess:
-            pass
-
-    def __str__(self):
-        return "cpu utilization"
-
-
 def store_node_attribute_metadata(metrics_store, nodes_info):
     # push up all node level attributes to cluster level iff the values are identical for all nodes
     pseudo_cluster_attributes = {}
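Users who still need per-process sampling can reproduce the removed recorder outside Rally. The sketch below replaces Rally's internal SamplerThread and metrics store with a plain daemon thread and an in-memory list; all class and attribute names are illustrative, only psutil's API is real:

import threading

import psutil

class CpuUsageSampler:
    """Samples a process's CPU utilization once per second on a daemon thread."""

    def __init__(self, pid):
        self.process = psutil.Process(pid)
        self.samples = []
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)

    def start(self):
        self._thread.start()

    def finish(self):
        self._stop.set()
        self._thread.join()

    def _run(self):
        while not self._stop.is_set():
            try:
                # blocks for one second, mirroring the one-second sample
                # period of the removed cpu_utilization_1s metric
                self.samples.append(self.process.cpu_percent(interval=1.0))
            except psutil.NoSuchProcess:
                # the monitored process terminated before sampling stopped;
                # the removed CpuUsageRecorder.record above has the same guard
                break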
11 changes: 0 additions & 11 deletions esrally/reporter.py
@@ -198,9 +198,6 @@ def __call__(self):
self.logger.debug("Gathering ML max processing times.")
result.ml_processing_time = self.ml_processing_time_stats()

self.logger.debug("Gathering CPU usage metrics.")
result.median_cpu_usage = self.median("cpu_utilization_1s", sample_type=metrics.SampleType.Normal)

self.logger.debug("Gathering garbage collection metrics.")
result.young_gc_time = self.sum("node_total_young_gen_gc_time")
result.old_gc_time = self.sum("node_total_old_gen_gc_time")
@@ -345,8 +342,6 @@ def __init__(self, d=None):
         self.merge_part_time_vectors = self.v(d, "merge_part_time_vectors")
         self.merge_part_time_points = self.v(d, "merge_part_time_points")
 
-        self.median_cpu_usage = self.v(d, "median_cpu_usage")
-
         self.young_gc_time = self.v(d, "young_gc_time")
         self.old_gc_time = self.v(d, "old_gc_time")
 
@@ -484,7 +479,6 @@ def report(self):
         metrics_table.extend(self.report_merge_part_times(stats))
         metrics_table.extend(self.report_ml_processing_times(stats))
 
-        metrics_table.extend(self.report_cpu_usage(stats))
         metrics_table.extend(self.report_gc_times(stats))
 
         metrics_table.extend(self.report_disk_usage(stats))
@@ -611,11 +605,6 @@ def report_ml_processing_times(self, stats):
lines.append(self.line("Max ML processing time", job_name, processing_time["max"], unit))
return lines

def report_cpu_usage(self, stats):
return self.join(
self.line("Median CPU usage", "", stats.median_cpu_usage, "%")
)

def report_gc_times(self, stats):
return self.join(
self.line("Total Young Gen GC", "", stats.young_gc_time, "s", convert.ms_to_seconds),
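The removed reporting line computed a median over all cpu_utilization_1s samples gathered during the race. In plain Python that reduces to the following; the sample values are hypothetical:

import statistics

# hypothetical per-second samples, in percent (values above 100% are
# possible on multi-core machines)
cpu_utilization_samples = [183.2, 201.7, 195.4, 188.9, 210.3]

print("Median CPU usage: {:.1f}%".format(statistics.median(cpu_utilization_samples)))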
9 changes: 0 additions & 9 deletions esrally/utils/sysstats.py
@@ -91,12 +91,3 @@ def setup_process_stats(pid):
     :return: An opaque handle that has to be provided for all subsequent calls to process stats APIs.
     """
     return psutil.Process(pid)
-
-
-def cpu_utilization(handle, interval=1.0):
-    """
-    :param handle: handle retrieved by calling setup_process_stats(pid).
-    :param interval: The measurement interval in seconds. Optional. Defaults to 1 second.
-    :return: The CPU usage in percent.
-    """
-    return handle.cpu_percent(interval=interval)
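For reference, the helper removed above was a thin wrapper around psutil's Process.cpu_percent; before this commit a caller would have used it roughly as follows (the PID is a placeholder):

from esrally.utils import sysstats

handle = sysstats.setup_process_stats(12345)  # a psutil.Process under the hood
# blocks for the default one-second interval and returns the usage in percent
print(sysstats.cpu_utilization(handle))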
