Skip to content

Commit

Permalink
[core][dashboard] Adds Dashboard RSS metrics (ray-project#47035)
Browse files Browse the repository at this point in the history
The dashboard exports the `ray_component_uss_mb` metric but not
`ray_component_rss_mb`, even though the latter is used in the Ray Grafana
dashboard "Node Memory by Component". This commit adds it. Note that
gcs_server is still missing.

Signed-off-by: Ruiyang Wang <[email protected]>
  • Loading branch information
rynewang authored Aug 10, 2024
1 parent 6706311 commit 2572bba
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 22 deletions.
10 changes: 9 additions & 1 deletion python/ray/dashboard/dashboard_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,22 @@ def __init__(self, registry: Optional[CollectorRegistry] = None):
namespace="ray",
registry=self.registry,
)
self.metrics_dashboard_mem = Gauge(
self.metrics_dashboard_mem_uss = Gauge(
"component_uss",
"USS usage of all components on the node.",
tuple(COMPONENT_METRICS_TAG_KEYS),
unit="mb",
namespace="ray",
registry=self.registry,
)
self.metrics_dashboard_mem_rss = Gauge(
"component_rss",
"RSS usage of all components on the node.",
tuple(COMPONENT_METRICS_TAG_KEYS),
unit="mb",
namespace="ray",
registry=self.registry,
)

except ImportError:

Expand Down
40 changes: 19 additions & 21 deletions python/ray/dashboard/modules/metrics/metrics_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,30 +306,28 @@ def _create_default_prometheus_configs(self):

@dashboard_utils.async_loop_forever(METRICS_RECORD_INTERVAL_S)
async def record_dashboard_metrics(self):
    """Publish CPU, USS/RSS memory and event-loop-lag gauges for this process.

    Wrapped by ``dashboard_utils.async_loop_forever`` with
    ``METRICS_RECORD_INTERVAL_S`` as its argument. Every gauge is tagged
    with the same label set (ip, pid, Ray version, component, session name).
    Memory values are divided by 1.0e6 before being recorded.
    """
    # One shared label set for every gauge written below.
    labels = {
        "ip": self._ip,
        "pid": self._pid,
        "Version": ray.__version__,
        "Component": self._component,
        "SessionName": self._session_name,
    }
    metrics = self._dashboard_head.metrics

    metrics.metrics_dashboard_cpu.labels(**labels).set(
        float(self._dashboard_proc.cpu_percent())
    )

    # Take a single memory snapshot and reuse it for both gauges, instead of
    # calling memory_full_info() once per gauge.
    mem_info = self._dashboard_proc.memory_full_info()
    metrics.metrics_dashboard_mem_uss.labels(**labels).set(
        float(mem_info.uss) / 1.0e6
    )
    metrics.metrics_dashboard_mem_rss.labels(**labels).set(
        float(mem_info.rss) / 1.0e6
    )

    # Report the max lag since the last export, if any.
    if self._event_loop_lag_s_max is not None:
        metrics.metrics_event_loop_lag.labels(**labels).set(
            float(self._event_loop_lag_s_max)
        )
        # Reset so the next export only reflects lag observed after this one.
        self._event_loop_lag_s_max = None

async def run(self, server):
Expand Down

0 comments on commit 2572bba

Please sign in to comment.