forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathk8s_utils.py
100 lines (81 loc) · 3.38 KB
/
k8s_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import logging
import ray._private.utils
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# cgroup file holding the container's CPU shares. Not read by any function
# visible in this section of the file.
CPU_SHARES_PATH = "/sys/fs/cgroup/cpu/cpu.shares"
# cgroup file read by _cpu_usage(); its contents are parsed as one integer.
CPU_USAGE_PATH = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
# Host CPU statistics file read by _system_usage() and _host_num_cpus().
PROC_STAT_PATH = "/proc/stat"
# Not referenced by any function visible in this section of the file.
# NOTE(review): presumably a cache for the container's CPU count -- confirm.
container_num_cpus = None
# Cached result of _host_num_cpus(); None until that function first runs.
host_num_cpus = None
# Samples recorded by the previous successful cpu_percent() call; None until
# cpu_percent() has completed once.
last_cpu_usage = None
last_system_usage = None
def cpu_percent():
    """Estimate CPU usage percent for Ray pod managed by Kubernetes
    Operator.

    Computed by the following steps
       (1) Replicate the logic used by 'docker stats' cli command.
           See https://github.com/docker/cli/blob/c0a6b1c7b30203fbc28cd619acb901a95a80e30e/cli/command/container/stats_helpers.go#L166.
       (2) Divide by the number of CPUs available to the container, so that
           e.g. full capacity use of 2 CPUs will read as 100%,
           rather than 200%.

    Step (1) above works by
        dividing delta in cgroup's cpuacct.usage by
        delta in total host cpu usage, averaged over host's cpus.

    Since deltas are not initially available, return 0.0 on first call.

    Returns:
        The usage percentage rounded to one decimal place and capped at
        100.0. 0.0 is returned on the first call, when no host CPU time
        has elapsed since the previous sample, and when sampling fails
        (the failure is logged with its traceback, never raised).
    """  # noqa
    global last_system_usage
    global last_cpu_usage
    try:
        cpu_usage = _cpu_usage()
        system_usage = _system_usage()
        # Return 0.0 on first call.
        if last_system_usage is None:
            percent = 0.0
        else:
            cpu_delta = cpu_usage - last_cpu_usage
            # "System time passed." (Typically close to clock time.)
            host_delta = system_usage - last_system_usage
            system_delta = host_delta / _host_num_cpus()
            if system_delta == 0:
                # Two calls landed within one /proc/stat tick. There is
                # nothing to divide by, so report 0.0 and keep the previous
                # baseline so the container's CPU delta is not lost.
                return 0.0
            quotient = cpu_delta / system_delta
            percent = round(
                quotient * 100 / ray._private.utils.get_k8s_cpus(), 1)
        last_system_usage = system_usage
        last_cpu_usage = cpu_usage
        # Computed percentage might be slightly above 100%.
        return min(percent, 100.0)
    except Exception:
        # logger.exception() already attaches the active exception and its
        # traceback; passing the exception as an extra positional argument
        # would be treated as a %-format argument and break formatting.
        logger.exception("Error computing CPU usage of Ray Kubernetes pod.")
        return 0.0
def _cpu_usage():
    """Compute total cpu usage of the container in nanoseconds
    by reading from cgroup/cpuacct.

    Returns:
        The contents of CPU_USAGE_PATH parsed as an int.

    Raises:
        OSError: If CPU_USAGE_PATH cannot be opened or read.
        ValueError: If the file contents are not a valid integer.
    """
    # Use a context manager so the file handle is closed immediately rather
    # than whenever the garbage collector happens to reclaim it.
    with open(CPU_USAGE_PATH) as usage_file:
        return int(usage_file.read())
def _system_usage():
    """
    Computes total CPU usage of the host in nanoseconds.

    Logic taken from here:
    https://github.com/moby/moby/blob/b42ac8d370a8ef8ec720dff0ca9dfb3530ac0a6a/daemon/stats/collector_unix.go#L31

    See also the /proc/stat entry here:
    https://man7.org/linux/man-pages/man5/proc.5.html

    Returns:
        The sum of the first seven counters on the aggregate "cpu" line of
        PROC_STAT_PATH, converted from clock ticks to nanoseconds.

    Raises:
        OSError: If PROC_STAT_PATH cannot be opened or read.
        ValueError: If the first line is not the aggregate "cpu" line or
            one of its counters is not an integer.
    """  # noqa
    # Only the first line is needed, so read just that line, and close the
    # handle deterministically.
    with open(PROC_STAT_PATH) as stat_file:
        cpu_summary_str = stat_file.readline()
    parts = cpu_summary_str.split()
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not parts or parts[0] != "cpu":
        raise ValueError(
            "Unexpected first line in {}: {!r}".format(
                PROC_STAT_PATH, cpu_summary_str))
    # The seven counters that follow the "cpu" label.
    usage_data = parts[1:8]
    total_clock_ticks = sum(int(entry) for entry in usage_data)
    # 100 clock ticks per second, 10^9 ns per second
    usage_ns = total_clock_ticks * 10**7
    return usage_ns
def _host_num_cpus():
    """Number of host CPUs, obtained by parsing /proc/stat.

    The value is computed once and cached in the module-level
    ``host_num_cpus`` variable; later calls return the cached value without
    touching the file.

    Returns:
        The number of lines in PROC_STAT_PATH whose first word contains
        "cpu", minus one for the aggregate summary line.

    Raises:
        OSError: If PROC_STAT_PATH cannot be opened or read (first call
            only).
    """
    global host_num_cpus
    if host_num_cpus is None:
        # Context manager guarantees the file handle is closed.
        with open(PROC_STAT_PATH) as stat_file:
            proc_stat_lines = stat_file.read().split("\n")
        split_proc_stat_lines = [line.split() for line in proc_stat_lines]
        cpu_lines = [
            split_line for split_line in split_proc_stat_lines
            if len(split_line) > 0 and "cpu" in split_line[0]
        ]
        # Number of lines starting with a word including 'cpu', subtracting
        # 1 for the first summary line.
        host_num_cpus = len(cpu_lines) - 1
    return host_num_cpus