forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path tester_container.py
269 lines (248 loc) · 9.95 KB
/
tester_container.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import json
import os
import platform
import random
import shutil
import string
import subprocess
from typing import List, Tuple, Optional
from os import path, listdir
from ci.ray_ci.utils import shard_tests, chunk_into_n
from ci.ray_ci.utils import logger
from ci.ray_ci.container import Container
from ray_release.test import TestResult, Test
from ray_release.test_automation.ci_state_machine import CITestStateMachine
from ray_release.configs.global_config import get_global_config
# Number of times each target is executed when running flaky tests; forwarded
# to bazel as --runs_per_test in _run_tests_in_docker.
RUN_PER_FLAKY_TEST = 2
class TesterContainer(Container):
    """
    A wrapper for running tests in ray ci docker container.

    Tests are sharded across multiple docker containers (one per shard id),
    each container running its shard serially under bazel. Bazel event logs
    are written to a directory shared between the containers via the artifact
    mount, then parsed to persist per-test results.
    """

    def __init__(
        self,
        shard_count: int = 1,
        gpus: int = 0,
        bazel_log_dir: str = "/tmp",
        network: Optional[str] = None,
        test_envs: Optional[List[str]] = None,
        shard_ids: Optional[List[int]] = None,
        skip_ray_installation: bool = False,
        build_type: Optional[str] = None,
    ) -> None:
        """
        :param gpus: Number of gpus to use in the container. If 0, use all gpus.
        :param shard_count: The number of shards to split the tests into. This can be
            used to run tests in a distributed fashion.
        :param shard_ids: The list of shard ids to run. If none, run no shards.
        :param bazel_log_dir: Directory inside the test container where bazel
            event logs are written (mounted from the host).
        :param network: Optional docker network passed to the run command.
        :param test_envs: Environment variables forwarded to bazel via --test_env.
        :param skip_ray_installation: If True, do not (re)install ray in the image.
        :param build_type: Build flavor (e.g. "debug", "asan", "ubsan", ...);
            selects extra bazel configs in _run_tests_in_docker.
        """
        self.bazel_log_dir = bazel_log_dir
        self.shard_count = shard_count
        self.shard_ids = shard_ids or []
        self.test_envs = test_envs or []
        self.build_type = build_type
        self.network = network
        self.gpus = gpus

        # GPUs are divided among shards (see run_tests), so a non-zero gpu
        # count must provide at least one gpu per shard.
        assert (
            self.gpus == 0 or self.gpus >= self.shard_count
        ), f"Not enough gpus ({self.gpus} provided) for {self.shard_count} shards"

        if not skip_ray_installation:
            self.install_ray(build_type)

    def _create_bazel_log_mount(self, tmp_dir: Optional[str] = None) -> Tuple[str, str]:
        """
        Create a temporary directory in the current container to store bazel event logs
        produced by the test runs. We do this by using the artifact mount directory from
        the host machine as a shared directory between all containers.

        :param tmp_dir: Name of the sub-directory to create under the artifact
            mount; a random 5-letter lowercase name is generated when omitted.
        :return: Tuple of (host path, container path) referring to the same
            shared directory.
        """
        tmp_dir = tmp_dir or "".join(
            random.choice(string.ascii_lowercase) for _ in range(5)
        )
        artifact_host, artifact_container = self.get_artifact_mount()
        bazel_log_dir_host = os.path.join(artifact_host, tmp_dir)
        bazel_log_dir_container = os.path.join(artifact_container, tmp_dir)
        # NOTE(review): os.mkdir raises FileExistsError on a name collision of
        # the random 5-letter dir -- presumably acceptable for CI; confirm.
        os.mkdir(bazel_log_dir_container)
        return (bazel_log_dir_host, bazel_log_dir_container)

    def run_tests(
        self,
        team: str,
        test_targets: List[str],
        test_arg: Optional[str] = None,
        is_bisect_run: bool = False,
        run_flaky_tests: bool = False,
    ) -> bool:
        """
        Run tests parallelly in docker. Return whether all tests pass.

        :param team: Owning team name, used when persisting test results.
        :param test_targets: Bazel test targets to run.
        :param test_arg: Optional value forwarded to bazel as --test_arg.
        :param is_bisect_run: If True, test results are not uploaded.
        :param run_flaky_tests: If True, run each target RUN_PER_FLAKY_TEST
            times via --runs_per_test.
        """
        # shard tests and remove empty chunks
        chunks = list(
            filter(
                len,
                [
                    shard_tests(test_targets, self.shard_count, i)
                    for i in self.shard_ids
                ],
            )
        )
        if not chunks:
            # no tests to run
            return True

        # divide gpus evenly among chunks
        gpu_ids = chunk_into_n(list(range(self.gpus)), len(chunks))
        bazel_log_dir_host, bazel_log_dir_container = self._create_bazel_log_mount()
        # Launch one docker process per chunk, then wait for all of them to
        # finish before collecting results.
        runs = [
            self._run_tests_in_docker(
                chunks[i],
                gpu_ids[i],
                bazel_log_dir_host,
                self.test_envs,
                test_arg,
                run_flaky_tests,
            )
            for i in range(len(chunks))
        ]
        exits = [run.wait() for run in runs]
        self._persist_test_results(team, bazel_log_dir_container, is_bisect_run)
        self._cleanup_bazel_log_mount(bazel_log_dir_container)
        return all(exit == 0 for exit in exits)

    def _persist_test_results(
        self, team: str, bazel_log_dir: str, is_bisect_run: bool = False
    ) -> None:
        """
        Upload build info and test results parsed from the bazel event logs,
        unless this run should not be reported: bisect runs, pipelines other
        than the configured branch/PR pipelines, or non-master branches on a
        branch (postmerge) pipeline.
        """
        pipeline_id = os.environ.get("BUILDKITE_PIPELINE_ID")
        branch = os.environ.get("BUILDKITE_BRANCH")
        branch_pipelines = get_global_config()["ci_pipeline_postmerge"]
        pr_pipelines = get_global_config()["ci_pipeline_premerge"]

        if is_bisect_run:
            logger.info(
                "Skip upload test results. We do not upload results on bisect runs."
            )
            return
        if pipeline_id not in branch_pipelines + pr_pipelines:
            logger.info(
                "Skip upload test results. "
                "We only upload results on branch and PR pipelines",
            )
            return
        if pipeline_id in branch_pipelines and branch != "master":
            logger.info(
                "Skip upload test results. "
                "We only upload the master branch results on a branch pipeline",
            )
            return

        self._upload_build_info(bazel_log_dir)
        TesterContainer.upload_test_results(team, bazel_log_dir)
        TesterContainer.move_test_state(team, bazel_log_dir)

    def _upload_build_info(self, bazel_log_dir) -> None:
        """Upload the bazel test logs via the upload_build_info.sh script."""
        logger.info("Uploading bazel test logs")
        subprocess.check_call(
            [
                "bash",
                "ci/build/upload_build_info.sh",
                bazel_log_dir,
            ]
        )

    @classmethod
    def upload_test_results(cls, team: str, bazel_log_dir: str) -> None:
        """
        Persist each parsed test and its latest result to S3.
        """
        for test, result in cls.get_test_and_results(team, bazel_log_dir):
            logger.info(f"Test {test.get_name()} run status is {result.status}")
            # Refresh from S3 before persisting so we do not clobber state
            # written by other runs.
            test.update_from_s3()
            test.persist_to_s3()
            test.persist_test_result_to_s3(result)

    @classmethod
    def move_test_state(cls, team: str, bazel_log_dir: str) -> None:
        """
        Advance the CI state machine for each test, but only on the master
        branch of a postmerge pipeline (and only when the state machine is
        enabled in the global config).
        """
        if get_global_config()["state_machine_disabled"]:
            return

        pipeline_id = os.environ.get("BUILDKITE_PIPELINE_ID")
        branch = os.environ.get("BUILDKITE_BRANCH")
        if (
            pipeline_id not in get_global_config()["ci_pipeline_postmerge"]
            or branch != "master"
        ):
            logger.info("Skip updating test state. We only update on master branch.")
            return
        for test, _ in cls.get_test_and_results(team, bazel_log_dir):
            logger.info(f"Updating test state for {test.get_name()}")
            test.update_from_s3()
            logger.info(f"\tOld state: {test.get_state()}")
            CITestStateMachine(test).move()
            test.persist_to_s3()
            logger.info(f"\tNew state: {test.get_state()}")

    @classmethod
    def get_test_and_results(
        cls, team, bazel_log_dir: str
    ) -> List[Tuple[Test, TestResult]]:
        """
        Parse bazel event logs (files named bazel_log*) in bazel_log_dir and
        return one (Test, TestResult) pair per test target and run id, keeping
        only the last result seen for each in case the test was retried.
        """
        bazel_logs = []
        # Find all bazel logs
        for file in listdir(bazel_log_dir):
            log = path.join(bazel_log_dir, file)
            if path.isfile(log) and file.startswith("bazel_log"):
                bazel_logs.append(log)

        tests = {}
        # Parse bazel logs and print test results
        for file in bazel_logs:
            with open(file, "rb") as f:
                # Each line is one JSON build event; only testResult events
                # carry per-test outcomes.
                for line in f:
                    event = json.loads(line.decode("utf-8"))
                    if "testResult" not in event:
                        continue
                    run_id = event["id"]["testResult"]["run"]
                    test = Test.from_bazel_event(event, team)
                    test_result = TestResult.from_bazel_event(event)
                    # Obtain only the final test result for a given test and run
                    # in case the test is retried.
                    tests[f"{run_id}-{test.get_name()}"] = (test, test_result)

        return list(tests.values())

    def _cleanup_bazel_log_mount(self, bazel_log_dir: str) -> None:
        """Remove the shared bazel log directory and everything in it."""
        shutil.rmtree(bazel_log_dir)

    def _run_tests_in_docker(
        self,
        test_targets: List[str],
        gpu_ids: List[int],
        bazel_log_dir_host: str,
        test_envs: List[str],
        test_arg: Optional[str] = None,
        run_flaky_tests: bool = False,
    ) -> subprocess.Popen:
        """
        Start one docker container running the given test targets under bazel
        and return the Popen handle without waiting for it.

        :param test_targets: Bazel targets for this shard.
        :param gpu_ids: GPU ids assigned to this shard's container.
        :param bazel_log_dir_host: Host-side path mounted into the container
            at self.bazel_log_dir for bazel event logs.
        :param test_envs: Environment variables forwarded via --test_env.
        :param test_arg: Optional value forwarded via --test_arg.
        :param run_flaky_tests: If True, add --runs_per_test RUN_PER_FLAKY_TEST.
        """
        logger.info("Running tests: %s", test_targets)
        # Make the bazel logs readable from outside the container on exit,
        # regardless of how the test command terminates.
        commands = [
            f'cleanup() {{ chmod -R a+r "{self.bazel_log_dir}"; }}',
            "trap cleanup EXIT",
        ]
        if platform.system() == "Windows":
            # allow window tests to access aws services
            commands.append(
                "powershell ci/pipeline/fix-windows-container-networking.ps1"
            )

        if self.build_type == "ubsan":
            # clang currently runs into problems with ubsan builds, this will revert to
            # using GCC instead.
            commands.append("unset CC CXX")

        # note that we run tests serially within each docker, since we already use
        # multiple dockers to shard tests
        test_cmd = "bazel test --jobs=1 --config=ci $(./ci/run/bazel_export_options) "
        # Append extra bazel configs depending on the build flavor.
        if self.build_type == "debug":
            test_cmd += "--config=ci-debug "
        if self.build_type == "asan":
            test_cmd += "--config=asan --config=asan-buildkite "
        if self.build_type == "clang":
            test_cmd += "--config=llvm "
        if self.build_type == "asan-clang":
            test_cmd += "--config=asan-clang "
        if self.build_type == "ubsan":
            test_cmd += "--config=ubsan "
        if self.build_type == "tsan-clang":
            test_cmd += "--config=tsan-clang "
        for env in test_envs:
            test_cmd += f"--test_env {env} "
        if test_arg:
            test_cmd += f"--test_arg {test_arg} "
        if run_flaky_tests:
            test_cmd += f"--runs_per_test {RUN_PER_FLAKY_TEST} "
        test_cmd += f"{' '.join(test_targets)}"
        commands.append(test_cmd)
        return subprocess.Popen(
            self.get_run_command(
                commands,
                network=self.network,
                gpu_ids=gpu_ids,
                volumes=[f"{bazel_log_dir_host}:{self.bazel_log_dir}"],
            )
        )