Skip to content

Commit

Permalink
[tune] Fix flaky test_controller_checkpointing_integration test suite (ray-project#43880)
Browse files Browse the repository at this point in the history

Signed-off-by: Justin Yu <[email protected]>
  • Loading branch information
justinvyu authored Mar 11, 2024
1 parent d34e818 commit 7e71789
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_callback_save_restore(
runner._callbacks.on_trial_result(
iteration=i, trials=None, trial=None, result=None
)
- runner.checkpoint(force=True)
+ runner.checkpoint(force=True, wait=True)
callback = StatefulCallback()
runner2 = TuneController(callbacks=[callback], storage=storage)
assert callback.counter == 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def get_checkpoint_dirs(trial: Trial):
assert len(cp_dirs) == 2, f"Checkpoint dirs: {cp_dirs}"

# Re-instantiate trial runner and resume
- runner.checkpoint(force=True)
+ runner.checkpoint(force=True, wait=True)
runner = TuneController(
resource_manager_factory=lambda: resource_manager_cls(),
storage=STORAGE,
Expand Down Expand Up @@ -427,9 +427,7 @@ def test_checkpoint_user_checkpoint(
{"TUNE_RESULT_BUFFER_LENGTH": "1", "TUNE_MAX_PENDING_TRIALS_PG": "1"},
):
runner = TuneController(
- resource_manager_factory=lambda: resource_manager_cls(),
- storage=STORAGE,
- checkpoint_period=0,
+ resource_manager_factory=lambda: resource_manager_cls(), storage=STORAGE
)
runner.add_trial(
Trial("__fake", config={"user_checkpoint_freq": 2}, storage=STORAGE)
Expand All @@ -452,6 +450,7 @@ def test_checkpoint_user_checkpoint(
runner.step()

assert trials[0].has_checkpoint()
+ runner.checkpoint(force=True, wait=True)

runner2 = TuneController(
resource_manager_factory=lambda: resource_manager_cls(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def test_controller_restore_no_error_resume(
while not runner.is_finished():
runner.step()

- runner.checkpoint(force=True)
+ runner.checkpoint(force=True, wait=True)

assert trials[0].status == Trial.ERROR
del runner
Expand Down Expand Up @@ -195,7 +195,7 @@ def test_controller_restore_error_only_resume(
while not runner.is_finished():
runner.step()

- runner.checkpoint(force=True)
+ runner.checkpoint(force=True, wait=True)

assert trials[0].status == Trial.ERROR
del runner
Expand Down Expand Up @@ -508,7 +508,7 @@ def create_trial_config():
)
runner.add_trial(trial)
# Req: TrialRunner checkpointing shouldn't error
- runner.checkpoint(force=True)
+ runner.checkpoint(force=True, wait=True)

# Manually clear all block refs that may have been created
ray.shutdown()
Expand Down
2 changes: 1 addition & 1 deletion python/ray/tune/tests/tune_test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def create_tune_experiment_checkpoint(trials: list, **runner_kwargs) -> str:
for trial in trials:
runner.add_trial(trial)

- runner.checkpoint(force=True)
+ runner.checkpoint(force=True, wait=True)
finally:
os.environ.clear()
os.environ.update(orig_env)
Expand Down

0 comments on commit 7e71789

Please sign in to comment.