Commit 7db1f3b

[tune] resume=False by default but print a tip to set resume="prompt" + jenkins fix (ray-project#3681)
ericl authored and richardliaw committed Jan 5, 2019
1 parent 747b117 commit 7db1f3b
Showing 5 changed files with 13 additions and 22 deletions.
doc/source/tune-usage.rst: 2 changes (1 addition, 1 deletion)
@@ -299,7 +299,7 @@ of a trial, you can additionally set the checkpoint_at_end to True. An example i
 Recovering From Failures (Experimental)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Tune automatically persists the progress of your experiments, so if an experiment crashes or is otherwise cancelled, it can be resumed after prompting. The default setting of `resume=None` will cause Tune to prompt you for whether you want to resume. Prompting can be turned off with ``resume=True``. If ``resume=False``, a new experiment will be created instead. You can always force a new experiment to be created by changing the experiment name.
+Tune automatically persists the progress of your experiments, so if an experiment crashes or is otherwise cancelled, it can be resumed with ``resume=True``. The default setting of ``resume=False`` creates a new experiment, and ``resume="prompt"`` will cause Tune to prompt you for whether you want to resume. You can always force a new experiment to be created by changing the experiment name.

 Note that trials will be restored to their last checkpoint. If trial checkpointing is not enabled, unfinished trials will be restarted from scratch.

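As context for the doc change above, a minimal sketch of how the three resume settings look at the call site. This example is not part of the commit; "train_fn" is a placeholder name for a registered trainable.

```python
from ray import tune

# resume=False (the new default): always start a fresh experiment.
# resume=True: restore from an existing experiment checkpoint without asking.
# resume="prompt": ask interactively before restoring.
tune.run_experiments(
    {
        "my_experiment": {
            "run": "train_fn",  # placeholder trainable name
            "stop": {"training_iteration": 5},
        }
    },
    resume="prompt",
)
```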
python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py: 2 changes (1 addition, 1 deletion)
@@ -82,4 +82,4 @@ def _value(self, obs):
         with self.lock:
             obs = torch.from_numpy(obs).float().unsqueeze(0)
             _, _, vf, _ = self.model({"obs": obs}, [])
-            return vf.numpy().squeeze()
+            return vf.detach().numpy().squeeze()
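The .detach() fix matters because PyTorch refuses to convert a tensor that is still attached to the autograd graph. A standalone illustration in plain PyTorch, independent of RLlib:

```python
import torch

x = torch.ones(3, requires_grad=True)
vf = (x * 2).sum()  # still tracked by autograd, like the value output above
# vf.numpy()  # would raise: "Can't call numpy() on Tensor that requires grad"
print(vf.detach().numpy())  # detach() returns a graph-free tensor first; prints 6.0
```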
python/ray/tune/test/cluster_tests.py: 2 changes (0 additions, 2 deletions)
@@ -62,7 +62,6 @@ def _start_new_cluster():
 @pytest.fixture
 def start_connected_cluster():
     # Start the Ray processes.
-    os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
     cluster = _start_new_cluster()
     yield cluster
     # The code after the yield will run as teardown code.
@@ -74,7 +73,6 @@ def start_connected_cluster():
 def start_connected_emptyhead_cluster():
     """Starts head with no resources."""

-    os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
     cluster = Cluster(
         initialize_head=True,
         connect=True,
python/ray/tune/test/trial_runner_test.py: 6 changes (0 additions, 6 deletions)
@@ -39,7 +39,6 @@

 class TrainableFunctionApiTest(unittest.TestCase):
     def setUp(self):
-        os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
         ray.init(num_cpus=4, num_gpus=0)

     def tearDown(self):
@@ -545,7 +544,6 @@ def _restore(self, state):

 class RunExperimentTest(unittest.TestCase):
     def setUp(self):
-        os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
         ray.init()

     def tearDown(self):
@@ -759,7 +757,6 @@ def sync_func(local, remote):

 class VariantGeneratorTest(unittest.TestCase):
     def setUp(self):
-        os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
         ray.init()

     def tearDown(self):
@@ -963,9 +960,6 @@ def on_trial_complete(self, trial_id, error=False, **kwargs):


 class TrialRunnerTest(unittest.TestCase):
-    def setUp(self):
-        os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
-
     def tearDown(self):
         ray.shutdown()
         _register_all()  # re-register the evicted objects
python/ray/tune/tune.py: 23 changes (11 additions, 12 deletions)
@@ -58,7 +58,7 @@ def run_experiments(experiments,
                     with_server=False,
                     server_port=TuneServer.DEFAULT_PORT,
                     verbose=True,
-                    resume=None,
+                    resume=False,
                     queue_trials=False,
                     trial_executor=None,
                     raise_on_failed_trial=True):
@@ -76,8 +76,8 @@
             using the Client API.
         server_port (int): Port number for launching TuneServer.
         verbose (bool): How much output should be printed for each trial.
-        resume (bool|None): If checkpoint exists, the experiment will
-            resume from there. If resume is None, Tune will prompt if
+        resume (bool|"prompt"): If checkpoint exists, the experiment will
+            resume from there. If resume is "prompt", Tune will prompt if
             checkpoint detected.
         queue_trials (bool): Whether to queue trials when the cluster does
             not currently have enough resources to launch one. This should
@@ -116,25 +116,24 @@
     runner = None
     restore = False

-    # TUNE_RESUME_PROMPT_OFF is for testing purposes and defaults
-    # `resume=False.`
-    if os.environ.get("TUNE_RESUME_PROMPT_OFF"):
-        resume = resume or False
-
     if os.path.exists(
             os.path.join(checkpoint_dir, TrialRunner.CKPT_FILE_NAME)):
-        if resume:
-            restore = True
-        elif resume is None:
+        if resume == "prompt":
             msg = ("Found incomplete experiment at {}. "
                    "Would you like to resume it?".format(checkpoint_dir))
-            restore = click.confirm(msg, default=True)
+            restore = click.confirm(msg, default=False)
             if restore:
                 logger.info("Tip: to always resume, "
                             "pass resume=True to run_experiments()")
             else:
                 logger.info("Tip: to always start a new experiment, "
                             "pass resume=False to run_experiments()")
+        elif resume:
+            restore = True
+        else:
+            logger.info(
+                "Tip: to resume incomplete experiments, "
+                "pass resume='prompt' or resume=True to run_experiments()")
     else:
         logger.info(
             "Did not find checkpoint file in {}.".format(checkpoint_dir))
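One subtlety in the new branch ordering above: resume == "prompt" must be tested before the bare truthiness check, because any non-empty string is truthy in Python, so `if resume:` alone would silently resume instead of prompting. A quick illustration:

```python
# "prompt" is a non-empty string, hence truthy: order of the checks matters.
resume = "prompt"
assert bool(resume) is True   # would already satisfy a bare `if resume:` test
assert resume == "prompt"     # so the string comparison has to come first
```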
