Commit 7db1f3b

[tune] resume=False by default but print a tip to set resume="prompt" + jenkins fix (ray-project#3681)
ericl authored and richardliaw committed Jan 5, 2019
1 parent 747b117 commit 7db1f3b
Showing 5 changed files with 13 additions and 22 deletions.
doc/source/tune-usage.rst: 2 changes (1 addition, 1 deletion)
@@ -299,7 +299,7 @@ of a trial, you can additionally set the checkpoint_at_end to True. An example i
 Recovering From Failures (Experimental)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Tune automatically persists the progress of your experiments, so if an experiment crashes or is otherwise cancelled, it can be resumed after prompting. The default setting of `resume=None` will cause Tune to prompt you for whether you want to resume. Prompting can be turned off with ``resume=True``. If ``resume=False``, a new experiment will be created instead. You can always force a new experiment to be created by changing the experiment name.
+Tune automatically persists the progress of your experiments, so if an experiment crashes or is otherwise cancelled, it can be resumed with ``resume=True``. The default setting of ``resume=False`` creates a new experiment, and ``resume="prompt"`` will cause Tune to prompt you for whether you want to resume. You can always force a new experiment to be created by changing the experiment name.

 Note that trials will be restored to their last checkpoint. If trial checkpointing is not enabled, unfinished trials will be restarted from scratch.

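As context for the doc change above, a minimal sketch of how the three resume settings look at the call site. This example is not part of the commit; "train_fn" is a placeholder name for a registered trainable.

```python
from ray import tune

# resume=False (the new default): always start a fresh experiment.
# resume=True: restore from an existing experiment checkpoint without asking.
# resume="prompt": ask interactively before restoring.
tune.run_experiments(
    {
        "my_experiment": {
            "run": "train_fn",  # placeholder trainable name
            "stop": {"training_iteration": 5},
        }
    },
    resume="prompt",
)
```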
python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py: 2 changes (1 addition, 1 deletion)
@@ -82,4 +82,4 @@ def _value(self, obs):
         with self.lock:
             obs = torch.from_numpy(obs).float().unsqueeze(0)
             _, _, vf, _ = self.model({"obs": obs}, [])
-            return vf.numpy().squeeze()
+            return vf.detach().numpy().squeeze()
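The .detach() fix matters because PyTorch refuses to convert a tensor that is still attached to the autograd graph. A standalone illustration in plain PyTorch, independent of RLlib:

```python
import torch

x = torch.ones(3, requires_grad=True)
vf = (x * 2).sum()  # still tracked by autograd, like the value output above
# vf.numpy()  # would raise: "Can't call numpy() on Tensor that requires grad"
print(vf.detach().numpy())  # detach() returns a graph-free tensor first; prints 6.0
```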
python/ray/tune/test/cluster_tests.py: 2 changes (0 additions, 2 deletions)
@@ -62,7 +62,6 @@ def _start_new_cluster():
 @pytest.fixture
 def start_connected_cluster():
     # Start the Ray processes.
-    os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
     cluster = _start_new_cluster()
     yield cluster
     # The code after the yield will run as teardown code.
@@ -74,7 +73,6 @@ def start_connected_cluster():
 def start_connected_emptyhead_cluster():
     """Starts head with no resources."""

-    os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
     cluster = Cluster(
         initialize_head=True,
         connect=True,
python/ray/tune/test/trial_runner_test.py: 6 changes (0 additions, 6 deletions)
@@ -39,7 +39,6 @@

 class TrainableFunctionApiTest(unittest.TestCase):
     def setUp(self):
-        os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
         ray.init(num_cpus=4, num_gpus=0)

     def tearDown(self):
@@ -545,7 +544,6 @@ def _restore(self, state):

 class RunExperimentTest(unittest.TestCase):
     def setUp(self):
-        os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
         ray.init()

     def tearDown(self):
@@ -759,7 +757,6 @@ def sync_func(local, remote):

 class VariantGeneratorTest(unittest.TestCase):
     def setUp(self):
-        os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
         ray.init()

     def tearDown(self):
@@ -963,9 +960,6 @@ def on_trial_complete(self, trial_id, error=False, **kwargs):


 class TrialRunnerTest(unittest.TestCase):
-    def setUp(self):
-        os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
-
     def tearDown(self):
         ray.shutdown()
         _register_all()  # re-register the evicted objects
python/ray/tune/tune.py: 23 changes (11 additions, 12 deletions)
@@ -58,7 +58,7 @@ def run_experiments(experiments,
                     with_server=False,
                     server_port=TuneServer.DEFAULT_PORT,
                     verbose=True,
-                    resume=None,
+                    resume=False,
                     queue_trials=False,
                     trial_executor=None,
                     raise_on_failed_trial=True):
@@ -76,8 +76,8 @@
             using the Client API.
         server_port (int): Port number for launching TuneServer.
         verbose (bool): How much output should be printed for each trial.
-        resume (bool|None): If checkpoint exists, the experiment will
-            resume from there. If resume is None, Tune will prompt if
+        resume (bool|"prompt"): If checkpoint exists, the experiment will
+            resume from there. If resume is "prompt", Tune will prompt if
             checkpoint detected.
         queue_trials (bool): Whether to queue trials when the cluster does
             not currently have enough resources to launch one. This should
@@ -116,25 +116,24 @@
     runner = None
     restore = False

-    # TUNE_RESUME_PROMPT_OFF is for testing purposes and defaults
-    # `resume=False.`
-    if os.environ.get("TUNE_RESUME_PROMPT_OFF"):
-        resume = resume or False
-
     if os.path.exists(
             os.path.join(checkpoint_dir, TrialRunner.CKPT_FILE_NAME)):
-        if resume:
-            restore = True
-        elif resume is None:
+        if resume == "prompt":
             msg = ("Found incomplete experiment at {}. "
                    "Would you like to resume it?".format(checkpoint_dir))
-            restore = click.confirm(msg, default=True)
+            restore = click.confirm(msg, default=False)
             if restore:
                 logger.info("Tip: to always resume, "
                             "pass resume=True to run_experiments()")
             else:
                 logger.info("Tip: to always start a new experiment, "
                             "pass resume=False to run_experiments()")
+        elif resume:
+            restore = True
+        else:
+            logger.info(
+                "Tip: to resume incomplete experiments, "
+                "pass resume='prompt' or resume=True to run_experiments()")
     else:
         logger.info(
             "Did not find checkpoint file in {}.".format(checkpoint_dir))
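One subtlety in the new branch ordering above: resume == "prompt" must be tested before the bare truthiness check, because any non-empty string is truthy in Python, so `if resume:` alone would silently resume instead of prompting. A quick illustration:

```python
# "prompt" is a non-empty string, hence truthy: order of the checks matters.
resume = "prompt"
assert bool(resume) is True   # would already satisfy a bare `if resume:` test
assert resume == "prompt"     # so the string comparison has to come first
```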
