[tune] Clean up result logging: move out of /tmp, add timestamp (ray-project#1297)
zzcThomas · Dec 15, 2017 · fbf1806 · fbf1806
1 parent 12fdb3f
commit fbf1806
Show file tree

Hide file tree

Showing 11 changed files with 64 additions and 26 deletions.
diff --git a/doc/source/example-a3c.rst b/doc/source/example-a3c.rst
@@ -153,6 +153,6 @@ workers, we can train the agent in around 25 minutes.
 
 You can visualize performance by running
 :code:`tensorboard --logdir [directory]` in a separate screen, where
-:code:`[directory]` is defaulted to :code:`/tmp/ray/`. If you are running
+:code:`[directory]` defaults to :code:`~/ray_results/`. If you are running
 multiple experiments, be sure to vary the directory to which Tensorflow saves
 its progress (found in :code:`a3c.py`).
diff --git a/doc/source/example-policy-gradient.rst b/doc/source/example-policy-gradient.rst
@@ -28,7 +28,7 @@ TensorBoard to the log output directory as follows.
 
 .. code-block:: bash
 
-  tensorboard --logdir=/tmp/ray
+  tensorboard --logdir=~/ray_results
 
 Many of the TensorBoard metrics are also printed to the console, but you might
 find it easier to visualize and compare between runs using the TensorBoard UI.

diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
@@ -59,15 +59,15 @@ You can train a simple DQN agent with the following command
 
     python ray/python/ray/rllib/train.py --run DQN --env CartPole-v0
 
-By default, the results will be logged to a subdirectory of ``/tmp/ray``.
+By default, the results will be logged to a subdirectory of ``~/ray_results``.
 This subdirectory will contain a file ``params.json`` which contains the
 hyperparameters, a file ``result.json`` which contains a training summary
 for each episode and a TensorBoard file that can be used to visualize
 training process with TensorBoard by running
 
 ::
 
-     tensorboard --logdir=/tmp/ray
+     tensorboard --logdir=~/ray_results
 
 
 The ``train.py`` script has a number of options you can show by running

diff --git a/doc/source/tune.rst b/doc/source/tune.rst
@@ -50,7 +50,7 @@ This script runs a small grid search over the ``my_func`` function using ray.tun
     == Status ==
     Using FIFO scheduling algorithm.
     Resources used: 4/8 CPUs, 0/0 GPUs
-    Result logdir: /tmp/ray/my_experiment
+    Result logdir: ~/ray_results/my_experiment
      - my_func_0_alpha=0.2,beta=1:	RUNNING [pid=6778], 209 s, 20604 ts, 7.29 acc
      - my_func_1_alpha=0.4,beta=1:	RUNNING [pid=6780], 208 s, 20522 ts, 53.1 acc
      - my_func_2_alpha=0.6,beta=1:	TERMINATED [pid=6789], 21 s, 2190 ts, 101 acc
@@ -63,14 +63,14 @@ In order to report incremental progress, ``my_func`` periodically calls the ``re
 Visualizing Results
 -------------------
 
-Ray.tune logs trial results to a unique directory per experiment, e.g. ``/tmp/ray/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:
+Ray.tune logs trial results to a unique directory per experiment, e.g. ``~/ray_results/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:
 
 To visualize learning in tensorboard, run:
 
 ::
 
     $ pip install tensorboard
-    $ tensorboard --logdir=/tmp/ray/my_experiment
+    $ tensorboard --logdir=~/ray_results/my_experiment
 
 .. image:: ray-tune-tensorboard.png
 
@@ -79,7 +79,7 @@ To use rllab's VisKit (you may have to install some dependencies), run:
 ::
 
     $ git clone https://github.com/rll/rllab.git
-    $ python rllab/rllab/viskit/frontend.py /tmp/ray/my_experiment
+    $ python rllab/rllab/viskit/frontend.py ~/ray_results/my_experiment
 
 .. image:: ray-tune-viskit.png
 

diff --git a/python/ray/rllib/agent.py b/python/ray/rllib/agent.py
@@ -18,7 +18,7 @@
 import tensorflow as tf
 from ray.tune.logger import UnifiedLogger
 from ray.tune.registry import ENV_CREATOR
-from ray.tune.result import TrainingResult
+from ray.tune.result import DEFAULT_RESULTS_DIR, TrainingResult
 from ray.tune.trainable import Trainable
 
 logger = logging.getLogger(__name__)
@@ -72,7 +72,6 @@ class Agent(Trainable):
 
     _allow_unknown_configs = False
     _allow_unknown_subkeys = []
-    _default_logdir = "/tmp/ray"
 
     def __init__(
             self, config={}, env=None, registry=None, logger_creator=None):
@@ -111,10 +110,10 @@ def __init__(
             logdir_suffix = "{}_{}_{}".format(
                 env, self._agent_name,
                 datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
-            if not os.path.exists(self._default_logdir):
-                os.makedirs(self._default_logdir)
+            if not os.path.exists(DEFAULT_RESULTS_DIR):
+                os.makedirs(DEFAULT_RESULTS_DIR)
             self.logdir = tempfile.mkdtemp(
-                prefix=logdir_suffix, dir=self._default_logdir)
+                prefix=logdir_suffix, dir=DEFAULT_RESULTS_DIR)
             self._result_logger = UnifiedLogger(self.config, self.logdir, None)
 
         self._iteration = 0
@@ -155,8 +154,11 @@ def train(self):
         self._time_total += time_this_iter
         self._timesteps_total += result.timesteps_this_iter
 
+        now = datetime.today()
         result = result._replace(
             experiment_id=self._experiment_id,
+            date=now.strftime("%Y-%m-%d_%H-%M-%S"),
+            timestamp=int(time.mktime(now.timetuple())),
             training_iteration=self._iteration,
             timesteps_total=self._timesteps_total,
             time_this_iter_s=time_this_iter,

diff --git a/python/ray/rllib/train.py b/python/ray/rllib/train.py
@@ -57,7 +57,7 @@
     else:
         # Note: keep this in sync with tune/config_parser.py
         experiments = {
-            args.experiment_name: {  # i.e. log to /tmp/ray/default
+            args.experiment_name: {  # i.e. log to ~/ray_results/default
                 "run": args.run,
                 "checkpoint_freq": args.checkpoint_freq,
                 "local_dir": args.local_dir,

diff --git a/python/ray/tune/ParallelCoordinatesVisualization.ipynb b/python/ray/tune/ParallelCoordinatesVisualization.ipynb
@@ -24,6 +24,7 @@
    },
    "outputs": [],
    "source": [
+    "import os\n",
     "import pandas as pd\n",
     "from ray.tune.visual_utils import load_results_to_df, generate_plotly_dim_dict\n",
     "import plotly\n",
@@ -46,7 +47,7 @@
    },
    "outputs": [],
    "source": [
-    "RESULTS_DIR = \"/tmp/ray/\"\n",
+    "RESULTS_DIR = os.path.expanduser(\"~/ray_results\")\n",
     "df = load_results_to_df(RESULTS_DIR)\n",
     "[key for key in df]"
    ]

diff --git a/python/ray/tune/config_parser.py b/python/ray/tune/config_parser.py
@@ -7,6 +7,7 @@
 import json
 
 from ray.tune import TuneError
+from ray.tune.result import DEFAULT_RESULTS_DIR
 from ray.tune.trial import Resources
 
 
@@ -63,8 +64,9 @@ def make_parser(**kwargs):
         "--repeat", default=1, type=int,
         help="Number of times to repeat each trial.")
     parser.add_argument(
-        "--local-dir", default="/tmp/ray", type=str,
-        help="Local dir to save training results to. Defaults to '/tmp/ray'.")
+        "--local-dir", default=DEFAULT_RESULTS_DIR, type=str,
+        help="Local dir to save training results to. Defaults to '{}'.".format(
+            DEFAULT_RESULTS_DIR))
     parser.add_argument(
         "--upload-dir", default="", type=str,
         help="Optional URI to upload training results to.")

diff --git a/python/ray/tune/result.py b/python/ray/tune/result.py
@@ -4,6 +4,7 @@
 
 from collections import namedtuple
 import json
+import os
 
 try:
     import yaml
@@ -20,6 +21,9 @@
 In RLlib, the supplied algorithms fill in TrainingResult for you.
 """
 
+# Where ray.tune writes result files by default
+DEFAULT_RESULTS_DIR = os.path.expanduser("~/ray_results")
+
 
 TrainingResult = namedtuple("TrainingResult", [
     # (Required) Accumulated timesteps for this entire experiment.
@@ -40,9 +44,12 @@
     # (Optional) The number of episodes total.
     "episodes_total",
 
-    # (Optional) The current training accuracy if applicable>
+    # (Optional) The current training accuracy if applicable.
     "mean_accuracy",
 
+    # (Optional) The current validation accuracy if applicable.
+    "mean_validation_accuracy",
+
     # (Optional) The current training loss if applicable.
     "mean_loss",
 
@@ -69,6 +76,12 @@
     # (Auto-filled) The pid of the training process.
     "pid",
 
+    # (Auto-filled) A formatted date of when the result was processed.
+    "date",
+
+    # (Auto-filled) A UNIX timestamp of when the result was processed.
+    "timestamp",
+
     # (Auto-filled) The hostname of the machine hosting the training process.
     "hostname",
 ])

diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py
@@ -2,6 +2,7 @@
 from __future__ import division
 from __future__ import print_function
 
+from datetime import datetime
 import tempfile
 import traceback
 import ray
@@ -10,7 +11,7 @@
 from collections import namedtuple
 from ray.tune import TuneError
 from ray.tune.logger import NoopLogger, UnifiedLogger
-from ray.tune.result import TrainingResult
+from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR
 from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS
 
 
@@ -62,7 +63,7 @@ class Trial(object):
     ERROR = "ERROR"
 
     def __init__(
-            self, trainable_name, config={}, local_dir='/tmp/ray',
+            self, trainable_name, config={}, local_dir=DEFAULT_RESULTS_DIR,
             experiment_tag=None, resources=Resources(cpu=1, gpu=0),
             stopping_criterion={}, checkpoint_freq=0,
             restore_path=None, upload_dir=None):
@@ -295,16 +296,22 @@ def _setup_runner(self):
             if not os.path.exists(self.local_dir):
                 os.makedirs(self.local_dir)
             self.logdir = tempfile.mkdtemp(
-                prefix=str(self), dir=self.local_dir)
+                prefix=str(self), dir=self.local_dir,
+                suffix=datetime.today().strftime("_%Y-%m-%d_%H-%M-%S"))
             self.result_logger = UnifiedLogger(
                 self.config, self.logdir, self.upload_dir)
         remote_logdir = self.logdir
+
+        def logger_creator(config):
+            # Set the working dir in the remote process, for user file writes
+            os.chdir(remote_logdir)
+            return NoopLogger(config, remote_logdir)
+
         # Logging for trials is handled centrally by TrialRunner, so
         # configure the remote runner to use a noop-logger.
         self.runner = cls.remote(
-            config=self.config,
-            registry=get_registry(),
-            logger_creator=lambda config: NoopLogger(config, remote_logdir))
+            config=self.config, registry=get_registry(),
+            logger_creator=logger_creator)
 
     def __str__(self):
         if "env" in self.config:

diff --git a/test/trial_runner_test.py b/test/trial_runner_test.py
@@ -12,6 +12,7 @@
 from ray.tune import Trainable, TuneError
 from ray.tune import register_env, register_trainable, run_experiments
 from ray.tune.registry import _default_registry, TRAINABLE_CLASS
+from ray.tune.result import DEFAULT_RESULTS_DIR
 from ray.tune.trial import Trial, Resources
 from ray.tune.trial_runner import TrialRunner
 from ray.tune.variant_generator import generate_trials, grid_search, \
@@ -63,6 +64,17 @@ def train(config, reporter):
             "config": {"a": "b"},
         }})
 
+    def testLogdir(self):
+        def train(config, reporter):
+            assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
+            reporter(timesteps_total=1)
+        register_trainable("f1", train)
+        run_experiments({"foo": {
+            "run": "f1",
+            "local_dir": "/tmp/logdir",
+            "config": {"a": "b"},
+        }})
+
     def testBadParams(self):
         def f():
             run_experiments({"foo": {}})
@@ -191,7 +203,9 @@ def testParseToTrials(self):
         self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"})
         self.assertEqual(trials[0].trainable_name, "PPO")
         self.assertEqual(trials[0].experiment_tag, "0")
-        self.assertEqual(trials[0].local_dir, "/tmp/ray/tune-pong")
+        self.assertEqual(
+            trials[0].local_dir,
+            os.path.join(DEFAULT_RESULTS_DIR, "tune-pong"))
         self.assertEqual(trials[1].experiment_tag, "1")
 
     def testEval(self):
@@ -207,7 +221,6 @@ def testEval(self):
         self.assertEqual(len(trials), 1)
         self.assertEqual(trials[0].config, {"foo": 4})
         self.assertEqual(trials[0].experiment_tag, "0_foo=4")
-        self.assertEqual(trials[0].local_dir, "/tmp/ray/")
 
     def testGridSearch(self):
         trials = generate_trials({