From fbf1806b8a0dab73b909eea78b57739353551be7 Mon Sep 17 00:00:00 2001
From: Eric Liang
Date: Fri, 15 Dec 2017 14:19:08 -0800
Subject: [PATCH] [tune] Clean up result logging: move out of /tmp, add timestamp (#1297)

---
 doc/source/example-a3c.rst                 |  2 +-
 doc/source/example-policy-gradient.rst     |  2 +-
 doc/source/rllib.rst                       |  4 ++--
 doc/source/tune.rst                        |  8 ++++----
 python/ray/rllib/agent.py                  | 12 +++++++-----
 python/ray/rllib/train.py                  |  2 +-
 .../ParallelCoordinatesVisualization.ipynb |  3 ++-
 python/ray/tune/config_parser.py           |  6 ++++--
 python/ray/tune/result.py                  | 15 ++++++++++++++-
 python/ray/tune/trial.py                   | 19 +++++++++++++------
 test/trial_runner_test.py                  | 17 +++++++++++++++--
 11 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/doc/source/example-a3c.rst b/doc/source/example-a3c.rst
index 38fc9600f3acc..096619f19dfbd 100644
--- a/doc/source/example-a3c.rst
+++ b/doc/source/example-a3c.rst
@@ -153,6 +153,6 @@ workers, we can train the agent in around 25 minutes.
 
 You can visualize performance by running
 :code:`tensorboard --logdir [directory]` in a separate screen, where
-:code:`[directory]` is defaulted to :code:`/tmp/ray/`. If you are running
+:code:`[directory]` is defaulted to :code:`~/ray_results/`. If you are running
 multiple experiments, be sure to vary the directory to which Tensorflow saves
 its progress (found in :code:`a3c.py`).
diff --git a/doc/source/example-policy-gradient.rst b/doc/source/example-policy-gradient.rst
index a0d98821f361d..02f47a5703205 100644
--- a/doc/source/example-policy-gradient.rst
+++ b/doc/source/example-policy-gradient.rst
@@ -28,7 +28,7 @@ TensorBoard to the log output directory as follows.
 
 .. code-block:: bash
 
-  tensorboard --logdir=/tmp/ray
+  tensorboard --logdir=~/ray_results
 
 Many of the TensorBoard metrics are also printed to the console, but you might
 find it easier to visualize and compare between runs using the TensorBoard UI.
diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
index 7075b4106f302..8cd5499b1faea 100644
--- a/doc/source/rllib.rst
+++ b/doc/source/rllib.rst
@@ -59,7 +59,7 @@ You can train a simple DQN agent with the following command
 
     python ray/python/ray/rllib/train.py --run DQN --env CartPole-v0
 
-By default, the results will be logged to a subdirectory of ``/tmp/ray``.
+By default, the results will be logged to a subdirectory of ``~/ray_results``.
 This subdirectory will contain a file ``params.json`` which contains the
 hyperparameters, a file ``result.json`` which contains a training summary
 for each episode and a TensorBoard file that can be used to visualize
@@ -67,7 +67,7 @@ training process with TensorBoard by running
 
 ::
 
-    tensorboard --logdir=/tmp/ray
+    tensorboard --logdir=~/ray_results
 
 The ``train.py`` script has a number of options you can show by running
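As the ``rllib.rst`` change above notes, each run now writes ``params.json`` and ``result.json`` into a subdirectory of ``~/ray_results``. Below is a minimal sketch of inspecting those files, assuming results are stored as one JSON record per line (the directory layout is taken from the docs above; none of this code is part of the patch):

    import glob
    import json
    import os

    # Scan experiment subdirectories under the new default results dir.
    results_dir = os.path.expanduser("~/ray_results")
    for logdir in sorted(glob.glob(os.path.join(results_dir, "*"))):
        result_file = os.path.join(logdir, "result.json")
        if not os.path.exists(result_file):
            continue
        with open(result_file) as f:
            for line in f:
                record = json.loads(line)
                # Auto-filled metadata; "timestamp" exists only after
                # this patch is applied.
                print(logdir, record.get("training_iteration"),
                      record.get("timestamp"))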
diff --git a/doc/source/tune.rst b/doc/source/tune.rst
index c4d8769a3e438..6752b7f109f77 100644
--- a/doc/source/tune.rst
+++ b/doc/source/tune.rst
@@ -50,7 +50,7 @@ This script runs a small grid search over the ``my_func`` function using ray.tun
     == Status ==
     Using FIFO scheduling algorithm.
     Resources used: 4/8 CPUs, 0/0 GPUs
-    Result logdir: /tmp/ray/my_experiment
+    Result logdir: ~/ray_results/my_experiment
     - my_func_0_alpha=0.2,beta=1: RUNNING [pid=6778], 209 s, 20604 ts, 7.29 acc
     - my_func_1_alpha=0.4,beta=1: RUNNING [pid=6780], 208 s, 20522 ts, 53.1 acc
     - my_func_2_alpha=0.6,beta=1: TERMINATED [pid=6789], 21 s, 2190 ts, 101 acc
@@ -63,14 +63,14 @@ In order to report incremental progress, ``my_func`` periodically calls the ``re
 Visualizing Results
 -------------------
 
-Ray.tune logs trial results to a unique directory per experiment, e.g. ``/tmp/ray/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:
+Ray.tune logs trial results to a unique directory per experiment, e.g. ``~/ray_results/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:
 
 To visualize learning in tensorboard, run:
 
 ::
 
     $ pip install tensorboard
-    $ tensorboard --logdir=/tmp/ray/my_experiment
+    $ tensorboard --logdir=~/ray_results/my_experiment
 
 .. image:: ray-tune-tensorboard.png
 
@@ -79,7 +79,7 @@ To use rllab's VisKit (you may have to install some dependencies), run:
 ::
 
     $ git clone https://github.com/rll/rllab.git
-    $ python rllab/rllab/viskit/frontend.py /tmp/ray/my_experiment
+    $ python rllab/rllab/viskit/frontend.py ~/ray_results/my_experiment
 
 .. image:: ray-tune-viskit.png
diff --git a/python/ray/rllib/agent.py b/python/ray/rllib/agent.py
index a889e06777fb4..c43e2357ceac3 100644
--- a/python/ray/rllib/agent.py
+++ b/python/ray/rllib/agent.py
@@ -18,7 +18,7 @@ import tensorflow as tf
 
 from ray.tune.logger import UnifiedLogger
 from ray.tune.registry import ENV_CREATOR
-from ray.tune.result import TrainingResult
+from ray.tune.result import DEFAULT_RESULTS_DIR, TrainingResult
 from ray.tune.trainable import Trainable
 
 logger = logging.getLogger(__name__)
@@ -72,7 +72,6 @@ class Agent(Trainable):
 
     _allow_unknown_configs = False
     _allow_unknown_subkeys = []
-    _default_logdir = "/tmp/ray"
 
     def __init__(
             self, config={}, env=None, registry=None, logger_creator=None):
@@ -111,10 +110,10 @@ def __init__(
             logdir_suffix = "{}_{}_{}".format(
                 env, self._agent_name,
                 datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
-            if not os.path.exists(self._default_logdir):
-                os.makedirs(self._default_logdir)
+            if not os.path.exists(DEFAULT_RESULTS_DIR):
+                os.makedirs(DEFAULT_RESULTS_DIR)
             self.logdir = tempfile.mkdtemp(
-                prefix=logdir_suffix, dir=self._default_logdir)
+                prefix=logdir_suffix, dir=DEFAULT_RESULTS_DIR)
             self._result_logger = UnifiedLogger(self.config, self.logdir, None)
 
         self._iteration = 0
@@ -155,8 +154,11 @@ def train(self):
         self._time_total += time_this_iter
         self._timesteps_total += result.timesteps_this_iter
 
+        now = datetime.today()
         result = result._replace(
             experiment_id=self._experiment_id,
+            date=now.strftime("%Y-%m-%d_%H-%M-%S"),
+            timestamp=int(time.mktime(now.timetuple())),
             training_iteration=self._iteration,
             timesteps_total=self._timesteps_total,
             time_this_iter_s=time_this_iter,
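The two auto-filled fields added in ``train()`` above are derived from a single ``datetime.today()`` call. A minimal standalone sketch of the same computation (variable names here are illustrative, not part of the patch):

    import time
    from datetime import datetime

    now = datetime.today()
    # Human-readable form, matching the "%Y-%m-%d_%H-%M-%S" format above,
    # e.g. "2017-12-15_14-19-08".
    date = now.strftime("%Y-%m-%d_%H-%M-%S")
    # Integer UNIX timestamp; time.mktime() interprets the struct_time in
    # local time, so both fields describe the same wall-clock instant.
    timestamp = int(time.mktime(now.timetuple()))
    print(date, timestamp)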
diff --git a/python/ray/rllib/train.py b/python/ray/rllib/train.py
index bb4625a931b10..23bcf935413b2 100755
--- a/python/ray/rllib/train.py
+++ b/python/ray/rllib/train.py
@@ -57,7 +57,7 @@
     else:
         # Note: keep this in sync with tune/config_parser.py
         experiments = {
-            args.experiment_name: {  # i.e. log to /tmp/ray/default
+            args.experiment_name: {  # i.e. log to ~/ray_results/default
                 "run": args.run,
                 "checkpoint_freq": args.checkpoint_freq,
                 "local_dir": args.local_dir,
diff --git a/python/ray/tune/ParallelCoordinatesVisualization.ipynb b/python/ray/tune/ParallelCoordinatesVisualization.ipynb
index 6bb9222685a94..ab6254969f586 100644
--- a/python/ray/tune/ParallelCoordinatesVisualization.ipynb
+++ b/python/ray/tune/ParallelCoordinatesVisualization.ipynb
@@ -24,6 +24,7 @@
    },
    "outputs": [],
    "source": [
+    "import os\n",
     "import pandas as pd\n",
     "from ray.tune.visual_utils import load_results_to_df, generate_plotly_dim_dict\n",
     "import plotly\n",
@@ -46,7 +47,7 @@
    },
    "outputs": [],
    "source": [
-    "RESULTS_DIR = \"/tmp/ray/\"\n",
+    "RESULTS_DIR = os.path.expanduser(\"~/ray_results\")\n",
     "df = load_results_to_df(RESULTS_DIR)\n",
     "[key for key in df]"
   ]
diff --git a/python/ray/tune/config_parser.py b/python/ray/tune/config_parser.py
index 05552c3b1f517..985c54c25a732 100644
--- a/python/ray/tune/config_parser.py
+++ b/python/ray/tune/config_parser.py
@@ -7,6 +7,7 @@
 import json
 
 from ray.tune import TuneError
+from ray.tune.result import DEFAULT_RESULTS_DIR
 from ray.tune.trial import Resources
 
 
@@ -63,8 +64,9 @@ def make_parser(**kwargs):
         "--repeat", default=1, type=int,
         help="Number of times to repeat each trial.")
     parser.add_argument(
-        "--local-dir", default="/tmp/ray", type=str,
-        help="Local dir to save training results to. Defaults to '/tmp/ray'.")
+        "--local-dir", default=DEFAULT_RESULTS_DIR, type=str,
+        help="Local dir to save training results to. Defaults to '{}'.".format(
+            DEFAULT_RESULTS_DIR))
     parser.add_argument(
         "--upload-dir", default="", type=str,
         help="Optional URI to upload training results to.")
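Because ``DEFAULT_RESULTS_DIR`` is built with ``os.path.expanduser`` (see the ``result.py`` diff that follows), the ``--local-dir`` default and its help string carry an already-absolute path rather than a literal ``~``. A small sketch of that behavior, mirroring but not reusing the real parser:

    import argparse
    import os

    DEFAULT_RESULTS_DIR = os.path.expanduser("~/ray_results")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--local-dir", default=DEFAULT_RESULTS_DIR, type=str,
        help="Local dir to save training results to. Defaults to "
             "'{}'.".format(DEFAULT_RESULTS_DIR))
    args = parser.parse_args([])
    # expanduser has already resolved "~", so downstream calls such as
    # os.makedirs() never see the literal tilde.
    print(args.local_dir)  # e.g. /home/user/ray_results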
"hostname", ]) diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 02661be1f534b..0d6aa9fb3611d 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -2,6 +2,7 @@ from __future__ import division from __future__ import print_function +from datetime import datetime import tempfile import traceback import ray @@ -10,7 +11,7 @@ from collections import namedtuple from ray.tune import TuneError from ray.tune.logger import NoopLogger, UnifiedLogger -from ray.tune.result import TrainingResult +from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS @@ -62,7 +63,7 @@ class Trial(object): ERROR = "ERROR" def __init__( - self, trainable_name, config={}, local_dir='/tmp/ray', + self, trainable_name, config={}, local_dir=DEFAULT_RESULTS_DIR, experiment_tag=None, resources=Resources(cpu=1, gpu=0), stopping_criterion={}, checkpoint_freq=0, restore_path=None, upload_dir=None): @@ -295,16 +296,22 @@ def _setup_runner(self): if not os.path.exists(self.local_dir): os.makedirs(self.local_dir) self.logdir = tempfile.mkdtemp( - prefix=str(self), dir=self.local_dir) + prefix=str(self), dir=self.local_dir, + suffix=datetime.today().strftime("_%Y-%m-%d_%H-%M-%S")) self.result_logger = UnifiedLogger( self.config, self.logdir, self.upload_dir) remote_logdir = self.logdir + + def logger_creator(config): + # Set the working dir in the remote process, for user file writes + os.chdir(remote_logdir) + return NoopLogger(config, remote_logdir) + # Logging for trials is handled centrally by TrialRunner, so # configure the remote runner to use a noop-logger. self.runner = cls.remote( - config=self.config, - registry=get_registry(), - logger_creator=lambda config: NoopLogger(config, remote_logdir)) + config=self.config, registry=get_registry(), + logger_creator=logger_creator) def __str__(self): if "env" in self.config: diff --git a/test/trial_runner_test.py b/test/trial_runner_test.py index 1a7c56a1a1cb2..26be34db8b57f 100644 --- a/test/trial_runner_test.py +++ b/test/trial_runner_test.py @@ -12,6 +12,7 @@ from ray.tune import Trainable, TuneError from ray.tune import register_env, register_trainable, run_experiments from ray.tune.registry import _default_registry, TRAINABLE_CLASS +from ray.tune.result import DEFAULT_RESULTS_DIR from ray.tune.trial import Trial, Resources from ray.tune.trial_runner import TrialRunner from ray.tune.variant_generator import generate_trials, grid_search, \ @@ -63,6 +64,17 @@ def train(config, reporter): "config": {"a": "b"}, }}) + def testLogdir(self): + def train(config, reporter): + assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd() + reporter(timesteps_total=1) + register_trainable("f1", train) + run_experiments({"foo": { + "run": "f1", + "local_dir": "/tmp/logdir", + "config": {"a": "b"}, + }}) + def testBadParams(self): def f(): run_experiments({"foo": {}}) @@ -191,7 +203,9 @@ def testParseToTrials(self): self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"}) self.assertEqual(trials[0].trainable_name, "PPO") self.assertEqual(trials[0].experiment_tag, "0") - self.assertEqual(trials[0].local_dir, "/tmp/ray/tune-pong") + self.assertEqual( + trials[0].local_dir, + os.path.join(DEFAULT_RESULTS_DIR, "tune-pong")) self.assertEqual(trials[1].experiment_tag, "1") def testEval(self): @@ -207,7 +221,6 @@ def testEval(self): self.assertEqual(len(trials), 1) self.assertEqual(trials[0].config, {"foo": 4}) self.assertEqual(trials[0].experiment_tag, "0_foo=4") - 
diff --git a/test/trial_runner_test.py b/test/trial_runner_test.py
index 1a7c56a1a1cb2..26be34db8b57f 100644
--- a/test/trial_runner_test.py
+++ b/test/trial_runner_test.py
@@ -12,6 +12,7 @@
 from ray.tune import Trainable, TuneError
 from ray.tune import register_env, register_trainable, run_experiments
 from ray.tune.registry import _default_registry, TRAINABLE_CLASS
+from ray.tune.result import DEFAULT_RESULTS_DIR
 from ray.tune.trial import Trial, Resources
 from ray.tune.trial_runner import TrialRunner
 from ray.tune.variant_generator import generate_trials, grid_search, \
@@ -63,6 +64,17 @@ def train(config, reporter):
             "config": {"a": "b"},
         }})
 
+    def testLogdir(self):
+        def train(config, reporter):
+            assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
+            reporter(timesteps_total=1)
+        register_trainable("f1", train)
+        run_experiments({"foo": {
+            "run": "f1",
+            "local_dir": "/tmp/logdir",
+            "config": {"a": "b"},
+        }})
+
     def testBadParams(self):
         def f():
             run_experiments({"foo": {}})
@@ -191,7 +203,9 @@ def testParseToTrials(self):
         self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"})
         self.assertEqual(trials[0].trainable_name, "PPO")
         self.assertEqual(trials[0].experiment_tag, "0")
-        self.assertEqual(trials[0].local_dir, "/tmp/ray/tune-pong")
+        self.assertEqual(
+            trials[0].local_dir,
+            os.path.join(DEFAULT_RESULTS_DIR, "tune-pong"))
         self.assertEqual(trials[1].experiment_tag, "1")
 
     def testEval(self):
@@ -207,7 +221,6 @@ def testEval(self):
         self.assertEqual(len(trials), 1)
         self.assertEqual(trials[0].config, {"foo": 4})
         self.assertEqual(trials[0].experiment_tag, "0_foo=4")
-        self.assertEqual(trials[0].local_dir, "/tmp/ray/")
 
     def testGridSearch(self):
         trials = generate_trials({