Skip to content

Commit

Permalink
[tune] Clean up result logging: move out of /tmp, add timestamp (ray-…
Browse files Browse the repository at this point in the history
  • Loading branch information
ericl authored Dec 15, 2017
1 parent 12fdb3f commit fbf1806
Show file tree
Hide file tree
Showing 11 changed files with 64 additions and 26 deletions.
2 changes: 1 addition & 1 deletion doc/source/example-a3c.rst
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,6 @@ workers, we can train the agent in around 25 minutes.

You can visualize performance by running
:code:`tensorboard --logdir [directory]` in a separate screen, where
:code:`[directory]` is defaulted to :code:`/tmp/ray/`. If you are running
:code:`[directory]` defaults to :code:`~/ray_results/`. If you are running
multiple experiments, be sure to vary the directory to which TensorFlow saves
its progress (found in :code:`a3c.py`).
2 changes: 1 addition & 1 deletion doc/source/example-policy-gradient.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ TensorBoard to the log output directory as follows.

.. code-block:: bash
tensorboard --logdir=/tmp/ray
tensorboard --logdir=~/ray_results
Many of the TensorBoard metrics are also printed to the console, but you might
find it easier to visualize and compare between runs using the TensorBoard UI.
Expand Down
4 changes: 2 additions & 2 deletions doc/source/rllib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,15 @@ You can train a simple DQN agent with the following command

python ray/python/ray/rllib/train.py --run DQN --env CartPole-v0

By default, the results will be logged to a subdirectory of ``/tmp/ray``.
By default, the results will be logged to a subdirectory of ``~/ray_results``.
This subdirectory will contain a file ``params.json`` which contains the
hyperparameters, a file ``result.json`` which contains a training summary
for each episode, and a TensorBoard file that can be used to visualize
the training process with TensorBoard by running

::

tensorboard --logdir=/tmp/ray
tensorboard --logdir=~/ray_results


The ``train.py`` script has a number of options you can show by running
Expand Down
8 changes: 4 additions & 4 deletions doc/source/tune.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ This script runs a small grid search over the ``my_func`` function using ray.tun
== Status ==
Using FIFO scheduling algorithm.
Resources used: 4/8 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/my_experiment
Result logdir: ~/ray_results/my_experiment
- my_func_0_alpha=0.2,beta=1: RUNNING [pid=6778], 209 s, 20604 ts, 7.29 acc
- my_func_1_alpha=0.4,beta=1: RUNNING [pid=6780], 208 s, 20522 ts, 53.1 acc
- my_func_2_alpha=0.6,beta=1: TERMINATED [pid=6789], 21 s, 2190 ts, 101 acc
Expand All @@ -63,14 +63,14 @@ In order to report incremental progress, ``my_func`` periodically calls the ``re
Visualizing Results
-------------------

Ray.tune logs trial results to a unique directory per experiment, e.g. ``/tmp/ray/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:
Ray.tune logs trial results to a unique directory per experiment, e.g. ``~/ray_results/my_experiment`` in the above example. The log records are compatible with a number of visualization tools:

To visualize learning in TensorBoard, run:

::

$ pip install tensorboard
$ tensorboard --logdir=/tmp/ray/my_experiment
$ tensorboard --logdir=~/ray_results/my_experiment

.. image:: ray-tune-tensorboard.png

Expand All @@ -79,7 +79,7 @@ To use rllab's VisKit (you may have to install some dependencies), run:
::

$ git clone https://github.com/rll/rllab.git
$ python rllab/rllab/viskit/frontend.py /tmp/ray/my_experiment
$ python rllab/rllab/viskit/frontend.py ~/ray_results/my_experiment

.. image:: ray-tune-viskit.png

Expand Down
12 changes: 7 additions & 5 deletions python/ray/rllib/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import tensorflow as tf
from ray.tune.logger import UnifiedLogger
from ray.tune.registry import ENV_CREATOR
from ray.tune.result import TrainingResult
from ray.tune.result import DEFAULT_RESULTS_DIR, TrainingResult
from ray.tune.trainable import Trainable

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -72,7 +72,6 @@ class Agent(Trainable):

_allow_unknown_configs = False
_allow_unknown_subkeys = []
_default_logdir = "/tmp/ray"

def __init__(
self, config={}, env=None, registry=None, logger_creator=None):
Expand Down Expand Up @@ -111,10 +110,10 @@ def __init__(
logdir_suffix = "{}_{}_{}".format(
env, self._agent_name,
datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
if not os.path.exists(self._default_logdir):
os.makedirs(self._default_logdir)
if not os.path.exists(DEFAULT_RESULTS_DIR):
os.makedirs(DEFAULT_RESULTS_DIR)
self.logdir = tempfile.mkdtemp(
prefix=logdir_suffix, dir=self._default_logdir)
prefix=logdir_suffix, dir=DEFAULT_RESULTS_DIR)
self._result_logger = UnifiedLogger(self.config, self.logdir, None)

self._iteration = 0
Expand Down Expand Up @@ -155,8 +154,11 @@ def train(self):
self._time_total += time_this_iter
self._timesteps_total += result.timesteps_this_iter

now = datetime.today()
result = result._replace(
experiment_id=self._experiment_id,
date=now.strftime("%Y-%m-%d_%H-%M-%S"),
timestamp=int(time.mktime(now.timetuple())),
training_iteration=self._iteration,
timesteps_total=self._timesteps_total,
time_this_iter_s=time_this_iter,
Expand Down
2 changes: 1 addition & 1 deletion python/ray/rllib/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
else:
# Note: keep this in sync with tune/config_parser.py
experiments = {
args.experiment_name: { # i.e. log to /tmp/ray/default
args.experiment_name: { # i.e. log to ~/ray_results/default
"run": args.run,
"checkpoint_freq": args.checkpoint_freq,
"local_dir": args.local_dir,
Expand Down
3 changes: 2 additions & 1 deletion python/ray/tune/ParallelCoordinatesVisualization.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"from ray.tune.visual_utils import load_results_to_df, generate_plotly_dim_dict\n",
"import plotly\n",
Expand All @@ -46,7 +47,7 @@
},
"outputs": [],
"source": [
"RESULTS_DIR = \"/tmp/ray/\"\n",
"RESULTS_DIR = os.path.expanduser(\"~/ray_results\")\n",
"df = load_results_to_df(RESULTS_DIR)\n",
"[key for key in df]"
]
Expand Down
6 changes: 4 additions & 2 deletions python/ray/tune/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import json

from ray.tune import TuneError
from ray.tune.result import DEFAULT_RESULTS_DIR
from ray.tune.trial import Resources


Expand Down Expand Up @@ -63,8 +64,9 @@ def make_parser(**kwargs):
"--repeat", default=1, type=int,
help="Number of times to repeat each trial.")
parser.add_argument(
"--local-dir", default="/tmp/ray", type=str,
help="Local dir to save training results to. Defaults to '/tmp/ray'.")
"--local-dir", default=DEFAULT_RESULTS_DIR, type=str,
help="Local dir to save training results to. Defaults to '{}'.".format(
DEFAULT_RESULTS_DIR))
parser.add_argument(
"--upload-dir", default="", type=str,
help="Optional URI to upload training results to.")
Expand Down
15 changes: 14 additions & 1 deletion python/ray/tune/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from collections import namedtuple
import json
import os

try:
import yaml
Expand All @@ -20,6 +21,9 @@
In RLlib, the supplied algorithms fill in TrainingResult for you.
"""

# Where ray.tune writes result files by default
DEFAULT_RESULTS_DIR = os.path.expanduser("~/ray_results")


TrainingResult = namedtuple("TrainingResult", [
# (Required) Accumulated timesteps for this entire experiment.
Expand All @@ -40,9 +44,12 @@
# (Optional) The number of episodes total.
"episodes_total",

# (Optional) The current training accuracy if applicable>
# (Optional) The current training accuracy if applicable.
"mean_accuracy",

# (Optional) The current validation accuracy if applicable.
"mean_validation_accuracy",

# (Optional) The current training loss if applicable.
"mean_loss",

Expand All @@ -69,6 +76,12 @@
# (Auto-filled) The pid of the training process.
"pid",

# (Auto-filled) A formatted date of when the result was processed.
"date",

# (Auto-filled) A UNIX timestamp of when the result was processed.
"timestamp",

# (Auto-filled) The hostname of the machine hosting the training process.
"hostname",
])
Expand Down
19 changes: 13 additions & 6 deletions python/ray/tune/trial.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import division
from __future__ import print_function

from datetime import datetime
import tempfile
import traceback
import ray
Expand All @@ -10,7 +11,7 @@
from collections import namedtuple
from ray.tune import TuneError
from ray.tune.logger import NoopLogger, UnifiedLogger
from ray.tune.result import TrainingResult
from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR
from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS


Expand Down Expand Up @@ -62,7 +63,7 @@ class Trial(object):
ERROR = "ERROR"

def __init__(
self, trainable_name, config={}, local_dir='/tmp/ray',
self, trainable_name, config={}, local_dir=DEFAULT_RESULTS_DIR,
experiment_tag=None, resources=Resources(cpu=1, gpu=0),
stopping_criterion={}, checkpoint_freq=0,
restore_path=None, upload_dir=None):
Expand Down Expand Up @@ -295,16 +296,22 @@ def _setup_runner(self):
if not os.path.exists(self.local_dir):
os.makedirs(self.local_dir)
self.logdir = tempfile.mkdtemp(
prefix=str(self), dir=self.local_dir)
prefix=str(self), dir=self.local_dir,
suffix=datetime.today().strftime("_%Y-%m-%d_%H-%M-%S"))
self.result_logger = UnifiedLogger(
self.config, self.logdir, self.upload_dir)
remote_logdir = self.logdir

def logger_creator(config):
# Set the working dir in the remote process, for user file writes
os.chdir(remote_logdir)
return NoopLogger(config, remote_logdir)

# Logging for trials is handled centrally by TrialRunner, so
# configure the remote runner to use a noop-logger.
self.runner = cls.remote(
config=self.config,
registry=get_registry(),
logger_creator=lambda config: NoopLogger(config, remote_logdir))
config=self.config, registry=get_registry(),
logger_creator=logger_creator)

def __str__(self):
if "env" in self.config:
Expand Down
17 changes: 15 additions & 2 deletions test/trial_runner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ray.tune import Trainable, TuneError
from ray.tune import register_env, register_trainable, run_experiments
from ray.tune.registry import _default_registry, TRAINABLE_CLASS
from ray.tune.result import DEFAULT_RESULTS_DIR
from ray.tune.trial import Trial, Resources
from ray.tune.trial_runner import TrialRunner
from ray.tune.variant_generator import generate_trials, grid_search, \
Expand Down Expand Up @@ -63,6 +64,17 @@ def train(config, reporter):
"config": {"a": "b"},
}})

def testLogdir(self):
def train(config, reporter):
assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
reporter(timesteps_total=1)
register_trainable("f1", train)
run_experiments({"foo": {
"run": "f1",
"local_dir": "/tmp/logdir",
"config": {"a": "b"},
}})

def testBadParams(self):
def f():
run_experiments({"foo": {}})
Expand Down Expand Up @@ -191,7 +203,9 @@ def testParseToTrials(self):
self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"})
self.assertEqual(trials[0].trainable_name, "PPO")
self.assertEqual(trials[0].experiment_tag, "0")
self.assertEqual(trials[0].local_dir, "/tmp/ray/tune-pong")
self.assertEqual(
trials[0].local_dir,
os.path.join(DEFAULT_RESULTS_DIR, "tune-pong"))
self.assertEqual(trials[1].experiment_tag, "1")

def testEval(self):
Expand All @@ -207,7 +221,6 @@ def testEval(self):
self.assertEqual(len(trials), 1)
self.assertEqual(trials[0].config, {"foo": 4})
self.assertEqual(trials[0].experiment_tag, "0_foo=4")
self.assertEqual(trials[0].local_dir, "/tmp/ray/")

def testGridSearch(self):
trials = generate_trials({
Expand Down

0 comments on commit fbf1806

Please sign in to comment.