[RLlib] Upgrade gym version to 0.21 and deprecate pendulum-v0. (ray-project#19535)

* Fix QMix, SAC, and MADDPG too.

* Unpin gym and deprecate pendulum v0

Many tests in RLlib depended on Pendulum-v0; however, in gym 0.21, Pendulum-v0
was deprecated in favor of Pendulum-v1. This may change reward thresholds, so we
may have to rerun all of the Pendulum-v1 benchmarks or switch to another
environment (see the sketch after this message). The same applies to
FrozenLake-v0 and FrozenLake-v1.

Lastly, all of the RLlib tests have been moved to Python 3.7.

* Add gym installation based on python version.

Pin Python <= 3.6 to gym 0.19 due to install issues with Atari ROMs in gym 0.20.

* Reformatting

* Fixing tests

* Move atari-py install conditional to req.txt

* Migrate to new ALE install method


* Make parametric_actions_cartpole return float32 actions/obs

* Add type conversions if obs/actions don't match the space

* Add utils to make elements match gym space dtypes

Co-authored-by: Jun Gong <[email protected]>
Co-authored-by: sven1977 <[email protected]>
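
As a quick, hedged illustration of the rename this commit deals with (not part of the diff itself): under gym >= 0.21 only Pendulum-v1 is registered, so code that still requests Pendulum-v0 needs an explicit fallback.

    import gym

    # Pendulum-v0 was removed in gym 0.21; Pendulum-v1 replaces it.
    # Pick whichever id the installed gym version actually registers.
    def make_pendulum():
        try:
            return gym.make("Pendulum-v1")   # gym >= 0.21
        except gym.error.Error:
            return gym.make("Pendulum-v0")   # gym <= 0.19 fallback

    env = make_pendulum()
    obs = env.reset()  # gym 0.21 reset() still returns only the observation
    print(env.spec.id, obs.shape)

The two environment versions are not guaranteed to produce identical reward scales, which is why the message above flags possibly rerunning the tuned Pendulum benchmarks.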
3 people authored Nov 3, 2021
1 parent e1e0cb5 commit 026bf01
Showing 70 changed files with 266 additions and 145 deletions.
4 changes: 3 additions & 1 deletion .buildkite/pipeline.gpu.yml
@@ -12,7 +12,9 @@
conditions: ["RAY_CI_RLLIB_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
- RLLIB_TESTING=1 ./ci/travis/install-dependencies.sh
- RLLIB_TESTING=1 PYTHON=3.7 ./ci/travis/install-dependencies.sh
# Because Python version changed, we need to re-install Ray here
- rm -rf ./python/ray/thirdparty_files; rm -rf ./python/ray/pickle5_files; ./ci/travis/ci.sh build
- pip install -Ur ./python/requirements_ml_docker.txt
- ./ci/travis/env_info.sh
# --jobs 1 is necessary as we only have 1 GPU on the machine and running tests in parallel
8 changes: 2 additions & 6 deletions .buildkite/pipeline.yml
@@ -578,19 +578,15 @@
conditions: ["RAY_CI_TUNE_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
- TUNE_TESTING=1 PYTHON=3.7 ./ci/travis/install-dependencies.sh
# Because Python version changed, we need to re-install Ray here
- rm -rf ./python/ray/thirdparty_files; rm -rf ./python/ray/pickle5_files; ./ci/travis/ci.sh build
- TUNE_TESTING=1 ./ci/travis/install-dependencies.sh
- bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-example,-flaky,-py37,-soft_imports,-gpu_only python/ray/tune/...
- bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=example,-tf,-pytorch,-py37,-flaky,-soft_imports,-gpu_only python/ray/tune/...

- label: ":octopus: Tune tests and examples {2/2}"
conditions: ["RAY_CI_TUNE_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
- TUNE_TESTING=1 PYTHON=3.7 ./ci/travis/install-dependencies.sh
# Because Python version changed, we need to re-install Ray here
- rm -rf ./python/ray/thirdparty_files; rm -rf ./python/ray/pickle5_files; ./ci/travis/ci.sh build
- TUNE_TESTING=1 ./ci/travis/install-dependencies.sh
- bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37,-flaky,-soft_imports,-gpu_only python/ray/tune/...
- bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37,-flaky,-soft_imports,-gpu_only python/ray/tune/...

12 changes: 6 additions & 6 deletions ci/travis/install-dependencies.sh
@@ -358,12 +358,12 @@ install_dependencies() {
# install the following packages for testing on travis only
pip install 'recsim>=0.2.4'

# Install Atari ROMs. Previously these have been shipped with atari_py
if [[ "${OSTYPE}" = linux* ]]; then
bash "${WORKSPACE_DIR}"/rllib/utils/install_atari_roms.sh
else
echo "Not installing Atari roms on ${OSTYPE}"
fi
# # Install Atari ROMs. Previously these have been shipped with atari_py
# if [[ "${OSTYPE}" = linux* ]]; then
# bash "${WORKSPACE_DIR}"/rllib/utils/install_atari_roms.sh
# else
# echo "Not installing Atari roms on ${OSTYPE}"
# fi
fi

# Additional Tune/SGD/Doc test dependencies.
9 changes: 6 additions & 3 deletions dashboard/client/package-lock.json

Some generated files are not rendered by default.

10 changes: 5 additions & 5 deletions doc/source/rllib-algorithms.rst
@@ -87,7 +87,7 @@ Ape-X variations of DQN and DDPG (`APEX_DQN <https://github.com/ray-project/ray/

Ape-X architecture

Tuned examples: `PongNoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/pong-apex.yaml>`__, `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/pendulum-apex-ddpg.yaml>`__, `MountainCarContinuous-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/mountaincarcontinuous-apex-ddpg.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/atari-apex.yaml>`__.
Tuned examples: `PongNoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/pong-apex.yaml>`__, `Pendulum-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/pendulum-apex-ddpg.yaml>`__, `MountainCarContinuous-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/mountaincarcontinuous-apex-ddpg.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/atari-apex.yaml>`__.

**Atari results @10M steps**: `more details <https://github.com/ray-project/rl-experiments>`__

@@ -275,7 +275,7 @@ DDPG is implemented similarly to DQN (below). The algorithm can be scaled by inc

DDPG architecture (same as DQN)

Tuned examples: `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/pendulum-ddpg.yaml>`__, `MountainCarContinuous-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/mountaincarcontinuous-ddpg.yaml>`__, `HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml>`__, `TD3 Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/pendulum-td3.yaml>`__, `TD3 InvertedPendulum-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/invertedpendulum-td3.yaml>`__, `TD3 Mujoco suite (Ant-v2, HalfCheetah-v2, Hopper-v2, Walker2d-v2) <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/mujoco-td3.yaml>`__.
Tuned examples: `Pendulum-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/pendulum-ddpg.yaml>`__, `MountainCarContinuous-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/mountaincarcontinuous-ddpg.yaml>`__, `HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml>`__, `TD3 Pendulum-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/pendulum-td3.yaml>`__, `TD3 InvertedPendulum-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/invertedpendulum-td3.yaml>`__, `TD3 Mujoco suite (Ant-v2, HalfCheetah-v2, Hopper-v2, Walker2d-v2) <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ddpg/mujoco-td3.yaml>`__.

**DDPG-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):

@@ -384,7 +384,7 @@ Tuned examples:
`Unity3D Soccer (multi-agent: Strikers vs Goalie) <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml>`__,
`Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml>`__,
`Hopper-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/hopper-ppo.yaml>`__,
`Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__,
`Pendulum-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__,
`PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pong-ppo.yaml>`__,
`Walker2d-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/walker2d-ppo.yaml>`__,
`HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml>`__,
@@ -439,7 +439,7 @@ RLlib's soft-actor critic implementation is ported from the `official SAC repo <
Note that SAC has two fields to configure for custom models: ``policy_model`` and ``Q_model``, the ``model`` field of the config will be ignored.

Tuned examples (continuous actions):
`Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/sac/pendulum-sac.yaml>`__,
`Pendulum-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/sac/pendulum-sac.yaml>`__,
`HalfCheetah-v3 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/sac/halfcheetah-sac.yaml>`__,
Tuned examples (discrete actions):
`CartPole-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/sac/cartpole-sac.yaml>`__
@@ -491,7 +491,7 @@ RLlib's MBMPO implementation is a Dyna-styled model-based RL method that learns
Additional statistics are logged in MBMPO. Each MBMPO iteration corresponds to multiple MAML iterations, and ``MAMLIter$i$_DynaTrajInner_$j$_episode_reward_mean`` measures the agent's returns across the dynamics models at iteration ``i`` of MAML and step ``j`` of inner adaptation. Examples can be seen `here <https://github.com/ray-project/rl-experiments/tree/master/mbmpo>`__.

Tuned examples (continuous actions):
`Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/mbmpo/pendulum-mbmpo.yaml>`__,
`Pendulum-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/mbmpo/pendulum-mbmpo.yaml>`__,
`HalfCheetah <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/mbmpo/halfcheetah-mbmpo.yaml>`__,
`Hopper <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/mbmpo/hopper-mbmpo.yaml>`__,
Tuned examples (discrete actions):
2 changes: 1 addition & 1 deletion doc/source/rllib-env.rst
@@ -98,7 +98,7 @@ There are two ways to scale experience collection with Gym environments:

.. image:: throughput.png

You can also combine vectorization and distributed execution, as shown in the above figure. Here we plot just the throughput of RLlib policy evaluation from 1 to 128 CPUs. PongNoFrameskip-v4 on GPU scales from 2.4k to ∼200k actions/s, and Pendulum-v0 on CPU from 15k to 1.5M actions/s. One machine was used for 1-16 workers, and a Ray cluster of four machines for 32-128 workers. Each worker was configured with ``num_envs_per_worker=64``.
You can also combine vectorization and distributed execution, as shown in the above figure. Here we plot just the throughput of RLlib policy evaluation from 1 to 128 CPUs. PongNoFrameskip-v4 on GPU scales from 2.4k to ∼200k actions/s, and Pendulum-v1 on CPU from 15k to 1.5M actions/s. One machine was used for 1-16 workers, and a Ray cluster of four machines for 32-128 workers. Each worker was configured with ``num_envs_per_worker=64``.

Expensive Environments
~~~~~~~~~~~~~~~~~~~~~~
4 changes: 1 addition & 3 deletions docker/base-deps/Dockerfile
@@ -56,13 +56,11 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \
numpy==1.19.5 \
psutil \
blist \
atari-py \
# blist is needed for numpy (which is re-installed when ray is installed)
# atari-py is built from source for Python 3.8 (requires g++ & zlib1g-dev)
# To avoid the following error on Jenkins:
# AttributeError: 'numpy.ufunc' object has no attribute '__module__'
&& $HOME/anaconda3/bin/pip uninstall -y dask \
# We install cmake temporarily to get psutil, blist & atari-py
# We install cmake temporarily to get psutil, blist
&& sudo apt-get autoremove -y cmake zlib1g-dev \
# We keep g++ on GPU images, because uninstalling removes CUDA Devel tooling
$(if [ "$BASE_IMAGE" = "ubuntu:focal" ]; then echo \
1 change: 1 addition & 0 deletions docker/examples/Dockerfile
@@ -5,6 +5,7 @@ FROM rayproject/ray:latest
# Needed to run Tune example with a 'plot' call - which does not actually render a plot, but throws an error.
RUN apt-get update && apt-get install -y zlib1g-dev libgl1-mesa-dev libgtk2.0-dev && apt-get clean
RUN pip install --no-cache-dir -U pip \
autorom[accept-rom-license] \
gym[atari] \
scikit-image \
tensorflow \
2 changes: 1 addition & 1 deletion docker/ray-ml/Dockerfile
@@ -37,4 +37,4 @@ RUN sudo apt-get update \
RUN python -c "import tensorflow_probability"

# Install Atari ROMs. Previously these have been shipped with atari_py \
RUN ./install_atari_roms.sh
# RUN ./install_atari_roms.sh
4 changes: 1 addition & 3 deletions docker/ray-worker-container/Dockerfile
@@ -45,13 +45,11 @@ RUN apt-get update -y && sudo apt-get upgrade -y \
numpy==1.15.4 \
psutil \
blist \
atari-py \
# blist is needed for numpy (which is re-installed when ray is installed)
# atari-py is built from source for Python 3.8 (requires g++ & zlib1g-dev)
# To avoid the following error on Jenkins:
# AttributeError: 'numpy.ufunc' object has no attribute '__module__'
&& $HOME/anaconda3/bin/pip uninstall -y dask \
# We install cmake temporarily to get psutil, blist & atari-py
# We install cmake temporarily to get psutil, blist
&& sudo apt-get autoremove -y cmake zlib1g-dev \
# We keep g++ on GPU images, because uninstalling removes CUDA Devel tooling
$(if [ "$BASE_IMAGE" = "ubuntu:focal" ]; then echo \
3 changes: 2 additions & 1 deletion python/requirements.txt
@@ -27,7 +27,8 @@ requests
## setup.py extras
dm_tree
flask
gym==0.19
gym>=0.21.0; python_version >= '3.7'
gym==0.19.0; python_version < '3.7'
lz4
scikit-image
opencv-python-headless==4.3.0.36
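
The gym requirement above is split with pip environment markers, so Python 3.7+ gets gym >= 0.21 while Python <= 3.6 stays on 0.19. A hedged runtime sanity check of what that pin implies (illustration only, not part of this diff):

    import sys
    import gym
    from pkg_resources import parse_version

    if sys.version_info >= (3, 7):
        # requirements.txt installs gym>=0.21.0 on Python 3.7+
        assert parse_version(gym.__version__) >= parse_version("0.21.0")
    else:
        # Python <= 3.6 is pinned to gym 0.19.0 (Atari ROM install issues in 0.20)
        assert gym.__version__ == "0.19.0"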
5 changes: 3 additions & 2 deletions python/requirements/ml/requirements_rllib.txt
@@ -3,8 +3,9 @@
# Environment adapters.
# ---------------------
# Atari
atari_py==0.2.9
gym[atari]==0.18.3
autorom[accept-rom-license]
gym[atari]>=0.21.0; python_version >= '3.7'
gym[atari]==0.19.0; python_version < '3.7'
# Kaggle envs.
kaggle_environments==1.7.11
# Unity3D testing
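
Because gym[atari] >= 0.21 no longer ships ROMs through atari-py, the autorom[accept-rom-license] extra pulls them in at install time. A minimal smoke test for that setup (an assumption about intent, not part of this diff):

    import gym

    # gym.make raises an error here if the autorom-provided ROMs are missing.
    env = gym.make("PongNoFrameskip-v4")
    obs = env.reset()
    print(env.action_space, obs.shape)  # expect Discrete(6) and (210, 160, 3)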
4 changes: 3 additions & 1 deletion python/requirements/ml/requirements_tune.txt
@@ -10,7 +10,9 @@ flaml==0.6.7
freezegun==1.1.0
gluoncv==0.10.1.post0
gpy==1.10.0
gym[atari]==0.18.3
autorom[accept-rom-license]
gym[atari]>=0.21.0; python_version >= '3.7'
gym[atari]==0.19.0; python_version < '3.7'
h5py==3.1.0
hpbandster==0.7.4
pymoo<0.5.0 # this is a HEBO dependency, remove after https://github.com/huawei-noah/noah-research/issues/41 is fixed
2 changes: 1 addition & 1 deletion release/long_running_tests/app_config.yaml
@@ -13,6 +13,6 @@ post_build_cmds:
- pip uninstall -y numpy ray || true
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
- pip3 install numpy || true
- pip3 install -U ray[all] gym[atari]
- pip3 install -U ray[all] gym[atari] autorom[accept-rom-license]
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
2 changes: 1 addition & 1 deletion release/long_running_tests/app_config_np.yaml
@@ -11,6 +11,6 @@ post_build_cmds:
- pip uninstall -y numpy ray || true
- sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
- pip3 install numpy==1.19 || true
- pip3 install -U ray[all] gym[atari]
- pip3 install -U ray[all] gym[atari] autorom[accept-rom-license]
- pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
15 changes: 12 additions & 3 deletions rllib/BUILD
@@ -808,7 +808,7 @@ py_test(
main = "train.py", srcs = ["train.py"],
tags = ["team:ml", "quick_train"],
args = [
"--env", "Pendulum-v0",
"--env", "Pendulum-v1",
"--run", "APEX_DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"framework\": \"tf\", \"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\"}'",
@@ -824,7 +824,7 @@ py_test(
size = "small",
tags = ["team:ml", "quick_train"],
args = [
"--env", "FrozenLake-v0",
"--env", "FrozenLake-v1",
"--run", "DQN",
"--config", "'{\"framework\": \"tf\"}'",
"--stop", "'{\"training_iteration\": 1}'"
@@ -1011,7 +1011,7 @@ py_test(
main = "train.py", srcs = ["train.py"],
tags = ["team:ml", "quick_train"],
args = [
"--env", "Pendulum-v0",
"--env", "Pendulum-v1",
"--run", "APPO",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"framework\": \"tf\", \"num_workers\": 2, \"num_gpus\": 0}'",
@@ -1268,6 +1268,15 @@ py_test(
srcs = ["utils/tests/test_framework_agnostic_components.py"]
)

# Spaces/Space utils.
py_test(
name = "test_space_utils",
tags = ["team:ml", "utils"],
size = "large",
srcs = ["utils/spaces/tests/test_space_utils.py"]
)


# TaskPool
py_test(
name = "test_taskpool",
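
The new test_space_utils target covers the "utils to make elements match gym space dtypes" from the commit message. Roughly, such a helper casts observations/actions to the dtype declared by their gym space; the sketch below is an illustration with assumed names, not the actual RLlib API:

    import numpy as np
    from gym.spaces import Box, Discrete

    def match_space_dtype(element, space):
        # Cast an element to the dtype its gym space declares.
        if isinstance(space, Box):
            return np.asarray(element, dtype=space.dtype)
        if isinstance(space, Discrete):
            return int(element)
        return element

    box = Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
    obs = match_space_dtype([0.1, 0.2, 0.3], box)  # Python floats -> float32 array
    assert obs.dtype == np.float32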
2 changes: 1 addition & 1 deletion rllib/agents/a3c/tests/test_a3c.py
@@ -28,7 +28,7 @@ def test_a3c_compilation(self):

# Test against all frameworks.
for _ in framework_iterator(config):
for env in ["CartPole-v1", "Pendulum-v0", "PongDeterministic-v0"]:
for env in ["CartPole-v1", "Pendulum-v1", "PongDeterministic-v0"]:
print("env={}".format(env))
config["model"]["use_lstm"] = env == "CartPole-v1"
trainer = a3c.A3CTrainer(config=config, env=env)
4 changes: 2 additions & 2 deletions rllib/agents/cql/tests/test_cql.py
@@ -27,7 +27,7 @@ def test_cql_compilation(self):

# Learns from a historic-data file.
# To generate this data, first run:
# $ ./train.py --run=SAC --env=Pendulum-v0 \
# $ ./train.py --run=SAC --env=Pendulum-v1 \
# --stop='{"timesteps_total": 50000}' \
# --config='{"output": "/tmp/out"}'
rllib_dir = Path(__file__).parent.parent.parent.parent
@@ -37,7 +37,7 @@
os.path.isfile(data_file)))

config = cql.CQL_DEFAULT_CONFIG.copy()
config["env"] = "Pendulum-v0"
config["env"] = "Pendulum-v1"
config["input"] = [data_file]

# In the files, we use here for testing, actions have already
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/tests/test_apex_ddpg.py
@@ -27,7 +27,7 @@ def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):
for _ in framework_iterator(config):
plain_config = config.copy()
trainer = apex_ddpg.ApexDDPGTrainer(
config=plain_config, env="Pendulum-v0")
config=plain_config, env="Pendulum-v1")

# Test per-worker scale distribution.
infos = trainer.workers.foreach_policy(