[rllib] Enable object store memory limit by default (ray-project#5534)

rsohlot · Aug 26, 2019 · 97ccd75
1 parent 7d28bbb
commit 97ccd75
Show file tree

Hide file tree

Showing 8 changed files with 25 additions and 19 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -172,12 +172,6 @@ script:
   # `cluster_tests.py` runs on Jenkins, not Travis.
   - if [ $RAY_CI_TUNE_AFFECTED == "1" ]; then python -m pytest --durations=10 --timeout=300 --ignore=python/ray/tune/tests/test_cluster.py --ignore=python/ray/tune/tests/test_tune_restore.py --ignore=python/ray/tune/tests/test_actor_reuse.py python/ray/tune/tests; fi
 
-  # ray rllib tests
-  - if [ $RAY_CI_RLLIB_AFFECTED == "1" ]; then ./ci/suppress_output python python/ray/rllib/tests/test_catalog.py; fi
-  - if [ $RAY_CI_RLLIB_AFFECTED == "1" ]; then ./ci/suppress_output python python/ray/rllib/tests/test_filters.py; fi
-  - if [ $RAY_CI_RLLIB_AFFECTED == "1" ]; then ./ci/suppress_output python python/ray/rllib/tests/test_optimizers.py; fi
-  - if [ $RAY_CI_RLLIB_AFFECTED == "1" ]; then ./ci/suppress_output python python/ray/rllib/tests/test_evaluators.py; fi
-
   # ray tests
   # Python3.5+ only. Otherwise we will get `SyntaxError` regardless of how we set the tester.
   - if [ $RAY_CI_PYTHON_AFFECTED == "1" ]; then python -c 'import sys;exit(sys.version_info>=(3,5))' || python -m pytest -v --durations=5 --timeout=300 python/ray/experimental/test/async_test.py; fi

diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh
@@ -1,3 +1,15 @@
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/rllib/tests/test_catalog.py
+
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/rllib/tests/test_optimizers.py
+
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/rllib/tests/test_filters.py
+
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/rllib/tests/test_evaluators.py
+
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/ci/suppress_output python /ray/rllib/tests/test_eager_support.py
 

diff --git a/python/ray/resource_spec.py b/python/ray/resource_spec.py
@@ -183,7 +183,7 @@ def resolve(self, is_head):
         if memory is None:
             memory = (avail_memory - object_store_memory - (redis_max_memory
                                                             if is_head else 0))
-            if memory < 500e6 and memory < 0.05 * system_memory:
+            if memory < 100e6 and memory < 0.05 * system_memory:
                 raise ValueError(
                     "After taking into account object store and redis memory "
                     "usage, the amount of memory on this node available for "

diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py
@@ -149,14 +149,14 @@
     # Object store memory to reserve for the trainer process. Being large
     # enough to fit a few copies of the model weights should be sufficient.
     # This is enabled by default since models are typically quite small.
-    "object_store_memory": 0,
+    "object_store_memory": 200 * 1024 * 1024,
     # Heap memory to reserve for each worker. Should generally be small unless
     # your environment is very heavyweight.
     "memory_per_worker": 0,
     # Object store memory to reserve for each worker. This only needs to be
     # large enough to fit a few sample batches at a time. This is enabled
     # by default since it almost never needs to be larger than ~200MB.
-    "object_store_memory_per_worker": 0,
+    "object_store_memory_per_worker": 200 * 1024 * 1024,
 
     # === Execution ===
     # Number of environments to evaluate vectorwise per worker.

diff --git a/rllib/tests/test_catalog.py b/rllib/tests/test_catalog.py
@@ -63,7 +63,7 @@ def testGymPreprocessors(self):
         self.assertEqual(type(p2), OneHotPreprocessor)
 
     def testTuplePreprocessor(self):
-        ray.init()
+        ray.init(object_store_memory=1000 * 1024 * 1024)
 
         class TupleEnv(object):
             def __init__(self):
@@ -78,7 +78,7 @@ def __init__(self):
             [float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]])
 
     def testCustomPreprocessor(self):
-        ray.init()
+        ray.init(object_store_memory=1000 * 1024 * 1024)
         ModelCatalog.register_custom_preprocessor("foo", CustomPreprocessor)
         ModelCatalog.register_custom_preprocessor("bar", CustomPreprocessor2)
         env = gym.make("CartPole-v0")
@@ -90,7 +90,7 @@ def testCustomPreprocessor(self):
         self.assertEqual(type(p3), NoPreprocessor)
 
     def testDefaultModels(self):
-        ray.init()
+        ray.init(object_store_memory=1000 * 1024 * 1024)
 
         with tf.variable_scope("test1"):
             p1 = ModelCatalog.get_model({
@@ -106,7 +106,7 @@ def testDefaultModels(self):
             self.assertEqual(type(p2), VisionNetwork)
 
     def testCustomModel(self):
-        ray.init()
+        ray.init(object_store_memory=1000 * 1024 * 1024)
         ModelCatalog.register_custom_model("foo", CustomModel)
         p1 = ModelCatalog.get_model({
             "obs": tf.constant([1, 2, 3])
@@ -118,7 +118,7 @@ def testCustomActionDistribution(self):
         class Model():
             pass
 
-        ray.init()
+        ray.init(object_store_memory=1000 * 1024 * 1024)
         # registration
         ModelCatalog.register_custom_action_dist("test",
                                                  CustomActionDistribution)

diff --git a/rllib/tests/test_evaluators.py b/rllib/tests/test_evaluators.py
@@ -34,7 +34,7 @@ def env_creator(env_config):
         agent_classes = [DQNTrainer, A3CTrainer]
 
         for agent_cls in agent_classes:
-            ray.init()
+            ray.init(object_store_memory=1000 * 1024 * 1024)
             register_env("CartPoleWrapped-v0", env_creator)
             agent = agent_cls(
                 env="CartPoleWrapped-v0",

diff --git a/rllib/tests/test_filters.py b/rllib/tests/test_filters.py
@@ -75,7 +75,7 @@ def testBasic(self):
 
 class FilterManagerTest(unittest.TestCase):
     def setUp(self):
-        ray.init(num_cpus=1)
+        ray.init(num_cpus=1, object_store_memory=1000 * 1024 * 1024)
 
     def tearDown(self):
         ray.shutdown()

diff --git a/rllib/tests/test_optimizers.py b/rllib/tests/test_optimizers.py
@@ -26,7 +26,7 @@ def tearDown(self):
         ray.shutdown()
 
     def testBasic(self):
-        ray.init(num_cpus=4)
+        ray.init(num_cpus=4, object_store_memory=1000 * 1024 * 1024)
         local = _MockWorker()
         remotes = ray.remote(_MockWorker)
         remote_workers = [remotes.remote() for i in range(5)]
@@ -41,7 +41,7 @@ def tearDown(self):
         ray.shutdown()
 
     def testPPOSampleWaste(self):
-        ray.init(num_cpus=4)
+        ray.init(num_cpus=4, object_store_memory=1000 * 1024 * 1024)
 
         # Check we at least collect the initial wave of samples
         ppo = PPOTrainer(
@@ -101,7 +101,7 @@ def tearDownClass(cls):
 
     @classmethod
     def setUpClass(cls):
-        ray.init(num_cpus=8)
+        ray.init(num_cpus=8, object_store_memory=1000 * 1024 * 1024)
 
     def testSimple(self):
         local, remotes = self._make_envs()