[train] enable new persistence mode for core and serve tests (ray-project#38938)

Signed-off-by: Matthew Deng <[email protected]>
lmco · Aug 26, 2023 · 4dac931 · 4dac931
1 parent cd3d7b6
commit 4dac931
Show file tree

Hide file tree

Showing 10 changed files with 39 additions and 27 deletions.
diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml
@@ -95,7 +95,6 @@
       --test_env=DOCKER_CERT_PATH=/certs/client
       --test_env=DOCKER_TLS_CERTDIR=/certs
       --test_env=RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=0
-      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=0
       $(cat test_shard.txt)
 
 - label: ":serverless: Serve Tests (streaming and routing FFs off)"
@@ -130,7 +129,6 @@
       --test_env=DOCKER_TLS_CERTDIR=/certs
       --test_env=RAY_SERVE_ENABLE_NEW_ROUTING=0
       --test_env=RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=0
-      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=0
       $(cat test_shard.txt)
 
 - label: ":python: Minimal install Python {{matrix}}"
@@ -213,7 +211,6 @@
       --test_env=CONDA_SHLVL
       --test_env=CONDA_PREFIX
       --test_env=CONDA_DEFAULT_ENV
-      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=0
       python/ray/tests/...
 
 - label: ":book: Doctest (CPU)"

diff --git a/.buildkite/pipeline.build_py37.yml b/.buildkite/pipeline.build_py37.yml
@@ -141,5 +141,4 @@
       --test_env=DOCKER_TLS_VERIFY=1
       --test_env=DOCKER_CERT_PATH=/certs/client
       --test_env=DOCKER_TLS_CERTDIR=/certs
-      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=0
       $(cat test_shard.txt)
diff --git a/.buildkite/pipeline.build_redis.yml b/.buildkite/pipeline.build_redis.yml
@@ -22,7 +22,6 @@
     - DL=1 ./ci/env/install-dependencies.sh
     - ./ci/env/env_info.sh
     - ./ci/ci.sh test_large --test_env=TEST_EXTERNAL_REDIS=1
-      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=0
 
 - label: ":redis: (External Redis) (Medium A-J)"
   conditions: ["RAY_CI_PYTHON_AFFECTED"]

diff --git a/.buildkite/pipeline.gpu.yml b/.buildkite/pipeline.gpu.yml
@@ -43,7 +43,6 @@
     - ./ci/env/install-dependencies.sh
     - pip install -Ur ./python/requirements/ml/dl-gpu-requirements.txt
     - bazel test --config=ci $(./ci/run/bazel_export_options) --test_tag_filters=gpu 
-      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=0
       python/ray/serve/...
 
 # Todo: enable once tests pass

diff --git a/ci/ray_ci/container.py b/ci/ray_ci/container.py
@@ -59,8 +59,6 @@ def _run_tests_in_docker(test_targets: List[str], team: str) -> subprocess.Popen
         )
     commands.append(
         "bazel test --config=ci "
-        # TODO(matthewdeng): Remove this env var as part of #38570.
-        "--test_env=RAY_AIR_NEW_PERSISTENCE_MODE=0 "
         "$(./ci/run/bazel_export_options) "
         f"{' '.join(test_targets)}",
     )

diff --git a/python/ray/serve/tests/test_air_integrations.py b/python/ray/serve/tests/test_air_integrations.py
@@ -1,6 +1,6 @@
 import os
 import tempfile
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 
 import numpy as np
 import pandas as pd
@@ -10,6 +10,7 @@
 from fastapi import Depends, FastAPI
 
 import ray
+import ray.cloudpickle as ray_pickle
 from ray import serve
 from ray.train import Checkpoint
 from ray.serve.air_integrations import _BatchingManager
@@ -128,6 +129,22 @@ def test_unpack_dataframe(self, batched_df, expected):
         )
 
 
+def create_dict_checkpoint(
+    data: Dict[str, Any], directory: Optional[str] = None
+) -> Checkpoint:
+    if not directory:
+        directory = tempfile.mkdtemp()
+    with open(os.path.join(directory, "data.pkl"), "wb") as f:
+        ray_pickle.dump(data, f)
+    return Checkpoint.from_directory(directory)
+
+
+def load_dict_checkpoint(checkpoint: Checkpoint) -> Dict[str, Any]:
+    with checkpoint.as_directory() as checkpoint_dir:
+        with open(os.path.join(checkpoint_dir, "data.pkl"), "rb") as f:
+            return ray_pickle.load(f)
+
+
 class AdderPredictor(Predictor):
     def __init__(self, increment: int, do_double: bool) -> None:
         self.increment = increment
@@ -137,7 +154,7 @@ def __init__(self, increment: int, do_double: bool) -> None:
     def from_checkpoint(
         cls, checkpoint: Checkpoint, do_double: bool = False
     ) -> "AdderPredictor":
-        return cls(checkpoint.to_dict()["increment"], do_double)
+        return cls(load_dict_checkpoint(checkpoint)["increment"], do_double)
 
     def predict(
         self, data: np.ndarray, override_increment: Optional[int] = None
@@ -170,7 +187,7 @@ async def __call__(self, request: Request):
             return self.predictor.predict(np.array(data["array"]))
 
     AdderDeployment.options(name="Adder").deploy(
-        checkpoint=Checkpoint.from_dict({"increment": 2}),
+        checkpoint=create_dict_checkpoint({"increment": 2}),
     )
     resp = ray.get(send_request.remote(json={"array": [40]}))
     assert resp == [{"value": 42, "batch_size": 1}]
@@ -189,7 +206,7 @@ async def __call__(self, request: Request):
             )
 
     AdderDeployment.options(name="Adder").deploy(
-        checkpoint=Checkpoint.from_dict({"increment": 2}),
+        checkpoint=create_dict_checkpoint({"increment": 2}),
     )
 
     resp = ray.get(send_request.remote(json={"array": [40]}))
@@ -207,7 +224,7 @@ async def __call__(self, request: Request):
             return self.predictor.predict(np.array(data["array"]))
 
     AdderDeployment.options(name="Adder").deploy(
-        checkpoint=Checkpoint.from_dict({"increment": 2}),
+        checkpoint=create_dict_checkpoint({"increment": 2}),
     )
     resp = ray.get(send_request.remote(json={"array": [40]}))
     assert resp == [{"value": 84, "batch_size": 1}]
@@ -226,7 +243,7 @@ async def __call__(self, requests: List[Request]):
             return self.predictor.predict(batch)
 
     AdderDeployment.options(name="Adder").deploy(
-        checkpoint=Checkpoint.from_dict({"increment": 2}),
+        checkpoint=create_dict_checkpoint({"increment": 2}),
     )
 
     refs = [send_request.remote(json={"array": [40]}) for _ in range(2)]
@@ -250,8 +267,7 @@ async def predict(self, data=Depends(json_to_ndarray)):
 
 def test_air_integrations_in_pipeline(serve_instance):
     path = tempfile.mkdtemp()
-    uri = f"file://{path}/test_uri"
-    Checkpoint.from_dict({"increment": 2}).to_uri(uri)
+    create_dict_checkpoint({"increment": 2}, path)
 
     @serve.deployment
     class AdderDeployment:
@@ -263,7 +279,7 @@ async def __call__(self, data):
 
     with InputNode() as dag_input:
         m1 = AdderDeployment.bind(
-            checkpoint=Checkpoint.from_uri(uri),
+            checkpoint=Checkpoint.from_directory(path),
         )
         dag = m1.__call__.bind(dag_input)
     deployments = build(Ingress.bind(dag), "")
@@ -278,8 +294,7 @@ async def __call__(self, data):
 
 def test_air_integrations_reconfigure(serve_instance):
     path = tempfile.mkdtemp()
-    uri = f"file://{path}/test_uri"
-    Checkpoint.from_dict({"increment": 2}).to_uri(uri)
+    create_dict_checkpoint({"increment": 2}, path)
 
     @serve.deployment
     class AdderDeployment:
@@ -288,7 +303,7 @@ def __init__(self, checkpoint: Checkpoint):
 
         def reconfigure(self, config):
             self.predictor = AdderPredictor.from_checkpoint(
-                Checkpoint.from_dict(config["checkpoint"])
+                create_dict_checkpoint(config["checkpoint"])
             )
 
         async def __call__(self, data):
@@ -300,7 +315,7 @@ async def __call__(self, data):
 
     with InputNode() as dag_input:
         m1 = AdderDeployment.options(user_config=additional_config).bind(
-            checkpoint=Checkpoint.from_uri(uri),
+            checkpoint=Checkpoint.from_directory(path),
         )
         dag = m1.__call__.bind(dag_input)
     deployments = build(Ingress.bind(dag), "")

diff --git a/python/ray/serve/tests/test_air_integrations_gpu.py b/python/ray/serve/tests/test_air_integrations_gpu.py
@@ -38,9 +38,14 @@ def __init__(self, checkpoint):
         async def __call__(self, data):
             return self.predictor.predict(data)
 
+    import tempfile
+
+    tmpdir = tempfile.mkdtemp()
+    checkpoint = Checkpoint.from_directory(tmpdir)
+
     serve.run(
         DAGDriver.bind(
-            DummyGPUDeployment.options(name="GPU").bind(Checkpoint.from_dict({"x": 1})),
+            DummyGPUDeployment.options(name="GPU").bind(checkpoint),
             http_adapter=json_to_ndarray,
         )
     )

diff --git a/python/ray/tests/test_multi_node_3.py b/python/ray/tests/test_multi_node_3.py
@@ -277,14 +277,15 @@ def test_run_driver_twice(ray_start_regular):
     address_info = ray_start_regular
     driver_script = """
 import ray
+import ray.train
 import ray.tune as tune
 import os
 import time
 
-def train_func(config, reporter):  # add a reporter arg
+def train_func(config):
     for i in range(2):
         time.sleep(0.1)
-        reporter(timesteps_total=i, mean_accuracy=i+97)  # report metrics
+        ray.train.report(dict(timesteps_total=i, mean_accuracy=i+97))  # report metrics
 
 os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
 ray.init(address="{}", namespace="default_test_namespace")

diff --git a/python/ray/tests/test_task_events.py b/python/ray/tests/test_task_events.py
@@ -434,6 +434,7 @@ def test_parent_task_id_tune_e2e(shutdown_only):
     script = """
 import numpy as np
 import ray
+import ray.train
 from ray import tune
 import time
 
@@ -448,7 +449,7 @@ def train_function(config):
     for i in range(5):
         loss = config["mean"] * np.random.randn() + ray.get(
             train_step_1.remote())
-        tune.report(loss=loss, nodes=ray.nodes())
+        ray.train.report(dict(loss=loss, nodes=ray.nodes()))
 
 
 def tune_function():

diff --git a/python/ray/tests/test_usage_stats.py b/python/ray/tests/test_usage_stats.py
@@ -1206,9 +1206,7 @@ def run_usage_stats_server(reporter):
         if os.environ.get("RAY_MINIMAL") != "1":
             expected_payload["tune_scheduler"] = "FIFOScheduler"
             expected_payload["tune_searcher"] = "BasicVariantGenerator"
-            expected_payload["air_storage_configuration"] = "driver"
             expected_payload["air_entrypoint"] = "Tuner.fit"
-            expected_payload["air_env_vars"] = '["RAY_AIR_NEW_PERSISTENCE_MODE"]'
         assert payload["extra_usage_tags"] == expected_payload
         assert payload["total_num_nodes"] == 1
         assert payload["total_num_running_jobs"] == 1