[kuberay] Test Ray client and update autoscaler image (ray-project#24195)

This PR adds KubeRay e2e testing for the Ray client and updates the suggested autoscaler image to one built from the merge commit of PR ray-project#23883.
DmitriGekhtman authored Apr 28, 2022
1 parent cc86440 commit d68c1ec
Showing 10 changed files with 258 additions and 85 deletions.
7 changes: 7 additions & 0 deletions .buildkite/pipeline.yml
@@ -391,6 +391,13 @@
conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- echo "--- Setting up Python 3.7 environment."
- PYTHON=3.7 ./ci/env/install-dependencies.sh
# Specifying PYTHON=3.7 above somehow messes up the Ray install.
# Uninstall and re-install Ray so that we can use Ray Client.
# (Remove thirdparty_files to sidestep an issue with psutil.)
- pip uninstall -y ray && rm -rf /ray/python/ray/thirdparty_files
- pip install -e /ray/python
- echo "--- Setting up local kind cluster."
- ./ci/k8s/prep-k8s-environment.sh
- echo "--- Building py37-cpu Ray image for the test."
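The uninstall/reinstall step above matters because the Ray Client tests must import the locally built Ray package. A minimal, illustrative sanity check (not part of the pipeline itself) that the editable install took effect:

```shell
# Illustrative only: confirm the editable install is the one on sys.path
# and that the Ray Client entry point imports cleanly.
python -c "import ray; print(ray.__version__, ray.__file__)"
python -c "import ray.util.client; print('Ray Client import OK')"
```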
8 changes: 7 additions & 1 deletion doc/source/cluster/kuberay.md
@@ -57,6 +57,12 @@ ray.init("auto")
ray.autoscaler.sdk.request_resources(num_cpus=4)
```

> **_NOTE:_** The example config ray-cluster.complete.yaml specifies rayproject/ray:8c5fe4
> as the Ray autoscaler image. This image carries the latest improvements to KubeRay autoscaling
> support. This autoscaler image is confirmed to be compatible with Ray versions >= 1.11.0.
> Once Ray autoscaler support is stable, the recommended pattern will be to use the same
> Ray version in the autoscaler and Ray containers.
## Uninstalling the KubeRay operator

You can uninstall the KubeRay operator using
@@ -83,7 +89,7 @@ Here is one procedure to test development autoscaler code.
```dockerfile
# Use the latest Ray master as base.
FROM rayproject/ray:nightly
# Invalidate cache so that fresh code is pulled in the next step.
# Invalidate the cache so that fresh code is pulled in the next step.
ARG BUILD_DATE
# Retrieve your development code.
RUN git clone -b <my-dev-branch> https://github.com/<my-git-handle>/ray
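For context, the BUILD_DATE argument in the Dockerfile above exists only to bust the Docker layer cache so the clone step re-runs on every build. A typical, illustrative build-and-push invocation (the registry, repository, and tag below are placeholders):

```shell
# Placeholders: substitute your own registry/repository and tag.
docker build --build-arg BUILD_DATE="$(date)" -t <registry>/<repo>:dev-autoscaler .
docker push <registry>/<repo>:dev-autoscaler
```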
34 changes: 23 additions & 11 deletions python/ray/autoscaler/kuberay/ray-cluster.complete.yaml
@@ -8,7 +8,7 @@ metadata:
# A unique identifier for the head node and workers of this cluster.
name: raycluster-complete
spec:
rayVersion: '1.11.0'
rayVersion: '1.12.0'
enableInTreeAutoscaling: false
######################headGroupSpecs#################################
# head group template and specs, (perhaps 'group' is not needed in the name)
@@ -58,8 +58,18 @@ spec:
containers:
# The Ray head pod
- name: ray-head
image: rayproject/ray:1.11.0
image: rayproject/ray:1.12.0
imagePullPolicy: Always
# The KubeRay operator uses the ports specified on the ray-head container
# to configure a service targeting the ports.
# The name of the service is <ray cluster name>-head-svc.
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
env:
- name: CPU_REQUEST
valueFrom:
@@ -85,8 +95,6 @@
valueFrom:
fieldRef:
fieldPath: status.podIP
ports:
- containerPort: 6379
lifecycle:
preStop:
exec:
@@ -103,8 +111,12 @@
name: ray-logs
# The Ray autoscaler sidecar to the head pod
- name: autoscaler
# TODO: Use released Ray version starting with Ray 1.12.0.
image: rayproject/ray:413fe0
# The autoscaler image used carries the latest improvements to KubeRay autoscaling
# support.
# It is confirmed (via kuberay/test_autoscaling_e2e.py) to be compatible with all
# Ray versions since Ray 1.11.0.
# TODO: Use released Ray version when autoscaling support is stable.
image: rayproject/ray:8c5fe4
imagePullPolicy: Always
env:
- name: RAY_CLUSTER_NAMESPACE
@@ -178,7 +190,7 @@ spec:
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
containers:
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray:1.11.0
# environment variables to set in the container. Optional.
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
@@ -190,22 +202,22 @@
- name: CPU_REQUEST
valueFrom:
resourceFieldRef:
containerName: machine-learning
containerName: ray-worker
resource: requests.cpu
- name: CPU_LIMITS
valueFrom:
resourceFieldRef:
containerName: machine-learning
containerName: ray-worker
resource: limits.cpu
- name: MEMORY_LIMITS
valueFrom:
resourceFieldRef:
containerName: machine-learning
containerName: ray-worker
resource: limits.memory
- name: MEMORY_REQUESTS
valueFrom:
resourceFieldRef:
containerName: machine-learning
containerName: ray-worker
resource: requests.memory
- name: MY_POD_NAME
valueFrom:
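The client port added to the head container above is what the new Ray Client test exercises: the KubeRay operator exposes it through the raycluster-complete-head-svc service. A minimal sketch of connecting from outside the cluster, assuming a local port-forward to port 10001 (the local port choice is arbitrary):

```python
# Minimal sketch: connect to the RayCluster defined above via Ray Client.
# Assumes a port-forward is already running, e.g.:
#   kubectl port-forward service/raycluster-complete-head-svc 10001:10001
import ray

ray.init("ray://127.0.0.1:10001", namespace="gpu-test")
print(ray.cluster_resources())
ray.shutdown()
```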
20 changes: 13 additions & 7 deletions python/ray/tests/kuberay/scripts/gpu_actor_placement.py
@@ -1,12 +1,18 @@
import ray


@ray.remote(num_gpus=1, num_cpus=1)
class GPUActor:
def where_am_i(self):
assert len(ray.get_gpu_ids()) == 1
return "on-a-gpu-node"
def main():
"""Requests placement of a GPU actor."""

@ray.remote(num_gpus=1, num_cpus=1)
class GPUActor:
def where_am_i(self):
assert len(ray.get_gpu_ids()) == 1
return "on-a-gpu-node"

ray.init("auto", namespace="gpu-test")
GPUActor.options(name="gpu_actor", lifetime="detached").remote()
GPUActor.options(name="gpu_actor", lifetime="detached").remote()


if __name__ == "__main__":
ray.init("auto", namespace="gpu-test")
main()
16 changes: 12 additions & 4 deletions python/ray/tests/kuberay/scripts/gpu_actor_validation.py
@@ -1,6 +1,14 @@
import ray

ray.init("auto", namespace="gpu-test")
gpu_actor = ray.get_actor("gpu_actor")
actor_response = ray.get(gpu_actor.where_am_i.remote())
print(actor_response)

def main():
"""Confirms placement of a GPU actor."""
gpu_actor = ray.get_actor("gpu_actor")
actor_response = ray.get(gpu_actor.where_am_i.remote())
return actor_response


if __name__ == "__main__":
ray.init("auto", namespace="gpu-test")
out = main()
print(out)
15 changes: 11 additions & 4 deletions python/ray/tests/kuberay/scripts/scale_down.py
@@ -1,6 +1,13 @@
import ray

ray.init("auto", namespace="gpu-test")
ray.autoscaler.sdk.request_resources(num_cpus=0)
gpu_actor = ray.get_actor("gpu_actor")
ray.kill(gpu_actor)

def main():
"""Removes CPU request, removes GPU actor."""
ray.autoscaler.sdk.request_resources(num_cpus=0)
gpu_actor = ray.get_actor("gpu_actor")
ray.kill(gpu_actor)


if __name__ == "__main__":
ray.init("auto", namespace="gpu-test")
main()
11 changes: 9 additions & 2 deletions python/ray/tests/kuberay/scripts/scale_up.py
@@ -1,4 +1,11 @@
import ray

ray.init("auto")
ray.autoscaler.sdk.request_resources(num_cpus=2)

def main():
"""Submits CPU request."""
ray.autoscaler.sdk.request_resources(num_cpus=2)


if __name__ == "__main__":
ray.init("auto")
main()
21 changes: 14 additions & 7 deletions python/ray/tests/kuberay/scripts/scale_up_custom.py
@@ -1,9 +1,16 @@
import ray

ray.init("auto")
# Workers and head are annotated as having 5 "Custom2" capacity each,
# so this should trigger upscaling of two workers.
# (One of the bundles will be "placed" on the head.)
ray.autoscaler.sdk.request_resources(
bundles=[{"Custom2": 3}, {"Custom2": 3}, {"Custom2": 3}]
)

def main():
"""Submits custom resource request."""
# Workers and head are annotated as having 5 "Custom2" capacity each,
# so this should trigger upscaling of two workers.
# (One of the bundles will be "placed" on the head.)
ray.autoscaler.sdk.request_resources(
bundles=[{"Custom2": 3}, {"Custom2": 3}, {"Custom2": 3}]
)


if __name__ == "__main__":
ray.init("auto")
main()
96 changes: 51 additions & 45 deletions python/ray/tests/kuberay/test_autoscaling_e2e.py
@@ -14,13 +14,17 @@
get_pod,
get_pod_names,
get_raycluster,
ray_client_port_forward,
kubectl_exec_python_script,
wait_for_pods,
wait_for_pod_to_start,
wait_for_ray_health,
wait_for_crd,
)


from ray.tests.kuberay.scripts import gpu_actor_placement, gpu_actor_validation

logger = logging.getLogger(__name__)
logging.basicConfig(
level=logging.INFO,
@@ -29,10 +33,13 @@

# This image will be used for both the Ray nodes and the autoscaler.
# The CI should pass an image built from the test branch.
RAY_IMAGE = os.environ.get("RAY_IMAGE", "rayproject/ray:413fe0")
RAY_IMAGE = os.environ.get("RAY_IMAGE", "rayproject/ray:8c5fe4")
# By default, use the same image for the autoscaler and Ray containers.
AUTOSCALER_IMAGE = os.environ.get("AUTOSCALER_IMAGE", RAY_IMAGE)
# Set to IfNotPresent in kind CI.
PULL_POLICY = os.environ.get("PULL_POLICY", "Always")
logger.info(f"Using image `{RAY_IMAGE}` for autoscaler and Ray nodes.")
logger.info(f"Using image `{RAY_IMAGE}` for Ray containers.")
logger.info(f"Using image `{AUTOSCALER_IMAGE}` for Autoscaler containers.")
logger.info(f"Using pull policy `{PULL_POLICY}` for all images.")
# The default "rayproject/ray:413fe0" is the currently pinned autoscaler image
# (to be replaced with rayproject/ray:1.12.0 upon 1.12.0 release).
@@ -84,44 +91,21 @@ def setUp(self):
logger.info("Making sure RayCluster CRD has been registered.")
wait_for_crd("rayclusters.ray.io")

def _get_ray_cr_config_file(self) -> str:
"""Formats a RayCluster CR based on the example in the Ray documentation.
- Replaces Ray node and autoscaler images in example CR with the test image.
- Set image pull policies to IfNotPresent.
- Writes modified CR to temp file.
- Returns temp file's name.
"""
# Set Ray and autoscaler images.
with open(EXAMPLE_CLUSTER_PATH) as example_cluster_file:
ray_cr_config_str = example_cluster_file.read()
ray_images = [
word for word in ray_cr_config_str.split() if "rayproject/ray:" in word
]
for ray_image in ray_images:
ray_cr_config_str = ray_cr_config_str.replace(ray_image, RAY_IMAGE)

# CI should set pull policies to IfNotPresent to ensure no issues using a local
# test image on kind.
ray_cr_config_str = ray_cr_config_str.replace("Always", PULL_POLICY)

raycluster_cr_file = tempfile.NamedTemporaryFile(delete=False)
raycluster_cr_file.write(ray_cr_config_str.encode())
raycluster_cr_file.close()
return raycluster_cr_file.name

def _get_ray_cr_config(
self, min_replicas=0, max_replicas=300, replicas=0
) -> Dict[str, Any]:
"""Get Ray CR config yaml.
Use configurable replica fields for a CPU workerGroup.
- Use configurable replica fields for a CPU workerGroup.
- Add a GPU-annotated group for testing GPU upscaling.
Also add a GPU-annotated group for testing GPU upscaling.
- Fill in Ray image, autoscaler image, and image pull policies from env
variables.
"""
with open(self._get_ray_cr_config_file()) as ray_config_file:
ray_config_str = ray_config_file.read()
config = yaml.safe_load(ray_config_str)
with open(EXAMPLE_CLUSTER_PATH) as ray_cr_config_file:
ray_cr_config_str = ray_cr_config_file.read()
config = yaml.safe_load(ray_cr_config_str)
cpu_group = config["spec"]["workerGroupSpecs"][0]
cpu_group["replicas"] = replicas
cpu_group["minReplicas"] = min_replicas
@@ -138,6 +122,31 @@ def _get_ray_cr_config(
gpu_group["groupName"] = "fake-gpu-group"
config["spec"]["workerGroupSpecs"].append(gpu_group)

# Substitute images.
for group_spec in config["spec"]["workerGroupSpecs"] + [
config["spec"]["headGroupSpec"]
]:
containers = group_spec["template"]["spec"]["containers"]

ray_container = containers[0]
# Confirm the first container in the example config is the Ray container.
assert ray_container["name"] in ["ray-head", "ray-worker"]

ray_container["image"] = RAY_IMAGE

for container in containers:
container["imagePullPolicy"] = PULL_POLICY

head_containers = config["spec"]["headGroupSpec"]["template"]["spec"][
"containers"
]
autoscaler_container = [
container
for container in head_containers
if container["name"] == "autoscaler"
].pop()
autoscaler_container["image"] = AUTOSCALER_IMAGE

return config

def _apply_ray_cr(
@@ -254,12 +263,11 @@ def testAutoscaling(self):
)
# 2. Trigger GPU upscaling by requesting placement of a GPU actor.
logger.info("Scheduling an Actor with GPU demands.")
kubectl_exec_python_script(
script_name="gpu_actor_placement.py",
pod=head_pod,
container="ray-head",
namespace="default",
)
# Use Ray client to validate that it works against KubeRay.
with ray_client_port_forward(
head_service="raycluster-complete-head-svc", ray_namespace="gpu-test"
):
gpu_actor_placement.main()
# 3. Confirm new pod number and presence of fake GPU worker.
logger.info("Confirming fake GPU worker up-scaling.")
wait_for_pods(goal_num_pods=4, namespace="default")
@@ -272,12 +280,10 @@
# 4. Confirm that the GPU actor is up and that Ray believes
# the node the actor is on has a GPU.
logger.info("Confirming GPU actor placement.")
out = kubectl_exec_python_script(
script_name="gpu_actor_validation.py",
pod=head_pod,
container="ray-head",
namespace="default",
)
with ray_client_port_forward(
head_service="raycluster-complete-head-svc", ray_namespace="gpu-test"
):
out = gpu_actor_validation.main()
# Confirms the actor was placed on a GPU-annotated node.
# (See gpu_actor_validation.py for details.)
assert "on-a-gpu-node" in out
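The ray_client_port_forward helper imported above is defined in the KubeRay test utilities, which are not shown in this diff. Purely as an illustration of the pattern (not the repository's actual implementation), such a context manager could be sketched as follows; the fixed local port and the sleep-based readiness wait are assumptions:

```python
# Illustrative sketch only; the real helper lives in the KubeRay test utilities.
import contextlib
import subprocess
import time

import ray


@contextlib.contextmanager
def sketch_ray_client_port_forward(head_service: str, ray_namespace: str):
    """Port-forward the head service's Ray Client port and connect via Ray Client."""
    # Assumes the Ray Client server listens on 10001, as in ray-cluster.complete.yaml.
    proc = subprocess.Popen(
        ["kubectl", "port-forward", f"service/{head_service}", "10001:10001"]
    )
    try:
        time.sleep(5)  # Crude wait for the port-forward to become ready.
        ray.init("ray://127.0.0.1:10001", namespace=ray_namespace)
        yield
    finally:
        ray.shutdown()
        proc.terminate()
```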