[kuberay] Logging-related autoscaler stability improvement.

The autoscaler container writes logs to a directory that is set up by the Ray container. This PR moves the autoscaler's logging setup so that it runs only after the Ray container is ready. This PR also makes the autoscaler process exit after hitting 5 total exceptions, at which point Kubernetes restarts the autoscaler container. The goal is to ensure that the autoscaler can restart cleanly in long-running deployments of Ray.
srinathk10 · Jun 29, 2022 · 66ea76d · 66ea76d
1 parent 1f9282a
commit 66ea76d
Showing 1 changed file with 12 additions and 4 deletions.
diff --git a/python/ray/autoscaler/_private/kuberay/run_autoscaler.py b/python/ray/autoscaler/_private/kuberay/run_autoscaler.py
@@ -17,19 +17,23 @@
 
 def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
     """Wait until the Ray head container is ready. Then start the autoscaler."""
-    _setup_logging()
     head_ip = get_node_ip_address()
     ray_address = f"{head_ip}:6379"
     while True:
         try:
             subprocess.check_call(["ray", "health-check", "--address", ray_address])
-            logger.info("The Ray head is ready. Starting the autoscaler.")
+            # Logging is not ready yet. Print to stdout for now.
+            print("The Ray head is ready. Starting the autoscaler.")
             break
         except subprocess.CalledProcessError:
-            logger.warning("The Ray head is not yet ready.")
-            logger.warning(f"Will check again in {BACKOFF_S} seconds.")
+            print("The Ray head is not yet ready.")
+            print(f"Will check again in {BACKOFF_S} seconds.")
             time.sleep(BACKOFF_S)
 
+    # The Ray head container sets up the log directory. Thus, we set up logging
+    # only after the Ray head is ready.
+    _setup_logging()
+
     # autoscaling_config_producer reads the RayCluster CR from K8s and uses the CR
     # to output an autoscaling config.
     autoscaling_config_producer = AutoscalingConfigProducer(
@@ -42,6 +46,10 @@ def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
         # In this case, it's a callable.
         autoscaling_config=autoscaling_config_producer,
         monitor_ip=head_ip,
+        # Let the autoscaler process exit after it hits 5 exceptions.
+        # (See ray.autoscaler._private.constants.AUTOSCALER_MAX_NUM_FAILURES.)
+        # Kubernetes will then restart the autoscaler container.
+        retry_on_failure=False,
     ).run()