Skip to content

Commit

Permalink
[serve] Avoid spamming error logs when deployments are unhealthy (ray…
Browse files Browse the repository at this point in the history
…-project#41927)

Signed-off-by: Cindy Zhang <[email protected]>
  • Loading branch information
zcin authored Dec 15, 2023
1 parent 706e8bf commit 0160297
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 4 deletions.
13 changes: 9 additions & 4 deletions python/ray/serve/_private/application_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,10 +735,15 @@ def list_deployment_details(self) -> Dict[str, DeploymentDetails]:
return {k: v for k, v in details.items() if v is not None}

def _update_status(self, status: ApplicationStatus, status_msg: str = "") -> None:
if status_msg and status in [
ApplicationStatus.DEPLOY_FAILED,
ApplicationStatus.UNHEALTHY,
]:
if (
status_msg
and status
in [
ApplicationStatus.DEPLOY_FAILED,
ApplicationStatus.UNHEALTHY,
]
and status_msg != self._status_msg
):
logger.warning(status_msg)

self._status = status
Expand Down
22 changes: 22 additions & 0 deletions python/ray/serve/tests/test_deploy_2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import functools
import os
import sys
import threading
import time
Expand All @@ -13,6 +14,11 @@
from ray._private.pydantic_compat import ValidationError
from ray._private.test_utils import SignalActor, wait_for_condition
from ray.serve._private.common import ApplicationStatus
from ray.serve._private.logging_utils import (
get_component_log_file_name,
get_serve_logs_dir,
)
from ray.util.state import list_actors


@pytest.mark.parametrize("prefixes", [[None, "/f", None], ["/f", None, "/f"]])
Expand Down Expand Up @@ -221,6 +227,22 @@ def check_health(self):
assert serve.status().applications["app"].status == ApplicationStatus.UNHEALTHY
time.sleep(0.1)

# At least 10 control loop iterations should have passed. Check that
# the application state manager's log message about unhealthy
# deployments is not spammed; it should be printed at most once.
controller_pid = [
actor["pid"]
for actor in list_actors()
if actor["name"] == "SERVE_CONTROLLER_ACTOR"
][0]
controller_log_file_name = get_component_log_file_name(
"controller", controller_pid, component_type=None, suffix=".log"
)
controller_log_path = os.path.join(get_serve_logs_dir(), controller_log_file_name)
with open(controller_log_path, "r") as f:
s = f.read()
assert s.count("The deployments ['Model'] are UNHEALTHY.") <= 1


@pytest.mark.skipif(
sys.platform == "win32", reason="Runtime env support experimental on windows"
Expand Down

0 comments on commit 0160297

Please sign in to comment.