Skip to content

Commit

Permalink
[serve] autoscaling release test deflake and debugging (ray-project#4…
Browse files Browse the repository at this point in the history
…4747)

Deflake the release test. The logs show that the HTTP request to fetch the image sometimes fails with a connection-reset error; on those rare occasions we can simply return early.
We are also seeing some 504 errors but need more visibility into them, so this change also adds better tracking for failed requests.

Signed-off-by: Cindy Zhang <[email protected]>
  • Loading branch information
zcin authored Apr 16, 2024
1 parent f80d4d3 commit c5b0bab
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 8 deletions.
21 changes: 13 additions & 8 deletions release/serve_tests/workloads/locust_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from dataclasses import dataclass
from dataclasses import asdict, dataclass
from itertools import chain
import json
import logging
Expand All @@ -7,6 +7,7 @@
from typing import Any, Dict, List

import ray
from ray.serve._private.utils import generate_request_id


logger = logging.getLogger(__file__)
Expand Down Expand Up @@ -72,14 +73,16 @@ class EndpointUser(FastHttpUser):

@task
def test(self):
headers = {"Authorization": f"Bearer {token}"} if token else None
request_id = generate_request_id()
headers = (
{"Authorization": f"Bearer {token}", "X-Request-ID": request_id}
if token
else None
)
with self.client.get(
"", headers=headers, json=data, catch_response=True
) as r:
if r.status_code == 200:
r.request_meta["context"]["request_id"] = r.headers[
"x-request-id"
]
r.request_meta["context"]["request_id"] = request_id

@events.request.add_listener
def on_request(
Expand All @@ -92,6 +95,7 @@ def on_request(
):
if exception:
request_id = context["request_id"]
response.encoding = "utf-8"
err = FailedRequest(
request_id=request_id,
status_code=response.status_code,
Expand Down Expand Up @@ -267,12 +271,13 @@ def run_locust_load_test(config: LocustLoadTestConfig) -> LocustTestResults:

# Collect results and metrics
stats: LocustTestResults = ray.get(master_ref)
errors = sorted(chain(*ray.get(worker_refs)), key=lambda e: e.start_time)
errors = sorted(chain(*ray.get(worker_refs)), key=lambda e: e.start_time_s)

# If there were any requests that failed, raise error.
if stats.num_failures > 0:
errors_json = [asdict(err) for err in errors]
raise RuntimeError(
f"There were failed requests: {json.dumps(errors, indent=4)}"
f"There were failed requests: {json.dumps(errors_json, indent=4)}"
)

return stats
8 changes: 8 additions & 0 deletions release/serve_tests/workloads/resnet_50.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,16 @@ def __init__(self):

async def __call__(self, request: starlette.requests.Request) -> str:
uri = (await request.json())["uri"]

try:
image_bytes = requests.get(uri).content
except (
requests.exceptions.ConnectionError,
requests.exceptions.ChunkedEncodingError,
):
return

try:
image = Image.open(BytesIO(image_bytes)).convert("RGB")
except PIL.UnidentifiedImageError:
return
Expand Down

0 comments on commit c5b0bab

Please sign in to comment.