Skip to content

Commit

Permalink
[core][dashboard] Feature flag task logs recording (ray-project#34056)
Browse files Browse the repository at this point in the history
This should address a few issues introduced by the original task log recording features:

[core] perf regression: 1_1_actor_calls_concurrent ray-project#33924
[core] perf regression: 1_1_actor_calls_async ray-project#33949
[Tests] Fix two skipped Windows test for test_task_event_2.py ray-project#33738
The roocasue with the regressions are:
With ray-project#32943, we are recording log file offsets before and after executing a task, which calls tell() on the file descriptor object for each worker.

The cost of that shows up when there are concurrent execution of tasks on a single worker.

I am turning this off by default for this release since the subsequent PRs are not merged yet. We will need to tackle or resolve the regression once we turn this feature on when we merge subsequent PRs. One idea is to make this "finding-out-offset-procedure" async, e.g. we try to locate the exact task id's log offset when we querying the task logs at querying time.
  • Loading branch information
rickyyx authored Apr 5, 2023
1 parent 0fdda69 commit cefefd5
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 1 deletion.
2 changes: 2 additions & 0 deletions python/ray/_private/ray_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,3 +422,5 @@ def gcs_actor_scheduling_enabled():
"dashboard_agent_listen_port",
"gcs_server_port", # the `port` option for gcs port.
}

RAY_ENABLE_RECORD_TASK_LOGGING = env_bool("RAY_ENABLE_RECORD_TASK_LOGGING", False)
7 changes: 7 additions & 0 deletions python/ray/_private/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,7 @@ def __init__(self):
self.ray_debugger_external = False
self._load_code_from_local = False
# Opened file descriptor to stdout/stderr for this python worker.
self._enable_record_task_log = ray_constants.RAY_ENABLE_RECORD_TASK_LOGGING
self._out_file = None
self._err_file = None
# Create the lock here because the serializer will use it before
Expand Down Expand Up @@ -539,6 +540,9 @@ def set_out_file(self, out_file=Optional[IO[AnyStr]]) -> None:

def record_task_log_start(self):
"""Record the task log info when task starts executing"""
if not self._enable_record_task_log:
return

self.core_worker.record_task_log_start(
self.get_out_file_path(),
self.get_err_file_path(),
Expand All @@ -548,6 +552,9 @@ def record_task_log_start(self):

def record_task_log_end(self):
"""Record the task log info when task finishes executing"""
if not self._enable_record_task_log:
return

self.core_worker.record_task_log_end(
self.get_current_out_offset(), self.get_current_err_offset()
)
Expand Down
11 changes: 10 additions & 1 deletion python/ray/tests/test_task_events_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,8 @@ def _read_file(filepath, start, end):


@pytest.mark.skipif(
sys.platform == "win32", reason="Failing on Windows. we should fix it asap"
not ray_constants.RAY_ENABLE_RECORD_TASK_LOGGING,
reason="Skipping if not recording task logs offsets.",
)
def test_task_logs_info_basic(shutdown_only):
"""Test tasks (normal tasks/actor tasks) execution logging
Expand Down Expand Up @@ -594,6 +595,10 @@ def verify():
wait_for_condition(verify)


@pytest.mark.skipif(
not ray_constants.RAY_ENABLE_RECORD_TASK_LOGGING,
reason="Skipping if not recording task logs offsets.",
)
def test_task_logs_info_disabled(shutdown_only, monkeypatch):
"""Test when redirect disabled, no task log info is available
due to missing log file
Expand All @@ -619,6 +624,10 @@ def verify():
wait_for_condition(verify)


@pytest.mark.skipif(
not ray_constants.RAY_ENABLE_RECORD_TASK_LOGGING,
reason="Skipping if not recording task logs offsets.",
)
def test_task_logs_info_running_task(shutdown_only):
ray.init(num_cpus=1)

Expand Down

0 comments on commit cefefd5

Please sign in to comment.