Skip to content

Commit

Permalink
Fix bug in cluster mode where driver exits when there are tasks in th…
Browse files Browse the repository at this point in the history
…e waiting queue (ray-project#4251)
  • Loading branch information
stephanie-wang authored and robertnishihara committed Mar 20, 2019
1 parent 8ce7565 commit 4ac9c1e
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
30 changes: 29 additions & 1 deletion python/ray/tests/test_multi_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import time

import ray
from ray.utils import _random_string
from ray.tests.utils import (run_and_get_output, run_string_as_driver,
run_string_as_driver_nonblocking)

Expand Down Expand Up @@ -409,7 +410,8 @@ def test_driver_exiting_when_worker_blocked(call_ray_start):

ray.init(redis_address=redis_address)

# Define a driver that creates an actor and exits.
# Define a driver that creates two tasks, one that runs forever and the
# other blocked on the first.
driver_script = """
import time
import ray
Expand All @@ -432,6 +434,32 @@ def g():
# Make sure the first driver ran to completion.
assert "success" in out

nonexistent_id_bytes = _random_string()
nonexistent_id_hex = ray.utils.binary_to_hex(nonexistent_id_bytes)
# Define a driver that creates one task that depends on a nonexistent
# object. This task will be queued as waiting to execute.
driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def g(x):
return
g.remote(ray.ObjectID(ray.utils.hex_to_binary("{}")))
time.sleep(1)
print("success")
""".format(redis_address, nonexistent_id_hex)

# Create some drivers and let them exit and make sure everything is
# still alive.
for _ in range(3):
out = run_string_as_driver(driver_script)
# Simulate the nonexistent dependency becoming available.
ray.worker.global_worker.put_object(
ray.ObjectID(nonexistent_id_bytes), None)
# Make sure the first driver ran to completion.
assert "success" in out

@ray.remote
def f():
return 1
Expand Down
5 changes: 3 additions & 2 deletions src/ray/raylet/node_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -592,9 +592,10 @@ void NodeManager::HandleActorStateTransition(const ActorID &actor_id,

void NodeManager::CleanUpTasksForDeadDriver(const DriverID &driver_id) {
auto tasks_to_remove = local_queues_.GetTaskIdsForDriver(driver_id);
local_queues_.RemoveTasks(tasks_to_remove);

task_dependency_manager_.RemoveTasksAndRelatedObjects(tasks_to_remove);
// NOTE(swang): SchedulingQueue::RemoveTasks modifies its argument so we must
// call it last.
local_queues_.RemoveTasks(tasks_to_remove);
}

void NodeManager::ProcessNewClient(LocalClientConnection &client) {
Expand Down

0 comments on commit 4ac9c1e

Please sign in to comment.