Skip to content

Commit

Permalink
Add ability to specify worker and driver ports (ray-project#8071)
Browse files Browse the repository at this point in the history
  • Loading branch information
edoakes authored May 20, 2020
1 parent d765787 commit a76434c
Show file tree
Hide file tree
Showing 25 changed files with 408 additions and 143 deletions.
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ matrix:
- PYTHONWARNINGS=ignore
- RAY_DEFAULT_BUILD=1
- RAY_CYTHON_EXAMPLES=1
- RAY_USE_RANDOM_PORTS=1
install:
- . ./ci/travis/ci.sh init RAY_CI_SERVE_AFFECTED,RAY_CI_TUNE_AFFECTED,RAY_CI_PYTHON_AFFECTED
before_script:
Expand All @@ -37,6 +38,7 @@ matrix:
- PYTHONWARNINGS=ignore
- RAY_DEFAULT_BUILD=1
- RAY_CYTHON_EXAMPLES=1
- RAY_USE_RANDOM_PORTS=1
install:
- . ./ci/travis/ci.sh init RAY_CI_SERVE_AFFECTED,RAY_CI_TUNE_AFFECTED,RAY_CI_PYTHON_AFFECTED
before_script:
Expand All @@ -62,6 +64,7 @@ matrix:
- RAY_INSTALL_JAVA=1
- RAY_GCS_ACTOR_SERVICE_ENABLED=true
- PYTHON=3.6 PYTHONWARNINGS=ignore
- RAY_USE_RANDOM_PORTS=1
install:
- . ./ci/travis/ci.sh init RAY_CI_STREAMING_PYTHON_AFFECTED,RAY_CI_STREAMING_JAVA_AFFECTED
before_script:
Expand Down Expand Up @@ -96,6 +99,7 @@ matrix:
- RAY_INSTALL_JAVA=1
- RAY_GCS_SERVICE_ENABLED=false
- RAY_CYTHON_EXAMPLES=1
- RAY_USE_RANDOM_PORTS=1
install:
- . ./ci/travis/ci.sh init RAY_CI_ONLY_RLLIB_AFFECTED
before_script:
Expand Down
4 changes: 2 additions & 2 deletions ci/keep_alive
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ PID=$$

# Print output to avoid travis killing us
watchdog() {
for i in `seq 5 5 150`; do
sleep 300
for i in `seq 2 2 150`; do
sleep 120
echo "(running, ${i}m total)"
done
echo "TIMED OUT"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ public void setUp() {
"start",
"--head",
"--redis-port=6379",
"--min-worker-port=0",
"--max-worker-port=0",
String.format("--plasma-store-socket-name=%s", PLASMA_STORE_SOCKET_NAME),
String.format("--raylet-socket-name=%s", RAYLET_SOCKET_NAME),
String.format("--node-manager-port=%s", nodeManagerPort),
Expand Down
2 changes: 2 additions & 0 deletions python/ray/cluster_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ def add_node(self, **node_args):
"num_cpus": 1,
"num_gpus": 0,
"object_store_memory": 150 * 1024 * 1024, # 150 MiB
"min_worker_port": 0,
"max_worker_port": 0,
}
if "_internal_config" in node_args:
node_args["_internal_config"] = json.loads(
Expand Down
2 changes: 2 additions & 0 deletions python/ray/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,8 @@ def start_raylet(self, use_valgrind=False, use_profiler=False):
self._temp_dir,
self._session_dir,
self.get_resource_spec(),
self._ray_params.min_worker_port,
self._ray_params.max_worker_port,
self._ray_params.object_manager_port,
self._ray_params.redis_password,
use_valgrind=use_valgrind,
Expand Down
34 changes: 34 additions & 0 deletions python/ray/parameter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import os

import numpy as np

Expand Down Expand Up @@ -35,6 +36,10 @@ class RayParams:
node_ip_address (str): The IP address of the node that we are on.
raylet_ip_address (str): The IP address of the raylet that this node
connects to.
min_worker_port (int): The lowest port number that workers will bind
on. If not set or set to 0, random ports will be chosen.
max_worker_port (int): The highest port number that workers will bind
on. If set, min_worker_port must also be set.
object_id_seed (int): Used to seed the deterministic generation of
object IDs. The same value can be used across multiple runs of the
same job in order to generate the object IDs in a consistent
Expand Down Expand Up @@ -98,6 +103,8 @@ def __init__(self,
node_manager_port=None,
node_ip_address=None,
raylet_ip_address=None,
min_worker_port=None,
max_worker_port=None,
object_id_seed=None,
driver_mode=None,
redirect_worker_output=None,
Expand Down Expand Up @@ -135,6 +142,8 @@ def __init__(self,
self.node_manager_port = node_manager_port
self.node_ip_address = node_ip_address
self.raylet_ip_address = raylet_ip_address
self.min_worker_port = min_worker_port
self.max_worker_port = max_worker_port
self.driver_mode = driver_mode
self.redirect_worker_output = redirect_worker_output
self.redirect_output = redirect_output
Expand Down Expand Up @@ -189,6 +198,31 @@ def update_if_absent(self, **kwargs):
self._check_usage()

def _check_usage(self):
# Used primarily for testing.
if os.environ.get("RAY_USE_RANDOM_PORTS", False):
if self.min_worker_port is None and self.min_worker_port is None:
self.min_worker_port = 0
self.max_worker_port = 0

if self.min_worker_port is not None:
if self.min_worker_port != 0 and (self.min_worker_port < 1024
or self.min_worker_port > 65535):
raise ValueError("min_worker_port must be 0 or an integer "
"between 1024 and 65535.")

if self.max_worker_port is not None:
if self.min_worker_port is None:
raise ValueError("If max_worker_port is set, min_worker_port "
"must also be set.")
elif self.max_worker_port != 0:
if self.max_worker_port < 1024 or self.max_worker_port > 65535:
raise ValueError(
"max_worker_port must be 0 or an integer between "
"1024 and 65535.")
elif self.max_worker_port <= self.min_worker_port:
raise ValueError("max_worker_port must be higher than "
"min_worker_port.")

if self.resources is not None:
assert "CPU" not in self.resources, (
"'CPU' should not be included in the resource dictionary. Use "
Expand Down
25 changes: 21 additions & 4 deletions python/ray/scripts/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,20 @@ def dashboard(cluster_config_file, cluster_name, port):
required=False,
type=int,
help="the port to use for starting the node manager")
@click.option(
"--min-worker-port",
required=False,
type=int,
default=10000,
help="the lowest port number that workers will bind on. If not set, "
"random ports will be chosen.")
@click.option(
"--max-worker-port",
required=False,
type=int,
default=10999,
help="the highest port number that workers will bind on. If set, "
"'--min-worker-port' must also be set.")
@click.option(
"--memory",
required=False,
Expand Down Expand Up @@ -289,10 +303,11 @@ def dashboard(cluster_config_file, cluster_name, port):
help="Specify whether load code from local file or GCS serialization.")
def start(node_ip_address, redis_address, address, redis_port, port,
num_redis_shards, redis_max_clients, redis_password,
redis_shard_ports, object_manager_port, node_manager_port, memory,
object_store_memory, redis_max_memory, num_cpus, num_gpus, resources,
head, include_webui, webui_host, block, plasma_directory, huge_pages,
autoscaling_config, no_redirect_worker_output, no_redirect_output,
redis_shard_ports, object_manager_port, node_manager_port,
min_worker_port, max_worker_port, memory, object_store_memory,
redis_max_memory, num_cpus, num_gpus, resources, head, include_webui,
webui_host, block, plasma_directory, huge_pages, autoscaling_config,
no_redirect_worker_output, no_redirect_output,
plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
java_worker_options, load_code_from_local, internal_config):
"""Start Ray processes manually on the local machine."""
Expand Down Expand Up @@ -327,6 +342,8 @@ def start(node_ip_address, redis_address, address, redis_port, port,
redirect_output = None if not no_redirect_output else True
ray_params = ray.parameter.RayParams(
node_ip_address=node_ip_address,
min_worker_port=min_worker_port,
max_worker_port=max_worker_port,
object_manager_port=object_manager_port,
node_manager_port=node_manager_port,
memory=memory,
Expand Down
14 changes: 14 additions & 0 deletions python/ray/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -1219,6 +1219,8 @@ def start_raylet(redis_address,
temp_dir,
session_dir,
resource_spec,
min_worker_port=None,
max_worker_port=None,
object_manager_port=None,
redis_password=None,
use_valgrind=False,
Expand Down Expand Up @@ -1247,6 +1249,10 @@ def start_raylet(redis_address,
resource_spec (ResourceSpec): Resources for this raylet.
object_manager_port: The port to use for the object manager. If this is
None, then the object manager will choose its own port.
min_worker_port (int): The lowest port number that workers will bind
on. If not set, random ports will be chosen.
max_worker_port (int): The highest port number that workers will bind
on. If set, min_worker_port must also be set.
redis_password: The password to use when connecting to Redis.
use_valgrind (bool): True if the raylet should be started inside
of valgrind. If this is True, use_profiler must be False.
Expand Down Expand Up @@ -1324,6 +1330,12 @@ def start_raylet(redis_address,
if object_manager_port is None:
object_manager_port = 0

if min_worker_port is None:
min_worker_port = 0

if max_worker_port is None:
max_worker_port = 0

if load_code_from_local:
start_worker_command += ["--load-code-from-local"]

Expand All @@ -1332,6 +1344,8 @@ def start_raylet(redis_address,
"--raylet_socket_name={}".format(raylet_name),
"--store_socket_name={}".format(plasma_store_name),
"--object_manager_port={}".format(object_manager_port),
"--min_worker_port={}".format(min_worker_port),
"--max_worker_port={}".format(max_worker_port),
"--node_manager_port={}".format(node_manager_port),
"--node_ip_address={}".format(node_ip_address),
"--redis_address={}".format(gcs_ip_address),
Expand Down
5 changes: 4 additions & 1 deletion python/ray/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,10 @@ def ray_start_object_store_memory(request):

@pytest.fixture
def call_ray_start(request):
parameter = getattr(request, "param", "ray start --head --num-cpus=1")
parameter = getattr(
request, "param",
"ray start --head --num-cpus=1 --min-worker-port=0 --max-worker-port=0"
)
command_args = parameter.split(" ")
out = ray.utils.decode(
subprocess.check_output(command_args, stderr=subprocess.STDOUT))
Expand Down
6 changes: 4 additions & 2 deletions python/ray/tests/test_dynres.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,10 @@ def delete_res(resource_name):
available_res = ray.available_resources()
cluster_res = ray.cluster_resources()

assert res_name not in available_res
assert res_name not in cluster_res
def check_resources():
    # Query the cluster on every poll. Snapshots taken once before waiting
    # never change, so a condition built on them could not become true
    # after the first evaluation.
    return (res_name not in ray.available_resources()
            and res_name not in ray.cluster_resources())

ray.test_utils.wait_for_condition(check_resources)


def test_dynamic_res_infeasible_rescheduling(ray_start_regular):
Expand Down
17 changes: 14 additions & 3 deletions python/ray/tests/test_multi_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,10 @@ def f():


@pytest.mark.parametrize(
"call_ray_start", ["ray start --head --num-cpus=1 --num-gpus=1"],
"call_ray_start", [
"ray start --head --num-cpus=1 --num-gpus=1 " +
"--min-worker-port=0 --max-worker-port=0"
],
indirect=True)
def test_drivers_release_resources(call_ray_start):
address = call_ray_start
Expand Down Expand Up @@ -334,6 +337,7 @@ def wait_for_success_output(process_handle, timeout=10):
print(output_line)
if output_line == "success":
return
time.sleep(1)
raise RayTestTimeoutException(
"Timed out waiting for process to print success.")

Expand Down Expand Up @@ -376,6 +380,13 @@ def test_calling_start_ray_head(call_ray_stop_only):
])
subprocess.check_output(["ray", "stop"])

# Test starting Ray with the worker port range specified.
subprocess.check_output([
"ray", "start", "--head", "--min-worker-port", "50000",
"--max-worker-port", "51000"
])
subprocess.check_output(["ray", "stop"])

# Test starting Ray with the number of CPUs specified.
subprocess.check_output(["ray", "start", "--head", "--num-cpus", "2"])
subprocess.check_output(["ray", "stop"])
Expand Down Expand Up @@ -419,7 +430,7 @@ def test_calling_start_ray_head(call_ray_stop_only):
assert blocked.returncode is None

kill_process_by_name("raylet")
wait_for_children_of_pid_to_exit(blocked.pid, timeout=120)
wait_for_children_of_pid_to_exit(blocked.pid, timeout=30)
blocked.wait()
assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit"

Expand All @@ -431,7 +442,7 @@ def test_calling_start_ray_head(call_ray_stop_only):
wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30)

blocked.terminate()
wait_for_children_of_pid_to_exit(blocked.pid, timeout=120)
wait_for_children_of_pid_to_exit(blocked.pid, timeout=30)
blocked.wait()
assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit"

Expand Down
28 changes: 19 additions & 9 deletions src/ray/core_worker/core_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,6 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_
client_call_manager_(new rpc::ClientCallManager(io_service_)),
death_check_timer_(io_service_),
internal_timer_(io_service_),
core_worker_server_(WorkerTypeString(options_.worker_type),
0 /* let grpc choose a port */),
task_queue_length_(0),
num_executed_tasks_(0),
task_execution_service_work_(task_execution_service_),
Expand All @@ -266,10 +264,6 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_
[this] { return local_raylet_client_->TaskDone(); }));
}

// Start RPC server after all the task receivers are properly initialized.
core_worker_server_.RegisterService(grpc_service_);
core_worker_server_.Run();

// Initialize raylet client.
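// NOTE(edoakes): the raylet may start sending RPC messages soon after registration,
// but the core_worker_server_ can only be started once the raylet has assigned this
// worker a port (see below); the listening port is then reported back to the raylet
// via AnnounceWorkerPort().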
Expand All @@ -280,21 +274,37 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_
auto grpc_client = rpc::NodeManagerWorkerClient::make(
options_.raylet_ip_address, options_.node_manager_port, *client_call_manager_);
ClientID local_raylet_id;
int assigned_port;
std::unordered_map<std::string, std::string> internal_config;
local_raylet_client_ = std::shared_ptr<raylet::RayletClient>(new raylet::RayletClient(
io_service_, std::move(grpc_client), options_.raylet_socket, GetWorkerID(),
(options_.worker_type == ray::WorkerType::WORKER),
worker_context_.GetCurrentJobID(), options_.language, &local_raylet_id,
&internal_config, options_.node_ip_address, core_worker_server_.GetPort()));
worker_context_.GetCurrentJobID(), options_.language, options_.node_ip_address,
&local_raylet_id, &assigned_port, &internal_config));
connected_ = true;

RAY_CHECK(assigned_port != -1)
<< "Failed to allocate a port for the worker. Please specify a wider port range "
"using the '--min-worker-port' and '--max-worker-port' arguments to 'ray "
"start'.";

// NOTE(edoakes): any initialization depending on RayConfig must happen after this line.
RayConfig::instance().initialize(internal_config);

// Start RPC server after all the task receivers are properly initialized and we have
// our assigned port from the raylet.
core_worker_server_ = std::unique_ptr<rpc::GrpcServer>(
new rpc::GrpcServer(WorkerTypeString(options_.worker_type), assigned_port));
core_worker_server_->RegisterService(grpc_service_);
core_worker_server_->Run();

// Tell the raylet the port that we are listening on.
RAY_CHECK_OK(local_raylet_client_->AnnounceWorkerPort(core_worker_server_->GetPort()));

// Set our own address.
RAY_CHECK(!local_raylet_id.IsNil());
rpc_address_.set_ip_address(options_.node_ip_address);
rpc_address_.set_port(core_worker_server_.GetPort());
rpc_address_.set_port(core_worker_server_->GetPort());
rpc_address_.set_raylet_id(local_raylet_id.Binary());
rpc_address_.set_worker_id(worker_context_.GetWorkerID().Binary());
RAY_LOG(INFO) << "Initializing worker at address: " << rpc_address_.ip_address() << ":"
Expand Down
2 changes: 1 addition & 1 deletion src/ray/core_worker/core_worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -946,7 +946,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler {
boost::asio::steady_timer internal_timer_;

/// RPC server used to receive tasks to execute.
rpc::GrpcServer core_worker_server_;
std::unique_ptr<rpc::GrpcServer> core_worker_server_;

/// Address of our RPC server.
rpc::Address rpc_address_;
Expand Down
2 changes: 2 additions & 0 deletions src/ray/core_worker/test/core_worker_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ class CoreWorkerTest : public RedisServiceManagerForTest {
.append(" --node_ip_address=" + node_ip_address)
.append(" --redis_address=" + redis_address)
.append(" --redis_port=6379")
.append(" --min-worker-port=0")
.append(" --max-worker-port=0")
.append(" --num_initial_workers=1")
.append(" --maximum_startup_concurrency=10")
.append(" --static_resource_list=" + resource)
Expand Down
Loading

0 comments on commit a76434c

Please sign in to comment.