forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[autoscaler] Experimental support for local / on-prem clusters (ray-project#2678)
This adds some experimental (undocumented) support for launching Ray on existing nodes. You have to provide the head IP and the list of worker IPs. A couple of additional utilities are also added for rsyncing files and for port forwarding.
- Loading branch information
Showing
10 changed files
with
339 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from __future__ import absolute_import | ||
from __future__ import division | ||
from __future__ import print_function | ||
|
||
|
||
def bootstrap_local(config):
    """Return the cluster config unchanged.

    The function performs no work on ``config``; it hands back the very
    same object it was given.

    Args:
        config: The cluster configuration.

    Returns:
        The same ``config`` object that was passed in.
    """
    return config
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Example autoscaler configuration for a local / on-prem cluster.
# Replace every YOUR_* placeholder before use.
cluster_name: default

# Bounds on the number of worker nodes.
min_workers: 0
max_workers: 0

# Docker settings; both fields are left empty in this example.
docker:
    image: ""
    container_name: ""

target_utilization_fraction: 0.8
idle_timeout_minutes: 5

# The "local" provider works on existing machines: give it the head node
# and the list of worker nodes.
provider:
    type: local
    head_ip: YOUR_HEAD_NODE_HOSTNAME
    worker_ips: []

# SSH credentials used to reach the nodes.
auth:
    ssh_user: YOUR_USERNAME
    ssh_private_key: ~/.ssh/id_rsa

# No per-node configuration is given for the local provider.
head_node: {}
worker_nodes: {}

# The commit hash of the chosen branch is made available at /tmp/ray_sha,
# which the setup commands below read to check out that exact revision.
file_mounts:
    "/tmp/ray_sha": "/YOUR/LOCAL/RAY/REPO/.git/refs/heads/YOUR_BRANCH"

head_setup_commands: []
worker_setup_commands: []

# NOTE: `setup_commands` must appear only once; a second declaration of the
# same key is a duplicate mapping key.
setup_commands:
    - source activate ray && test -e ray || git clone https://github.com/YOUR_GITHUB/ray.git
    - source activate ray && cd ray && git fetch && git reset --hard `cat /tmp/ray_sha`
    # - source activate ray && cd ray/python && pip install -e .

head_start_ray_commands:
    - source activate ray && ray stop
    - source activate ray && ulimit -c unlimited && ray start --head --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml

worker_start_ray_commands:
    - source activate ray && ray stop
    - source activate ray && ray start --redis-address=$RAY_HEAD_IP:6379
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
from __future__ import absolute_import | ||
from __future__ import division | ||
from __future__ import print_function | ||
|
||
from filelock import FileLock | ||
import json | ||
import os | ||
import socket | ||
|
||
from ray.autoscaler.node_provider import NodeProvider | ||
from ray.autoscaler.tags import TAG_RAY_NODE_TYPE | ||
|
||
|
||
class ClusterState(object):
    """File-backed table of the nodes that make up a local cluster.

    The table is a JSON object stored at ``save_path``. It maps each node
    IP to ``{"tags": {...}, "state": ...}``. Reads and writes performed by
    this class are done while holding ``file_lock``.
    """

    def __init__(self, lock_path, save_path, provider_config):
        """Load any saved table and reconcile it with the provider config.

        Nodes named in the config but missing from the saved table are
        added in the ``"terminated"`` state, and the resulting table is
        written back to ``save_path``.

        Args:
            lock_path: Path of the lock file guarding ``save_path``.
            save_path: Path of the JSON file holding the node table.
            provider_config: Mapping with a ``"head_ip"`` entry and a
                ``"worker_ips"`` list.

        Raises:
            AssertionError: If a saved node has a different node type than
                the config gives it, or if the table ends up with a number
                of nodes other than ``len(worker_ips) + 1``.
        """
        self.file_lock = FileLock(lock_path)
        self.save_path = save_path

        with self.file_lock:
            if os.path.exists(self.save_path):
                workers = self._read()
            else:
                workers = {}
            print("Loaded cluster state", workers)
            for worker_ip in provider_config["worker_ips"]:
                self._register(workers, worker_ip, "worker")
            self._register(workers, provider_config["head_ip"], "head")
            # Stale entries (IPs no longer in the config) are rejected here.
            assert len(workers) == len(provider_config["worker_ips"]) + 1
            self._write(workers)

    @staticmethod
    def _register(workers, node_ip, node_type):
        """Add ``node_ip`` to ``workers`` or check its recorded node type."""
        if node_ip not in workers:
            workers[node_ip] = {
                "tags": {
                    TAG_RAY_NODE_TYPE: node_type
                },
                "state": "terminated",
            }
        else:
            assert workers[node_ip]["tags"][TAG_RAY_NODE_TYPE] == node_type

    def _read(self):
        """Parse and return the saved table; the caller holds the lock."""
        # The context manager closes the handle promptly instead of leaving
        # it to the garbage collector.
        with open(self.save_path) as f:
            return json.loads(f.read())

    def _write(self, workers):
        """Overwrite the saved table; the caller holds the lock."""
        with open(self.save_path, "w") as f:
            print("Writing cluster state", workers)
            f.write(json.dumps(workers))

    def get(self):
        """Return the whole node table as a dict keyed by node IP."""
        with self.file_lock:
            return self._read()

    def put(self, worker_id, info):
        """Store ``info`` as the record for ``worker_id`` and save the table.

        Args:
            worker_id: Key of the node to insert or replace.
            info: Node record; must contain ``"tags"`` and ``"state"``.

        Raises:
            AssertionError: If ``info`` lacks ``"tags"`` or ``"state"``.
        """
        assert "tags" in info
        assert "state" in info
        with self.file_lock:
            # Re-read under the lock so other nodes' latest records are
            # kept; only this node's record is replaced.
            workers = self._read()
            workers[worker_id] = info
            self._write(workers)
|
||
|
||
class LocalNodeProvider(NodeProvider):
    """Node provider backed by a fixed set of already-existing machines.

    Node ids are the keys of the ``ClusterState`` table (the configured
    head and worker IPs). No machine is actually started or stopped:
    "creating" a node flips a ``"terminated"`` record to ``"running"``, and
    terminating a node flips it back. The table is kept in
    ``/tmp/cluster-<cluster_name>.state``, guarded by
    ``/tmp/cluster-<cluster_name>.lock``.
    """

    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.state = ClusterState("/tmp/cluster-{}.lock".format(cluster_name),
                                  "/tmp/cluster-{}.state".format(cluster_name),
                                  provider_config)

    def nodes(self, tag_filters):
        """Return ids of non-terminated nodes whose tags match every filter.

        Args:
            tag_filters: Mapping of tag name to required tag value.
        """
        workers = self.state.get()
        return [
            worker_ip for worker_ip, info in workers.items()
            if info["state"] != "terminated" and all(
                info["tags"].get(k) == v for k, v in tag_filters.items())
        ]

    def is_running(self, node_id):
        """Return True if the node's recorded state is ``"running"``."""
        return self.state.get()[node_id]["state"] == "running"

    def is_terminated(self, node_id):
        """Return True if the node is in any state other than running."""
        return not self.is_running(node_id)

    def node_tags(self, node_id):
        """Return the tag dict recorded for the node."""
        return self.state.get()[node_id]["tags"]

    def external_ip(self, node_id):
        """Resolve the node id to an IPv4 address string."""
        return socket.gethostbyname(node_id)

    def internal_ip(self, node_id):
        """Resolve the node id to an IPv4 address string."""
        return socket.gethostbyname(node_id)

    def set_node_tags(self, node_id, tags):
        """Merge ``tags`` into the node's existing tags and save them."""
        # Hold the lock across the read and the write so a concurrent
        # update cannot be lost in between.
        with self.state.file_lock:
            info = self.state.get()[node_id]
            info["tags"].update(tags)
            self.state.put(node_id, info)

    def create_node(self, node_config, tags, count):
        """Mark up to ``count`` terminated nodes of the tagged type running.

        Only nodes whose recorded node type equals ``tags[TAG_RAY_NODE_TYPE]``
        are eligible. Each chosen node has its tags replaced by ``tags``.
        If fewer than ``count`` eligible nodes exist, all of them are
        started and no error is raised.

        Args:
            node_config: Unused by this provider.
            tags: Tags for the new nodes; must include TAG_RAY_NODE_TYPE.
            count: Number of nodes requested.
        """
        node_type = tags[TAG_RAY_NODE_TYPE]
        remaining = count
        with self.state.file_lock:
            workers = self.state.get()
            for node_id, info in workers.items():
                if remaining <= 0:
                    break
                if (info["state"] == "terminated"
                        and info["tags"][TAG_RAY_NODE_TYPE] == node_type):
                    info["tags"] = tags
                    info["state"] = "running"
                    self.state.put(node_id, info)
                    remaining -= 1

    def terminate_node(self, node_id):
        """Mark the node as ``"terminated"`` in the saved table."""
        # Same locking as set_node_tags: without it, a tag update landing
        # between the read and the write would be overwritten.
        with self.state.file_lock:
            info = self.state.get()[node_id]
            info["state"] = "terminated"
            self.state.put(node_id, info)
Oops, something went wrong.