
Commit

[autoscaler] Experimental support for local / on-prem clusters (ray-project#2678)

This adds experimental (undocumented) support for launching Ray on existing nodes. You have to provide the head IP and the list of worker IPs.

A couple of additional utilities are also added for rsyncing files and port forwarding.
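For context, a local cluster is described entirely by its provider section; a minimal sketch of what the autoscaler sees after the YAML is parsed (the hostnames below are placeholders, not part of this change):

# Illustrative only -- the provider block for a local / on-prem cluster,
# matching the new "head_ip" / "worker_ips" schema fields added in this commit.
example_provider = {
    "type": "local",
    "head_ip": "192.0.2.10",                     # placeholder head node
    "worker_ips": ["192.0.2.11", "192.0.2.12"],  # placeholder workers
}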
ericl authored Aug 19, 2018
1 parent 62d0698 commit 9473da6
Showing 10 changed files with 339 additions and 37 deletions.
14 changes: 12 additions & 2 deletions doc/source/autoscaling.rst
@@ -89,11 +89,21 @@ You can use ``ray attach`` to attach to an interactive console on the cluster.
Port-forwarding applications
----------------------------

To connect to applications running on the cluster (e.g. Jupyter notebook) using a web browser, you can forward the port to your local machine using SSH:
To connect to applications running on the cluster (e.g. Jupyter notebook) using a web browser, you can use the port-forward option for ``ray exec``. The local port opened is the same as the remote port:

.. code-block:: bash
$ ssh -L 8899:localhost:8899 -i <key> <user>@<addr> 'source ~/anaconda3/bin/activate tensorflow_p36 && jupyter notebook --port=8899'
$ ray exec cluster.yaml --port-forward=8899 'source ~/anaconda3/bin/activate tensorflow_p36 && jupyter notebook --port=8899'
Manually synchronizing files
----------------------------

To download or upload files to the cluster head node, use ``ray rsync_down`` or ``ray rsync_up``:

.. code-block:: bash
$ ray rsync_down cluster.yaml '/path/on/cluster' '/local/path'
$ ray rsync_up cluster.yaml '/local/path' '/path/on/cluster'
Updating your cluster
---------------------
2 changes: 2 additions & 0 deletions python/ray/autoscaler/autoscaler.py
@@ -65,6 +65,8 @@
"module": (str,
OPTIONAL), # module, if using external node provider
"project_id": (None, OPTIONAL), # gcp project id, if using gcp
"head_ip": (str, OPTIONAL), # local cluster head node
"worker_ips": (list, OPTIONAL), # local cluster worker nodes
},
REQUIRED),

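The schema entries above are (type, required) pairs; a rough sketch of how such a table can be checked, assuming validation only verifies presence and type (the real validate_config may do more):

def check_section(section, schema):
    # Hypothetical checker for a {key: (type_or_None, required)} schema table.
    for key, (expected_type, required) in schema.items():
        if key not in section:
            if required:
                raise ValueError("missing required key: {}".format(key))
            continue
        if expected_type is not None and not isinstance(
                section[key], expected_type):
            raise ValueError("wrong type for key: {}".format(key))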
87 changes: 69 additions & 18 deletions python/ray/autoscaler/commands.py
@@ -38,19 +38,20 @@ def create_or_update_cluster(config_file, override_min_workers,
if override_cluster_name is not None:
config["cluster_name"] = override_cluster_name
config = _bootstrap_config(config)
get_or_create_head_node(config, config_file, no_restart, restart_only, yes)
get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
override_cluster_name)


def _bootstrap_config(config):
config = fillout_defaults(config)

hasher = hashlib.sha1()
hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
cache_key = os.path.join(tempfile.gettempdir(),
"ray-config-{}".format(hasher.hexdigest()))
if os.path.exists(cache_key):
print("Cached settings:", cache_key)
return json.loads(open(cache_key).read())
validate_config(config)
config = fillout_defaults(config)

importer = NODE_PROVIDERS.get(config["provider"]["type"])
if not importer:
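The cache key computed above is just a SHA-1 of the config's canonical JSON form, stored under the system temp directory; the same derivation as a standalone sketch:

import hashlib
import json
import os
import tempfile

def config_cache_path(config):
    # Mirrors _bootstrap_config: hash the sorted JSON dump of the config.
    digest = hashlib.sha1(
        json.dumps([config], sort_keys=True).encode("utf-8")).hexdigest()
    return os.path.join(tempfile.gettempdir(), "ray-config-{}".format(digest))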
@@ -91,8 +92,8 @@ def teardown_cluster(config_file, yes, workers_only, override_cluster_name):
nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})


def get_or_create_head_node(config, config_file, no_restart, restart_only,
yes):
def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
override_cluster_name):
"""Create the cluster head node, which in turn creates the workers."""

provider = get_node_provider(config["provider"], config["cluster_name"])
@@ -192,10 +193,16 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only,
and "--autoscaling-config" in s):
monitor_str = "docker exec {} /bin/sh -c {}".format(
config["docker"]["container_name"], quote(monitor_str))
if override_cluster_name:
modifiers = " --cluster-name={}".format(quote(override_cluster_name))
else:
modifiers = ""
print("To monitor auto-scaling activity, you can run:\n\n"
" ray exec {} {} --cluster-name={}\n".format(
config_file, quote(monitor_str), quote(config["cluster_name"])))
print("To login to the cluster, run:\n\n"
" ray exec {} {}{}\n".format(config_file, quote(monitor_str),
modifiers))
print("To open a console on the cluster:\n\n"
" ray attach {}{}\n".format(config_file, modifiers))
print("To ssh manually to the cluster, run:\n\n"
" ssh -i {} {}@{}\n".format(config["auth"]["ssh_private_key"],
config["auth"]["ssh_user"],
provider.external_ip(head_node)))
@@ -211,10 +218,11 @@ def attach_cluster(config_file, start, override_cluster_name):
"""

exec_cluster(config_file, "screen -L -xRR", False, False, start,
override_cluster_name)
override_cluster_name, None)


def exec_cluster(config_file, cmd, screen, stop, start, override_cluster_name):
def exec_cluster(config_file, cmd, screen, stop, start, override_cluster_name,
port_forward):
"""Runs a command on the specified cluster.
Arguments:
@@ -224,13 +232,15 @@ def exec_cluster(config_file, cmd, screen, stop, start, override_cluster_name):
stop: whether to stop the cluster after command run
start: whether to start the cluster if it isn't up
override_cluster_name: set the name of the cluster
port_forward: port to forward
"""

config = yaml.load(open(config_file).read())
if override_cluster_name is not None:
config["cluster_name"] = override_cluster_name
config = _bootstrap_config(config)
head_node = _get_head_node(config, config_file, create_if_needed=start)
head_node = _get_head_node(
config, config_file, override_cluster_name, create_if_needed=start)
updater = NodeUpdaterProcess(
head_node,
config["provider"],
@@ -242,10 +252,10 @@ def exec_cluster(config_file, cmd, screen, stop, start, override_cluster_name):
if stop:
cmd += ("; ray stop; ray teardown ~/ray_bootstrap_config.yaml --yes "
"--workers-only; sudo shutdown -h now")
_exec(updater, cmd, screen, expect_error=stop)
_exec(updater, cmd, screen, expect_error=stop, port_forward=port_forward)


def _exec(updater, cmd, screen, expect_error=False):
def _exec(updater, cmd, screen, expect_error=False, port_forward=None):
if cmd:
if screen:
cmd = [
@@ -254,7 +264,43 @@ def _exec(updater, cmd, screen, expect_error=False):
]
cmd = " ".join(cmd)
updater.ssh_cmd(
cmd, verbose=True, allocate_tty=True, expect_error=expect_error)
cmd,
verbose=False,
allocate_tty=True,
expect_error=expect_error,
port_forward=port_forward)
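Given the documentation change above (the local port opened matches the remote port), the forwarding presumably reduces to an ssh -L mapping of the same port on both ends. A hypothetical helper, not part of this diff, just to make the mapping concrete:

def port_forward_args(port):
    # Hypothetical: build the ssh arguments for forwarding local <port>
    # to the same port on the head node (local port == remote port).
    if port is None:
        return []
    return ["-L", "{}:localhost:{}".format(port, port)]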


def rsync(config_file, source, target, override_cluster_name, down):
"""Rsyncs files.
Arguments:
config_file: path to the cluster yaml
source: source dir
target: target dir
override_cluster_name: set the name of the cluster
down: whether we're syncing remote -> local
"""

config = yaml.load(open(config_file).read())
if override_cluster_name is not None:
config["cluster_name"] = override_cluster_name
config = _bootstrap_config(config)
head_node = _get_head_node(
config, config_file, override_cluster_name, create_if_needed=False)
updater = NodeUpdaterProcess(
head_node,
config["provider"],
config["auth"],
config["cluster_name"],
config["file_mounts"], [],
"",
redirect_output=False)
if down:
rsync = updater.rsync_down
else:
rsync = updater.rsync_up
rsync(source, target, check_error=False)
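Tying this back to the documented commands, ``ray rsync_down cluster.yaml '/path/on/cluster' '/local/path'`` corresponds to a call like the following sketch (argument order follows the signature above):

# down=True syncs remote -> local, i.e. `ray rsync_down`;
# down=False syncs local -> remote, i.e. `ray rsync_up`.
rsync("cluster.yaml", "/path/on/cluster", "/local/path",
      override_cluster_name=None, down=True)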


def get_head_node_ip(config_file, override_cluster_name):
@@ -264,11 +310,14 @@ def get_head_node_ip(config_file, override_cluster_name):
if override_cluster_name is not None:
config["cluster_name"] = override_cluster_name
provider = get_node_provider(config["provider"], config["cluster_name"])
head_node = _get_head_node(config, config_file)
head_node = _get_head_node(config, config_file, override_cluster_name)
return provider.external_ip(head_node)


def _get_head_node(config, config_file, create_if_needed=False):
def _get_head_node(config,
config_file,
override_cluster_name,
create_if_needed=False):
provider = get_node_provider(config["provider"], config["cluster_name"])
head_node_tags = {
TAG_RAY_NODE_TYPE: "head",
@@ -283,8 +332,10 @@ def _get_head_node(config, config_file, create_if_needed=False):
config_file,
restart_only=False,
no_restart=False,
yes=True)
return _get_head_node(config, config_file, create_if_needed=False)
yes=True,
override_cluster_name=override_cluster_name)
return _get_head_node(
config, config_file, override_cluster_name, create_if_needed=False)
else:
print("Head node of cluster ({}) not found!".format(
config["cluster_name"]))
Empty file.
7 changes: 7 additions & 0 deletions python/ray/autoscaler/local/config.py
@@ -0,0 +1,7 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


def bootstrap_local(config):
return config
32 changes: 32 additions & 0 deletions python/ray/autoscaler/local/example-full.yaml
@@ -0,0 +1,32 @@
cluster_name: default
min_workers: 0
max_workers: 0
docker:
image: ""
container_name: ""
target_utilization_fraction: 0.8
idle_timeout_minutes: 5
provider:
type: local
head_ip: YOUR_HEAD_NODE_HOSTNAME
worker_ips: []
auth:
ssh_user: YOUR_USERNAME
ssh_private_key: ~/.ssh/id_rsa
head_node: {}
worker_nodes: {}
file_mounts:
"/tmp/ray_sha": "/YOUR/LOCAL/RAY/REPO/.git/refs/heads/YOUR_BRANCH"
setup_commands: []
head_setup_commands: []
worker_setup_commands: []
setup_commands:
- source activate ray && test -e ray || git clone https://github.com/YOUR_GITHUB/ray.git
- source activate ray && cd ray && git fetch && git reset --hard `cat /tmp/ray_sha`
# - source activate ray && cd ray/python && pip install -e .
head_start_ray_commands:
- source activate ray && ray stop
- source activate ray && ulimit -c unlimited && ray start --head --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml
worker_start_ray_commands:
- source activate ray && ray stop
- source activate ray && ray start --redis-address=$RAY_HEAD_IP:6379
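For reference, this is the same YAML that commands.py loads with yaml.load; a quick sketch of reading the provider fields back, assuming the file is saved locally as example-full.yaml:

import yaml

with open("example-full.yaml") as f:
    config = yaml.load(f)  # PyYAML, as used in commands.py
print(config["provider"]["head_ip"])     # -> YOUR_HEAD_NODE_HOSTNAME
print(config["provider"]["worker_ips"])  # -> []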
126 changes: 126 additions & 0 deletions python/ray/autoscaler/local/node_provider.py
@@ -0,0 +1,126 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from filelock import FileLock
import json
import os
import socket

from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import TAG_RAY_NODE_TYPE


class ClusterState(object):
def __init__(self, lock_path, save_path, provider_config):
self.file_lock = FileLock(lock_path)
self.save_path = save_path

with self.file_lock:
if os.path.exists(self.save_path):
workers = json.loads(open(self.save_path).read())
else:
workers = {}
print("Loaded cluster state", workers)
for worker_ip in provider_config["worker_ips"]:
if worker_ip not in workers:
workers[worker_ip] = {
"tags": {
TAG_RAY_NODE_TYPE: "worker"
},
"state": "terminated",
}
else:
assert workers[worker_ip]["tags"][
TAG_RAY_NODE_TYPE] == "worker"
if provider_config["head_ip"] not in workers:
workers[provider_config["head_ip"]] = {
"tags": {
TAG_RAY_NODE_TYPE: "head"
},
"state": "terminated",
}
else:
assert workers[provider_config["head_ip"]]["tags"][
TAG_RAY_NODE_TYPE] == "head"
assert len(workers) == len(provider_config["worker_ips"]) + 1
with open(self.save_path, "w") as f:
print("Writing cluster state", workers)
f.write(json.dumps(workers))

def get(self):
with self.file_lock:
workers = json.loads(open(self.save_path).read())
return workers

def put(self, worker_id, info):
assert "tags" in info
assert "state" in info
with self.file_lock:
workers = self.get()
workers[worker_id] = info
with open(self.save_path, "w") as f:
print("Writing cluster state", workers)
f.write(json.dumps(workers))
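The state file written here is a plain JSON map from node IP to its tags and lifecycle state; for a one-worker cluster it would look roughly like this (placeholder IPs, head already started):

# Approximate contents of /tmp/cluster-<name>.state, expressed as the
# equivalent Python dict (TAG_RAY_NODE_TYPE is the tag key used above).
example_state = {
    "192.0.2.10": {"tags": {TAG_RAY_NODE_TYPE: "head"}, "state": "running"},
    "192.0.2.11": {"tags": {TAG_RAY_NODE_TYPE: "worker"}, "state": "terminated"},
}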


class LocalNodeProvider(NodeProvider):
def __init__(self, provider_config, cluster_name):
NodeProvider.__init__(self, provider_config, cluster_name)
self.state = ClusterState("/tmp/cluster-{}.lock".format(cluster_name),
"/tmp/cluster-{}.state".format(cluster_name),
provider_config)

def nodes(self, tag_filters):
workers = self.state.get()
matching_ips = []
for worker_ip, info in workers.items():
if info["state"] == "terminated":
continue
ok = True
for k, v in tag_filters.items():
if info["tags"].get(k) != v:
ok = False
break
if ok:
matching_ips.append(worker_ip)
return matching_ips

def is_running(self, node_id):
return self.state.get()[node_id]["state"] == "running"

def is_terminated(self, node_id):
return not self.is_running(node_id)

def node_tags(self, node_id):
return self.state.get()[node_id]["tags"]

def external_ip(self, node_id):
return socket.gethostbyname(node_id)

def internal_ip(self, node_id):
return socket.gethostbyname(node_id)

def set_node_tags(self, node_id, tags):
with self.state.file_lock:
info = self.state.get()[node_id]
info["tags"].update(tags)
self.state.put(node_id, info)

def create_node(self, node_config, tags, count):
node_type = tags[TAG_RAY_NODE_TYPE]
with self.state.file_lock:
workers = self.state.get()
for node_id, info in workers.items():
if (info["state"] == "terminated"
and info["tags"][TAG_RAY_NODE_TYPE] == node_type):
info["tags"] = tags
info["state"] = "running"
self.state.put(node_id, info)
return

def terminate_node(self, node_id):
workers = self.state.get()
info = workers[node_id]
info["state"] = "terminated"
self.state.put(node_id, info)
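A short usage sketch of the provider above, run in the context of this module with a placeholder two-node config (not part of the commit):

provider = LocalNodeProvider(
    {"head_ip": "192.0.2.10", "worker_ips": ["192.0.2.11"]}, "demo")

# create_node flips a terminated entry of the requested type to "running".
provider.create_node(node_config={}, tags={TAG_RAY_NODE_TYPE: "head"}, count=1)
print(provider.nodes({TAG_RAY_NODE_TYPE: "head"}))   # -> ["192.0.2.10"]
print(provider.is_running("192.0.2.10"))             # -> True

provider.terminate_node("192.0.2.10")
print(provider.is_terminated("192.0.2.10"))          # -> True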
