Commit 7153d58

Updates to azure autoscaler for authentication and dependency updates (ray-project#19603)

* updating azure autoscaler versions and backwards compatibility, and moving to azure-identity based authentication

* adding azure sdk requirements for tests

* updating azure test requirements and adding wrapper function for azure sdk function resolution

* adding docstring to get_azure_sdk_function

Co-authored-by: Scott Graham <[email protected]>
gramhagen and Scott Graham authored Dec 16, 2021
1 parent a85df8c commit 7153d58
Showing 10 changed files with 103 additions and 100 deletions.

doc/azure/azure-ray-template.json (5 additions, 5 deletions)

@@ -80,14 +80,14 @@
     },
     "condaEnv": {
       "type": "string",
-      "defaultValue": "py37_tensorflow",
+      "defaultValue": "py38_tensorflow",
       "allowedValues": [
         "azureml_py36_automl",
         "azureml_py36_pytorch",
         "azureml_py36_tensorflow",
-        "py37_default",
-        "py37_pytorch",
-        "py37_tensorflow"
+        "py38_default",
+        "py38_pytorch",
+        "py38_tensorflow"
       ],
       "metadata": {
         "description": "Conda environment to select (installed on DSVM)"
@@ -131,7 +131,7 @@
       "imagePublisher": "microsoft-dsvm",
       "imageOffer": "ubuntu-1804",
       "imageSku": "1804",
-      "imageVersion": "20.07.06"
+      "imageVersion": "latest"
     },
     "resources": [
       {

doc/source/cluster/cloud.rst (1 addition, 1 deletion)

@@ -83,7 +83,7 @@ Ray with cloud providers
         :alt: Deploy to Azure

 Once the template is successfully deployed the deployment Outputs page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input).
-Use the following code in a Jupyter notebook (using the conda environment specified in the template input, py37_tensorflow by default) to connect to the Ray cluster.
+Use the following code in a Jupyter notebook (using the conda environment specified in the template input, py38_tensorflow by default) to connect to the Ray cluster.

 .. code-block:: python
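
The notebook snippet itself is collapsed in this diff view. As a rough sketch of what such a connection cell typically looks like (the address argument below is an assumption for illustration, not the documented snippet):

    import ray

    # Attach to the cluster started by the autoscaler instead of launching
    # a new local Ray instance; "auto" is an assumed placeholder for
    # however the head node address is supplied in the real docs.
    ray.init(address="auto")
    print(ray.cluster_resources())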

docker/ray-deps/Dockerfile (6 additions, 5 deletions)

@@ -17,11 +17,12 @@ RUN $HOME/anaconda3/bin/pip --no-cache-dir install --find-links $FIND_LINKS_PATH
         "google-api-python-client==1.7.8" \
         "google-oauth" \
         "kubernetes" \
-        "azure-cli-core==2.22.0" \
-        "azure-mgmt-compute==14.0.0" \
-        "azure-mgmt-msi==1.0.0" \
-        "azure-mgmt-network==10.2.0" \
-        "azure-mgmt-resource==13.0.0"; fi) \
+        "azure-cli-core==2.29.1" \
+        "azure-identity==1.7.0" \
+        "azure-mgmt-compute==23.1.0" \
+        "azure-mgmt-network==19.0.0" \
+        "azure-mgmt-resource==20.0.0" \
+        "msrestazure==0.6.4"; fi) \
     $(if [ $($HOME/anaconda3/bin/python -c "import sys; print(sys.version_info.minor)") != 6 ] \
         && [ "$AUTOSCALER" = "autoscaler" ]; then echo "kopf"; fi) \
     && $HOME/anaconda3/bin/pip uninstall ray -y && sudo rm $(basename $WHEEL_PATH)
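
The pin changes swap azure-mgmt-msi out and bring in azure-identity and msrestazure. A hypothetical smoke test (not part of the commit) for an image built from this Dockerfile could confirm the new SDK surface imports cleanly:

    # Hypothetical sanity check: confirm the pinned Azure packages
    # installed in the image import correctly.
    import importlib

    for module in (
            "azure.identity",          # new in this commit
            "azure.mgmt.compute",
            "azure.mgmt.network",
            "azure.mgmt.resource",
            "msrestazure",
    ):
        importlib.import_module(module)
        print(f"{module}: ok")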

python/ray/autoscaler/_private/_azure/config.py (28 additions, 21 deletions)

@@ -2,9 +2,10 @@
 import logging
 from pathlib import Path
 import random
+from typing import Any, Callable

-from azure.common.client_factory import get_client_from_cli_profile
-from azure.common.credentials import get_azure_cli_credentials
+from azure.common.credentials import get_cli_profile
+from azure.identity import AzureCliCredential
 from azure.mgmt.resource import ResourceManagementClient
 from azure.mgmt.resource.resources.models import DeploymentMode

@@ -17,31 +18,39 @@
 logger = logging.getLogger(__name__)


+def get_azure_sdk_function(client: Any, function_name: str) -> Callable:
+    """Retrieve a callable function from Azure SDK client object.
+
+    Newer versions of the various client SDKs renamed function names to
+    have a begin_ prefix. This function supports both the old and new
+    versions of the SDK by first trying the old name and falling back to
+    the prefixed new name.
+    """
+    func = getattr(client, function_name,
+                   getattr(client, f"begin_{function_name}"))
+    if func is None:
+        raise AttributeError(
+            "'{obj}' object has no {func} or begin_{func} attribute".format(
+                obj={client.__name__}, func=function_name))
+    return func
+
+
 def bootstrap_azure(config):
     config = _configure_key_pair(config)
     config = _configure_resource_group(config)
     return config


-def _get_client(client_class, config):
-    kwargs = {}
-    if "subscription_id" in config["provider"]:
-        kwargs["subscription_id"] = config["provider"]["subscription_id"]
-
-    return get_client_from_cli_profile(client_class=client_class, **kwargs)
-
-
 def _configure_resource_group(config):
     # TODO: look at availability sets
     # https://docs.microsoft.com/en-us/azure/virtual-machines/windows/tutorial-availability-sets
-    resource_client = _get_client(ResourceManagementClient, config)
-
-    _, cli_subscription_id = get_azure_cli_credentials(
-        resource=ResourceManagementClient)
-    subscription_id = config["provider"].get("subscription_id",
-                                             cli_subscription_id)
-    logger.info("Using subscription id: %s", subscription_id)
+    subscription_id = config["provider"].get("subscription_id")
+    if subscription_id is None:
+        subscription_id = get_cli_profile().get_subscription_id()
+    resource_client = ResourceManagementClient(AzureCliCredential(),
+                                               subscription_id)
     config["provider"]["subscription_id"] = subscription_id
+    logger.info("Using subscription id: %s", subscription_id)

     assert "resource_group" in config["provider"], (
         "Provider config must include resource_group field")
@@ -80,10 +89,8 @@ def _configure_resource_group(config):
         }
     }

-    if hasattr(resource_client.deployments, "create_or_update"):
-        create_or_update = resource_client.deployments.create_or_update
-    else:
-        create_or_update = resource_client.deployments.begin_create_or_update
+    create_or_update = get_azure_sdk_function(
+        client=resource_client.deployments, function_name="create_or_update")
     create_or_update(
         resource_group_name=resource_group,
         deployment_name="ray-config",
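
The get_azure_sdk_function helper is the backwards-compatibility shim the commit message refers to: pre-track-2 SDKs expose create_or_update while newer releases expose only begin_create_or_update. A minimal standalone sketch of the same idea (the deployment classes below are hypothetical stand-ins for SDK operations objects, and this variant passes an explicit None default to the inner getattr so the error branch is actually reachable):

    from typing import Any, Callable


    def resolve_sdk_function(client: Any, function_name: str) -> Callable:
        # Same idea as the committed helper: try the old name first,
        # then fall back to the begin_-prefixed track-2 name.
        func = getattr(client, function_name,
                       getattr(client, f"begin_{function_name}", None))
        if func is None:
            raise AttributeError(
                f"'{type(client).__name__}' object has no {function_name} "
                f"or begin_{function_name} attribute")
        return func


    class OldStyleDeployments:   # hypothetical stand-in for an old SDK object
        def create_or_update(self):
            return "old"


    class NewStyleDeployments:   # hypothetical stand-in for a new SDK object
        def begin_create_or_update(self):
            return "new"


    assert resolve_sdk_function(OldStyleDeployments(),
                                "create_or_update")() == "old"
    assert resolve_sdk_function(NewStyleDeployments(),
                                "create_or_update")() == "new"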

python/ray/autoscaler/_private/_azure/node_provider.py (41 additions, 49 deletions)

@@ -4,22 +4,24 @@
 from threading import RLock
 from uuid import uuid4

-from azure.common.client_factory import get_client_from_cli_profile
-from msrestazure.azure_active_directory import MSIAuthentication
+from azure.identity import DefaultAzureCredential
 from azure.mgmt.compute import ComputeManagementClient
 from azure.mgmt.network import NetworkManagementClient
 from azure.mgmt.resource import ResourceManagementClient
 from azure.mgmt.resource.resources.models import DeploymentMode
-from knack.util import CLIError

 from ray.autoscaler.node_provider import NodeProvider
 from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME
-from ray.autoscaler._private._azure.config import bootstrap_azure
+from ray.autoscaler._private._azure.config import (bootstrap_azure,
+                                                   get_azure_sdk_function)

 VM_NAME_MAX_LEN = 64
 VM_NAME_UUID_LEN = 8

 logger = logging.getLogger(__name__)
+azure_logger = logging.getLogger(
+    "azure.core.pipeline.policies.http_logging_policy")
+azure_logger.setLevel(logging.WARNING)


 def synchronized(f):
@@ -47,29 +49,15 @@ class AzureNodeProvider(NodeProvider):

     def __init__(self, provider_config, cluster_name):
         NodeProvider.__init__(self, provider_config, cluster_name)
-        kwargs = {}
-        if "subscription_id" in provider_config:
-            kwargs["subscription_id"] = provider_config["subscription_id"]
-        try:
-            self.compute_client = get_client_from_cli_profile(
-                client_class=ComputeManagementClient, **kwargs)
-            self.network_client = get_client_from_cli_profile(
-                client_class=NetworkManagementClient, **kwargs)
-            self.resource_client = get_client_from_cli_profile(
-                client_class=ResourceManagementClient, **kwargs)
-        except CLIError as e:
-            if str(e) != "Please run 'az login' to setup account.":
-                raise
-            else:
-                logger.info("CLI profile authentication failed. Trying MSI")
-
-                credentials = MSIAuthentication()
-                self.compute_client = ComputeManagementClient(
-                    credentials=credentials, **kwargs)
-                self.network_client = NetworkManagementClient(
-                    credentials=credentials, **kwargs)
-                self.resource_client = ResourceManagementClient(
-                    credentials=credentials, **kwargs)
+        subscription_id = provider_config["subscription_id"]
+        credential = DefaultAzureCredential(
+            exclude_shared_token_cache_credential=True)
+        self.compute_client = ComputeManagementClient(credential,
+                                                      subscription_id)
+        self.network_client = NetworkManagementClient(credential,
+                                                      subscription_id)
+        self.resource_client = ResourceManagementClient(
+            credential, subscription_id)

         self.lock = RLock()
@@ -213,11 +201,10 @@ def create_node(self, node_config, tags, count):
         }

         # TODO: we could get the private/public ips back directly
-        if hasattr(self.resource_client.deployments, "create_or_update"):
-            create = self.resource_client.deployments.create_or_update
-        else:
-            create = self.resource_client.deployments.begin_create_or_update
-        create(
+        create_or_update = get_azure_sdk_function(
+            client=self.resource_client.deployments,
+            function_name="create_or_update")
+        create_or_update(
             resource_group_name=resource_group,
             deployment_name="ray-vm-{}".format(name_tag),
             parameters=parameters).wait()
@@ -227,17 +214,13 @@ def set_node_tags(self, node_id, tags):
         """Sets the tag values (string dict) for the specified node."""
         node_tags = self._get_cached_node(node_id)["tags"]
         node_tags.update(tags)
-        if hasattr(self.compute_client.virtual_machines, "update"):
-            self.compute_client.virtual_machines.update(
-                resource_group_name=self.provider_config["resource_group"],
-                vm_name=node_id,
-                parameters={"tags": node_tags})
-        else:
-            # Newer versions of the client use begin_update, not update
-            self.compute_client.virtual_machines.begin_update(
-                resource_group_name=self.provider_config["resource_group"],
-                vm_name=node_id,
-                parameters={"tags": node_tags})
+        update = get_azure_sdk_function(
+            client=self.compute_client.virtual_machines,
+            function_name="update")
+        update(
+            resource_group_name=self.provider_config["resource_group"],
+            vm_name=node_id,
+            parameters={"tags": node_tags})
         self.cached_nodes[node_id]["tags"] = node_tags

     def terminate_node(self, node_id):
@@ -265,14 +248,19 @@ def terminate_node(self, node_id):

         try:
             # delete machine, must wait for this to complete
-            self.compute_client.virtual_machines.delete(
-                resource_group_name=resource_group, vm_name=node_id).wait()
+            delete = get_azure_sdk_function(
+                client=self.compute_client.virtual_machines,
+                function_name="delete")
+            delete(resource_group_name=resource_group, vm_name=node_id).wait()
         except Exception as e:
             logger.warning("Failed to delete VM: {}".format(e))

         try:
             # delete nic
-            self.network_client.network_interfaces.delete(
+            delete = get_azure_sdk_function(
+                client=self.network_client.network_interfaces,
+                function_name="delete")
+            delete(
                 resource_group_name=resource_group,
                 network_interface_name=metadata["nic_name"])
         except Exception as e:
@@ -281,7 +269,10 @@ def terminate_node(self, node_id):
         # delete ip address
         if "public_ip_name" in metadata:
             try:
-                self.network_client.public_ip_addresses.delete(
+                delete = get_azure_sdk_function(
+                    client=self.network_client.public_ip_addresses,
+                    function_name="delete")
+                delete(
                     resource_group_name=resource_group,
                     public_ip_address_name=metadata["public_ip_name"])
             except Exception as e:
@@ -290,8 +281,9 @@ def terminate_node(self, node_id):
         # delete disks
         for disk in disks:
             try:
-                self.compute_client.disks.delete(
-                    resource_group_name=resource_group, disk_name=disk)
+                delete = get_azure_sdk_function(
+                    client=self.compute_client.disks, function_name="delete")
+                delete(resource_group_name=resource_group, disk_name=disk)
             except Exception as e:
                 logger.warning("Failed to delete disk: {}".format(e))
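
DefaultAzureCredential replaces the old two-step CLI-then-MSI fallback: it walks a chain of credential sources (environment variables, managed identity, Azure CLI, and so on) internally. A hedged sketch of the resulting client usage, with placeholder resource names that are not part of the commit:

    # Illustrative sketch of the new auth flow, not the provider's code.
    from azure.identity import DefaultAzureCredential
    from azure.mgmt.compute import ComputeManagementClient

    credential = DefaultAzureCredential(
        # Matches the commit's choice; skipping the shared token cache
        # avoids picking up stale cached accounts.
        exclude_shared_token_cache_credential=True)
    compute_client = ComputeManagementClient(credential, "<subscription-id>")

    # Track-2 long-running operations return a poller; callers block with
    # .wait() (or .result()), just as the provider does after deployments.
    poller = compute_client.virtual_machines.begin_deallocate(
        resource_group_name="example-rg", vm_name="example-vm")
    poller.wait()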

python/ray/autoscaler/azure/defaults.yaml (7 additions, 7 deletions)

@@ -56,7 +56,7 @@ available_node_types:
             imagePublisher: microsoft-dsvm
             imageOffer: ubuntu-1804
             imageSku: 1804-gen2
-            imageVersion: 21.01.21
+            imageVersion: latest

     ray.worker.default:
         # The minimum number of nodes of this type to launch.
@@ -72,8 +72,8 @@ available_node_types:
             imagePublisher: microsoft-dsvm
             imageOffer: ubuntu-1804
             imageSku: 1804-gen2
-            imageVersion: 21.01.21
-        # optionally set priority to use Spot instances
+            imageVersion: latest
+        # comment lines below to not use Spot instances
         priority: Spot
         # set a maximum price for spot instances if desired
         # billingProfile:
@@ -121,17 +121,17 @@ setup_commands:
     # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    - (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true
-    # - (conda activate py37_pytorch &> /dev/null && echo 'conda activate py37_pytorch' >> ~/.bashrc) || true
-    - (conda activate py37_tensorflow &> /dev/null && echo 'conda activate py37_tensorflow' >> ~/.bashrc) || true
-    - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
+    # - (conda activate py38_pytorch &> /dev/null && echo 'conda activate py38_pytorch' >> ~/.bashrc) || true
+    - (conda activate py38_tensorflow &> /dev/null && echo 'conda activate py38_tensorflow' >> ~/.bashrc) || true
+    - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"
    # Consider uncommenting these if you also want to run apt-get commands during setup
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
    # - sudo dpkg --configure -a

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
-    - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
+    - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4

 # Custom commands that will be run on worker nodes after common setup.
 worker_setup_commands: []
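
Because head_setup_commands re-installs these packages on the head node, drift between the pins here and in docker/ray-deps/Dockerfile is easy to miss. A small hypothetical check, not part of the commit:

    # Compare installed Azure package versions against the pins
    # from head_setup_commands.
    from importlib.metadata import version

    PINS = {
        "azure-cli-core": "2.29.1",
        "azure-identity": "1.7.0",
        "azure-mgmt-compute": "23.1.0",
        "azure-mgmt-network": "19.0.0",
        "azure-mgmt-resource": "20.0.0",
        "msrestazure": "0.6.4",
    }

    for package, expected in PINS.items():
        installed = version(package)
        status = ("ok" if installed == expected
                  else f"MISMATCH (want {expected})")
        print(f"{package}=={installed}: {status}")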

python/ray/autoscaler/azure/example-full-legacy.yaml (3 additions, 3 deletions)

@@ -68,7 +68,7 @@ head_node:
         imagePublisher: microsoft-dsvm
         imageOffer: ubuntu-1804
         imageSku: 1804-gen2
-        imageVersion: 21.01.21
+        imageVersion: latest

 # Provider-specific config for worker nodes, e.g. instance type.
 worker_nodes:
@@ -78,7 +78,7 @@ worker_nodes:
         imagePublisher: microsoft-dsvm
         imageOffer: ubuntu-1804
         imageSku: 1804-gen2
-        imageVersion: 21.01.21
+        imageVersion: latest
     # optionally set priority to use Spot instances
     priority: Spot
     # set a maximum price for spot instances if desired
@@ -132,7 +132,7 @@ setup_commands: []
 # below with a git checkout <your_sha> (and possibly a recompile).
 # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
 # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
-# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
+# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"

 # Custom commands that will be run on the head node after common setup.
 # NOTE: rayproject/ray-ml:latest has azure packages bundled

python/ray/autoscaler/azure/example-full.yaml (3 additions, 3 deletions)

@@ -72,7 +72,7 @@ available_node_types:
             imagePublisher: microsoft-dsvm
             imageOffer: ubuntu-1804
             imageSku: 1804-gen2
-            imageVersion: 21.01.21
+            imageVersion: latest

     ray.worker.default:
         # The minimum number of worker nodes of this type to launch.
@@ -91,7 +91,7 @@ available_node_types:
             imagePublisher: microsoft-dsvm
             imageOffer: ubuntu-1804
             imageSku: 1804-gen2
-            imageVersion: 21.01.21
+            imageVersion: latest
         # optionally set priority to use Spot instances
         priority: Spot
         # set a maximum price for spot instances if desired
@@ -148,7 +148,7 @@ setup_commands: []
 # below with a git checkout <your_sha> (and possibly a recompile).
 # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
 # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
-# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
+# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"

 # Custom commands that will be run on the head node after common setup.
 # NOTE: rayproject/ray-ml:latest has azure packages bundled

(The two remaining changed files did not load in this view.)