Commit 7153d58

Updates to azure autoscaler for authentication and dependency updates (ray-project#19603)

* updating azure autoscaler versions and backwards compatibility, and moving to azure-identity based authentication

* adding azure sdk requirements for tests

* updating azure test requirements and adding wrapper function for azure sdk function resolution

* adding docstring to get_azure_sdk_function

Co-authored-by: Scott Graham <[email protected]>
gramhagen and Scott Graham authored Dec 16, 2021
1 parent a85df8c commit 7153d58
Showing 10 changed files with 103 additions and 100 deletions.

doc/azure/azure-ray-template.json (5 additions, 5 deletions)

@@ -80,14 +80,14 @@
     },
     "condaEnv": {
       "type": "string",
-      "defaultValue": "py37_tensorflow",
+      "defaultValue": "py38_tensorflow",
       "allowedValues": [
         "azureml_py36_automl",
         "azureml_py36_pytorch",
         "azureml_py36_tensorflow",
-        "py37_default",
-        "py37_pytorch",
-        "py37_tensorflow"
+        "py38_default",
+        "py38_pytorch",
+        "py38_tensorflow"
       ],
       "metadata": {
         "description": "Conda environment to select (installed on DSVM)"
@@ -131,7 +131,7 @@
       "imagePublisher": "microsoft-dsvm",
       "imageOffer": "ubuntu-1804",
       "imageSku": "1804",
-      "imageVersion": "20.07.06"
+      "imageVersion": "latest"
     },
     "resources": [
       {

doc/source/cluster/cloud.rst (1 addition, 1 deletion)

@@ -83,7 +83,7 @@ Ray with cloud providers
         :alt: Deploy to Azure

 Once the template is successfully deployed the deployment Outputs page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input).
-Use the following code in a Jupyter notebook (using the conda environment specified in the template input, py37_tensorflow by default) to connect to the Ray cluster.
+Use the following code in a Jupyter notebook (using the conda environment specified in the template input, py38_tensorflow by default) to connect to the Ray cluster.

 .. code-block:: python
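
The notebook snippet itself is collapsed in this diff view. As a rough sketch of what such a connection cell typically looks like (the address argument below is an assumption for illustration, not the documented snippet):

    import ray

    # Attach to the cluster started by the autoscaler instead of launching
    # a new local Ray instance; "auto" is an assumed placeholder for
    # however the head node address is supplied in the real docs.
    ray.init(address="auto")
    print(ray.cluster_resources())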

docker/ray-deps/Dockerfile (6 additions, 5 deletions)

@@ -17,11 +17,12 @@ RUN $HOME/anaconda3/bin/pip --no-cache-dir install --find-links $FIND_LINKS_PATH
         "google-api-python-client==1.7.8" \
         "google-oauth" \
         "kubernetes" \
-        "azure-cli-core==2.22.0" \
-        "azure-mgmt-compute==14.0.0" \
-        "azure-mgmt-msi==1.0.0" \
-        "azure-mgmt-network==10.2.0" \
-        "azure-mgmt-resource==13.0.0"; fi) \
+        "azure-cli-core==2.29.1" \
+        "azure-identity==1.7.0" \
+        "azure-mgmt-compute==23.1.0" \
+        "azure-mgmt-network==19.0.0" \
+        "azure-mgmt-resource==20.0.0" \
+        "msrestazure==0.6.4"; fi) \
     $(if [ $($HOME/anaconda3/bin/python -c "import sys; print(sys.version_info.minor)") != 6 ] \
         && [ "$AUTOSCALER" = "autoscaler" ]; then echo "kopf"; fi) \
     && $HOME/anaconda3/bin/pip uninstall ray -y && sudo rm $(basename $WHEEL_PATH)
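
The pin changes swap azure-mgmt-msi out and bring in azure-identity and msrestazure. A hypothetical smoke test (not part of the commit) for an image built from this Dockerfile could confirm the new SDK surface imports cleanly:

    # Hypothetical sanity check: confirm the pinned Azure packages
    # installed in the image import correctly.
    import importlib

    for module in (
            "azure.identity",          # new in this commit
            "azure.mgmt.compute",
            "azure.mgmt.network",
            "azure.mgmt.resource",
            "msrestazure",
    ):
        importlib.import_module(module)
        print(f"{module}: ok")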

python/ray/autoscaler/_private/_azure/config.py (28 additions, 21 deletions)

@@ -2,9 +2,10 @@
 import logging
 from pathlib import Path
 import random
+from typing import Any, Callable

-from azure.common.client_factory import get_client_from_cli_profile
-from azure.common.credentials import get_azure_cli_credentials
+from azure.common.credentials import get_cli_profile
+from azure.identity import AzureCliCredential
 from azure.mgmt.resource import ResourceManagementClient
 from azure.mgmt.resource.resources.models import DeploymentMode

@@ -17,31 +18,39 @@
 logger = logging.getLogger(__name__)


+def get_azure_sdk_function(client: Any, function_name: str) -> Callable:
+    """Retrieve a callable function from Azure SDK client object.
+
+    Newer versions of the various client SDKs renamed function names to
+    have a begin_ prefix. This function supports both the old and new
+    versions of the SDK by first trying the old name and falling back to
+    the prefixed new name.
+    """
+    func = getattr(client, function_name,
+                   getattr(client, f"begin_{function_name}"))
+    if func is None:
+        raise AttributeError(
+            "'{obj}' object has no {func} or begin_{func} attribute".format(
+                obj={client.__name__}, func=function_name))
+    return func
+
+
 def bootstrap_azure(config):
     config = _configure_key_pair(config)
     config = _configure_resource_group(config)
     return config


-def _get_client(client_class, config):
-    kwargs = {}
-    if "subscription_id" in config["provider"]:
-        kwargs["subscription_id"] = config["provider"]["subscription_id"]
-
-    return get_client_from_cli_profile(client_class=client_class, **kwargs)
-
-
 def _configure_resource_group(config):
     # TODO: look at availability sets
     # https://docs.microsoft.com/en-us/azure/virtual-machines/windows/tutorial-availability-sets
-    resource_client = _get_client(ResourceManagementClient, config)
-
-    _, cli_subscription_id = get_azure_cli_credentials(
-        resource=ResourceManagementClient)
-    subscription_id = config["provider"].get("subscription_id",
-                                             cli_subscription_id)
-    logger.info("Using subscription id: %s", subscription_id)
+    subscription_id = config["provider"].get("subscription_id")
+    if subscription_id is None:
+        subscription_id = get_cli_profile().get_subscription_id()
+    resource_client = ResourceManagementClient(AzureCliCredential(),
+                                               subscription_id)
     config["provider"]["subscription_id"] = subscription_id
+    logger.info("Using subscription id: %s", subscription_id)

     assert "resource_group" in config["provider"], (
         "Provider config must include resource_group field")
@@ -80,10 +89,8 @@ def _configure_resource_group(config):
         }
     }

-    if hasattr(resource_client.deployments, "create_or_update"):
-        create_or_update = resource_client.deployments.create_or_update
-    else:
-        create_or_update = resource_client.deployments.begin_create_or_update
+    create_or_update = get_azure_sdk_function(
+        client=resource_client.deployments, function_name="create_or_update")
     create_or_update(
         resource_group_name=resource_group,
         deployment_name="ray-config",
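
The get_azure_sdk_function helper is the backwards-compatibility shim the commit message refers to: pre-track-2 SDKs expose create_or_update while newer releases expose only begin_create_or_update. A minimal standalone sketch of the same idea (the deployment classes below are hypothetical stand-ins for SDK operations objects, and this variant passes an explicit None default to the inner getattr so the error branch is actually reachable):

    from typing import Any, Callable


    def resolve_sdk_function(client: Any, function_name: str) -> Callable:
        # Same idea as the committed helper: try the old name first,
        # then fall back to the begin_-prefixed track-2 name.
        func = getattr(client, function_name,
                       getattr(client, f"begin_{function_name}", None))
        if func is None:
            raise AttributeError(
                f"'{type(client).__name__}' object has no {function_name} "
                f"or begin_{function_name} attribute")
        return func


    class OldStyleDeployments:   # hypothetical stand-in for an old SDK object
        def create_or_update(self):
            return "old"


    class NewStyleDeployments:   # hypothetical stand-in for a new SDK object
        def begin_create_or_update(self):
            return "new"


    assert resolve_sdk_function(OldStyleDeployments(),
                                "create_or_update")() == "old"
    assert resolve_sdk_function(NewStyleDeployments(),
                                "create_or_update")() == "new"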

python/ray/autoscaler/_private/_azure/node_provider.py (41 additions, 49 deletions)

@@ -4,22 +4,24 @@
 from threading import RLock
 from uuid import uuid4

-from azure.common.client_factory import get_client_from_cli_profile
-from msrestazure.azure_active_directory import MSIAuthentication
+from azure.identity import DefaultAzureCredential
 from azure.mgmt.compute import ComputeManagementClient
 from azure.mgmt.network import NetworkManagementClient
 from azure.mgmt.resource import ResourceManagementClient
 from azure.mgmt.resource.resources.models import DeploymentMode
-from knack.util import CLIError

 from ray.autoscaler.node_provider import NodeProvider
 from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME
-from ray.autoscaler._private._azure.config import bootstrap_azure
+from ray.autoscaler._private._azure.config import (bootstrap_azure,
+                                                   get_azure_sdk_function)

 VM_NAME_MAX_LEN = 64
 VM_NAME_UUID_LEN = 8

 logger = logging.getLogger(__name__)
+azure_logger = logging.getLogger(
+    "azure.core.pipeline.policies.http_logging_policy")
+azure_logger.setLevel(logging.WARNING)


 def synchronized(f):
@@ -47,29 +49,15 @@ class AzureNodeProvider(NodeProvider):

     def __init__(self, provider_config, cluster_name):
         NodeProvider.__init__(self, provider_config, cluster_name)
-        kwargs = {}
-        if "subscription_id" in provider_config:
-            kwargs["subscription_id"] = provider_config["subscription_id"]
-        try:
-            self.compute_client = get_client_from_cli_profile(
-                client_class=ComputeManagementClient, **kwargs)
-            self.network_client = get_client_from_cli_profile(
-                client_class=NetworkManagementClient, **kwargs)
-            self.resource_client = get_client_from_cli_profile(
-                client_class=ResourceManagementClient, **kwargs)
-        except CLIError as e:
-            if str(e) != "Please run 'az login' to setup account.":
-                raise
-            else:
-                logger.info("CLI profile authentication failed. Trying MSI")
-
-                credentials = MSIAuthentication()
-                self.compute_client = ComputeManagementClient(
-                    credentials=credentials, **kwargs)
-                self.network_client = NetworkManagementClient(
-                    credentials=credentials, **kwargs)
-                self.resource_client = ResourceManagementClient(
-                    credentials=credentials, **kwargs)
+        subscription_id = provider_config["subscription_id"]
+        credential = DefaultAzureCredential(
+            exclude_shared_token_cache_credential=True)
+        self.compute_client = ComputeManagementClient(credential,
+                                                      subscription_id)
+        self.network_client = NetworkManagementClient(credential,
+                                                      subscription_id)
+        self.resource_client = ResourceManagementClient(
+            credential, subscription_id)

         self.lock = RLock()
@@ -213,11 +201,10 @@ def create_node(self, node_config, tags, count):
         }

         # TODO: we could get the private/public ips back directly
-        if hasattr(self.resource_client.deployments, "create_or_update"):
-            create = self.resource_client.deployments.create_or_update
-        else:
-            create = self.resource_client.deployments.begin_create_or_update
-        create(
+        create_or_update = get_azure_sdk_function(
+            client=self.resource_client.deployments,
+            function_name="create_or_update")
+        create_or_update(
             resource_group_name=resource_group,
             deployment_name="ray-vm-{}".format(name_tag),
             parameters=parameters).wait()
@@ -227,17 +214,13 @@ def set_node_tags(self, node_id, tags):
         """Sets the tag values (string dict) for the specified node."""
         node_tags = self._get_cached_node(node_id)["tags"]
         node_tags.update(tags)
-        if hasattr(self.compute_client.virtual_machines, "update"):
-            self.compute_client.virtual_machines.update(
-                resource_group_name=self.provider_config["resource_group"],
-                vm_name=node_id,
-                parameters={"tags": node_tags})
-        else:
-            # Newer versions of the client use begin_update, not update
-            self.compute_client.virtual_machines.begin_update(
-                resource_group_name=self.provider_config["resource_group"],
-                vm_name=node_id,
-                parameters={"tags": node_tags})
+        update = get_azure_sdk_function(
+            client=self.compute_client.virtual_machines,
+            function_name="update")
+        update(
+            resource_group_name=self.provider_config["resource_group"],
+            vm_name=node_id,
+            parameters={"tags": node_tags})
         self.cached_nodes[node_id]["tags"] = node_tags

     def terminate_node(self, node_id):
@@ -265,14 +248,19 @@ def terminate_node(self, node_id):

         try:
             # delete machine, must wait for this to complete
-            self.compute_client.virtual_machines.delete(
-                resource_group_name=resource_group, vm_name=node_id).wait()
+            delete = get_azure_sdk_function(
+                client=self.compute_client.virtual_machines,
+                function_name="delete")
+            delete(resource_group_name=resource_group, vm_name=node_id).wait()
         except Exception as e:
             logger.warning("Failed to delete VM: {}".format(e))

         try:
             # delete nic
-            self.network_client.network_interfaces.delete(
+            delete = get_azure_sdk_function(
+                client=self.network_client.network_interfaces,
+                function_name="delete")
+            delete(
                 resource_group_name=resource_group,
                 network_interface_name=metadata["nic_name"])
         except Exception as e:
@@ -281,7 +269,10 @@ def terminate_node(self, node_id):
         # delete ip address
         if "public_ip_name" in metadata:
             try:
-                self.network_client.public_ip_addresses.delete(
+                delete = get_azure_sdk_function(
+                    client=self.network_client.public_ip_addresses,
+                    function_name="delete")
+                delete(
                     resource_group_name=resource_group,
                     public_ip_address_name=metadata["public_ip_name"])
             except Exception as e:
@@ -290,8 +281,9 @@ def terminate_node(self, node_id):
         # delete disks
         for disk in disks:
             try:
-                self.compute_client.disks.delete(
-                    resource_group_name=resource_group, disk_name=disk)
+                delete = get_azure_sdk_function(
+                    client=self.compute_client.disks, function_name="delete")
+                delete(resource_group_name=resource_group, disk_name=disk)
             except Exception as e:
                 logger.warning("Failed to delete disk: {}".format(e))
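
DefaultAzureCredential replaces the old two-step CLI-then-MSI fallback: it walks a chain of credential sources (environment variables, managed identity, Azure CLI, and so on) internally. A hedged sketch of the resulting client usage, with placeholder resource names that are not part of the commit:

    # Illustrative sketch of the new auth flow, not the provider's code.
    from azure.identity import DefaultAzureCredential
    from azure.mgmt.compute import ComputeManagementClient

    credential = DefaultAzureCredential(
        # Matches the commit's choice; skipping the shared token cache
        # avoids picking up stale cached accounts.
        exclude_shared_token_cache_credential=True)
    compute_client = ComputeManagementClient(credential, "<subscription-id>")

    # Track-2 long-running operations return a poller; callers block with
    # .wait() (or .result()), just as the provider does after deployments.
    poller = compute_client.virtual_machines.begin_deallocate(
        resource_group_name="example-rg", vm_name="example-vm")
    poller.wait()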

python/ray/autoscaler/azure/defaults.yaml (7 additions, 7 deletions)

@@ -56,7 +56,7 @@ available_node_types:
             imagePublisher: microsoft-dsvm
             imageOffer: ubuntu-1804
             imageSku: 1804-gen2
-            imageVersion: 21.01.21
+            imageVersion: latest

     ray.worker.default:
         # The minimum number of nodes of this type to launch.
@@ -72,8 +72,8 @@ available_node_types:
             imagePublisher: microsoft-dsvm
             imageOffer: ubuntu-1804
             imageSku: 1804-gen2
-            imageVersion: 21.01.21
-        # optionally set priority to use Spot instances
+            imageVersion: latest
+        # comment lines below to not use Spot instances
         priority: Spot
         # set a maximum price for spot instances if desired
         # billingProfile:
@@ -121,17 +121,17 @@ setup_commands:
     # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    - (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true
-    # - (conda activate py37_pytorch &> /dev/null && echo 'conda activate py37_pytorch' >> ~/.bashrc) || true
-    - (conda activate py37_tensorflow &> /dev/null && echo 'conda activate py37_tensorflow' >> ~/.bashrc) || true
-    - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
+    # - (conda activate py38_pytorch &> /dev/null && echo 'conda activate py38_pytorch' >> ~/.bashrc) || true
+    - (conda activate py38_tensorflow &> /dev/null && echo 'conda activate py38_tensorflow' >> ~/.bashrc) || true
+    - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"
    # Consider uncommenting these if you also want to run apt-get commands during setup
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
    # - sudo dpkg --configure -a

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
-    - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
+    - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4

 # Custom commands that will be run on worker nodes after common setup.
 worker_setup_commands: []
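
Because head_setup_commands re-installs these packages on the head node, drift between the pins here and in docker/ray-deps/Dockerfile is easy to miss. A small hypothetical check, not part of the commit:

    # Compare installed Azure package versions against the pins
    # from head_setup_commands.
    from importlib.metadata import version

    PINS = {
        "azure-cli-core": "2.29.1",
        "azure-identity": "1.7.0",
        "azure-mgmt-compute": "23.1.0",
        "azure-mgmt-network": "19.0.0",
        "azure-mgmt-resource": "20.0.0",
        "msrestazure": "0.6.4",
    }

    for package, expected in PINS.items():
        installed = version(package)
        status = ("ok" if installed == expected
                  else f"MISMATCH (want {expected})")
        print(f"{package}=={installed}: {status}")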

python/ray/autoscaler/azure/example-full-legacy.yaml (3 additions, 3 deletions)

@@ -68,7 +68,7 @@ head_node:
         imagePublisher: microsoft-dsvm
         imageOffer: ubuntu-1804
         imageSku: 1804-gen2
-        imageVersion: 21.01.21
+        imageVersion: latest

 # Provider-specific config for worker nodes, e.g. instance type.
 worker_nodes:
@@ -78,7 +78,7 @@ worker_nodes:
         imagePublisher: microsoft-dsvm
         imageOffer: ubuntu-1804
         imageSku: 1804-gen2
-        imageVersion: 21.01.21
+        imageVersion: latest
     # optionally set priority to use Spot instances
     priority: Spot
     # set a maximum price for spot instances if desired
@@ -132,7 +132,7 @@ setup_commands: []
 # below with a git checkout <your_sha> (and possibly a recompile).
 # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
 # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
-# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
+# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"

 # Custom commands that will be run on the head node after common setup.
 # NOTE: rayproject/ray-ml:latest has azure packages bundled

python/ray/autoscaler/azure/example-full.yaml (3 additions, 3 deletions)

@@ -72,7 +72,7 @@ available_node_types:
             imagePublisher: microsoft-dsvm
             imageOffer: ubuntu-1804
             imageSku: 1804-gen2
-            imageVersion: 21.01.21
+            imageVersion: latest

     ray.worker.default:
         # The minimum number of worker nodes of this type to launch.
@@ -91,7 +91,7 @@ available_node_types:
             imagePublisher: microsoft-dsvm
             imageOffer: ubuntu-1804
             imageSku: 1804-gen2
-            imageVersion: 21.01.21
+            imageVersion: latest
         # optionally set priority to use Spot instances
         priority: Spot
         # set a maximum price for spot instances if desired
@@ -148,7 +148,7 @@ setup_commands: []
 # below with a git checkout <your_sha> (and possibly a recompile).
 # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
 # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
-# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
+# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"

 # Custom commands that will be run on the head node after common setup.
 # NOTE: rayproject/ray-ml:latest has azure packages bundled

(The two remaining changed files did not load in this view.)