Skip to content

Commit

Permalink
[autoscaler] Retry creating EC2 instances in new AZ (ray-project#6129)
Browse files Browse the repository at this point in the history
  • Loading branch information
AdamGleave authored and richardliaw committed Nov 10, 2019
1 parent d17ae5a commit 01aee8d
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 9 deletions.
24 changes: 15 additions & 9 deletions python/ray/autoscaler/aws/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME, \
TAG_RAY_LAUNCH_CONFIG, TAG_RAY_NODE_TYPE
from ray.ray_constants import BOTO_MAX_RETRIES
from ray.ray_constants import BOTO_MAX_RETRIES, BOTO_CREATE_MAX_RETRIES
from ray.autoscaler.log_timer import LogTimer

logger = logging.getLogger(__name__)
Expand All @@ -38,14 +38,21 @@ def from_aws_format(tags):
return tags


def make_ec2_client(region, max_retries):
    """Build a boto3 EC2 resource bound to `region`.

    The resource is configured so that requests are retried up to
    `max_retries` attempts.
    """
    retry_policy = {"max_attempts": max_retries}
    return boto3.resource(
        "ec2",
        region_name=region,
        config=Config(retries=retry_policy),
    )


class AWSNodeProvider(NodeProvider):
def __init__(self, provider_config, cluster_name):
NodeProvider.__init__(self, provider_config, cluster_name)
config = Config(retries={"max_attempts": BOTO_MAX_RETRIES})
self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes",
True)
self.ec2 = boto3.resource(
"ec2", region_name=provider_config["region"], config=config)
self.ec2 = make_ec2_client(
region=provider_config["region"], max_retries=BOTO_MAX_RETRIES)
self.ec2_fail_fast = make_ec2_client(
region=provider_config["region"], max_retries=0)

# Try availability zones round-robin, starting from random offset
self.subnet_idx = random.randint(0, 100)
Expand Down Expand Up @@ -268,8 +275,7 @@ def _create_node(self, node_config, tags, count):
# single SubnetId before invoking the AWS API.
subnet_ids = conf.pop("SubnetIds")

max_retries = 5
for attempt in range(1, max_retries + 1):
for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
try:
subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]
logger.info("NodeProvider: calling create_instances "
Expand All @@ -281,7 +287,7 @@ def _create_node(self, node_config, tags, count):
"SubnetId": subnet_id,
"TagSpecifications": tag_specs
})
created = self.ec2.create_instances(**conf)
created = self.ec2_fail_fast.create_instances(**conf)
for instance in created:
logger.info("NodeProvider: Created instance "
"[id={}, name={}, info={}]".format(
Expand All @@ -290,10 +296,10 @@ def _create_node(self, node_config, tags, count):
instance.state_reason["Message"]))
break
except botocore.exceptions.ClientError as exc:
if attempt == max_retries:
if attempt == BOTO_CREATE_MAX_RETRIES:
logger.error(
"create_instances: Max attempts ({}) exceeded.".format(
max_retries))
BOTO_CREATE_MAX_RETRIES))
raise exc
else:
logger.error(exc)
Expand Down
2 changes: 2 additions & 0 deletions python/ray/ray_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ def to_memory_units(memory_bytes, round_up):

# Max number of retries to AWS (boto's own default is 5; the delay between retries increases exponentially)
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
# Max number of retries to create an EC2 node (retry different subnet)
BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5)

LOGGER_FORMAT = (
"%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s")
Expand Down

0 comments on commit 01aee8d

Please sign in to comment.