From 477d381a5ab4e7e9d1e01fec540f1cbac5e540f6 Mon Sep 17 00:00:00 2001 From: Helena Bales Date: Fri, 6 Dec 2019 15:14:44 -0800 Subject: [PATCH] Confirm and retry AWS instance allocations. (#119) Confirm and retry AWS instance allocations. Retry with timeouts, confirming the expected number of instances have been allocated. Retry with timeouts, confirming the expected number of instances are in the running state. Signed-off-by: Robert Houghton --- .../infrastructure/aws/LaunchCluster.java | 67 ++++++++++++------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/infrastructure/src/main/java/org/apache/geode/infrastructure/aws/LaunchCluster.java b/infrastructure/src/main/java/org/apache/geode/infrastructure/aws/LaunchCluster.java index ca7c7ae81..4fd7bb4a7 100644 --- a/infrastructure/src/main/java/org/apache/geode/infrastructure/aws/LaunchCluster.java +++ b/infrastructure/src/main/java/org/apache/geode/infrastructure/aws/LaunchCluster.java @@ -23,6 +23,8 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.attribute.PosixFilePermissions; +import java.time.Duration; +import java.time.Instant; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.ArrayList; @@ -99,9 +101,10 @@ public static void main(String[] args) throws IOException, InterruptedException authorizeSecurityGroup(benchmarkTag); createLaunchTemplate(benchmarkTag, newestImage); - List hostIds = allocateHosts(tags, count); - List instanceIds = launchInstances(benchmarkTag, tags, count, hostIds); - DescribeInstancesResponse instances = waitForInstances(instanceIds); + int ec2Timeout = 300; + List hostIds = allocateHosts(tags, count, ec2Timeout); + List instanceIds = launchInstances(benchmarkTag, tags, hostIds); + DescribeInstancesResponse instances = waitForInstances(instanceIds, ec2Timeout); List publicIps = getPublicIps(instances); createMetadata(benchmarkTag, publicIps); installPrivateKey(benchmarkTag, publicIps); @@ -120,26 +123,38 @@ private static void usage(String s) { throw new IllegalStateException(s); } - private static List allocateHosts(List tags, int count) { - AllocateHostsResponse hosts = ec2.allocateHosts(AllocateHostsRequest.builder() - .availabilityZone("us-west-2a") - .instanceType(AwsBenchmarkMetadata.instanceType().toString()) - .quantity(count) - .tagSpecifications(TagSpecification.builder() - .tags(tags) - .resourceType(ResourceType.DEDICATED_HOST) - .build()) - .build()); + private static List allocateHosts(List tags, int count, int timeout) + throws InterruptedException { + int gotHosts = 0; + AllocateHostsResponse hosts; + List hostIds = new ArrayList<>(); + + Instant end = Instant.now().plus(Duration.ofSeconds(timeout)); + do { + hosts = ec2.allocateHosts(AllocateHostsRequest.builder() + .availabilityZone("us-west-2a") + .instanceType(AwsBenchmarkMetadata.instanceType().toString()) + .quantity(count - gotHosts) + .tagSpecifications(TagSpecification.builder() + .tags(tags) + .resourceType(ResourceType.DEDICATED_HOST) + .build()) + .build()); + hostIds.addAll(hosts.hostIds()); + gotHosts += hosts.hostIds().size(); + if (Instant.now().isAfter(end)) { + throw new InterruptedException( + count + " hosts were not allocated before timeout of " + timeout + " seconds."); + } + } while (gotHosts < count); - return hosts.hostIds(); + return hostIds; } private static List launchInstances(String launchTemplate, List tags, - int instanceCount, List hosts) - throws InterruptedException { - List instanceIds = new ArrayList<>(instanceCount); + List hosts) { + List instanceIds = new ArrayList<>(hosts.size()); for (String host : hosts) { - // launch instances RunInstancesResponse rir = ec2.runInstances(RunInstancesRequest.builder() .launchTemplate(LaunchTemplateSpecification.builder() .launchTemplateName(AwsBenchmarkMetadata.launchTemplate(launchTemplate)) @@ -162,16 +177,22 @@ private static List launchInstances(String launchTemplate, List tag return instanceIds; } - private static DescribeInstancesResponse waitForInstances(List instanceIds) + private static DescribeInstancesResponse waitForInstances(List instanceIds, int timeout) throws InterruptedException { System.out.println("Waiting for cluster instances to go fully online."); - DescribeInstancesResponse describeInstancesResponse = describeInstances(instanceIds, "running"); - while (instanceCount(describeInstancesResponse) < instanceIds.size()) { + Instant end = Instant.now().plus(Duration.ofSeconds(timeout)); + DescribeInstancesResponse describeInstancesResponse; + do { sleep(AwsBenchmarkMetadata.POLL_INTERVAL); - System.out.println("Continuing to wait."); + System.out.println( + "Continuing to wait for " + new StringBuilder().append(instanceIds + ", ").toString()); describeInstancesResponse = describeInstances(instanceIds, "running"); - } + if (Instant.now().isAfter(end)) { + throw new InterruptedException(instanceIds.size() + + " hosts were not running before timeout of " + timeout + " seconds."); + } + } while (instanceCount(describeInstancesResponse) < instanceIds.size()); return describeInstancesResponse; }