
Commit 73b5d9e

Author: Pavel Pscheidl
PUBDEV-6556 - S3 tests failing non-deterministically in hadoop pipeline (h2oai#3579)
* PUBDEV-6556 - S3 test fails due to a file not yet existing in S3
* S3 HDP import/export test uses millisecond precision in exported file name
1 parent ef6f830 commit 73b5d9e
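The first half of the fix is the exported file name. The tests previously formatted the timestamp with second granularity; the commit adds strftime's %f directive, which appends microseconds (this subsumes the "millisecond precision" the commit message mentions). A minimal standard-library sketch of the difference:

```python
from datetime import datetime

now = datetime.utcnow()
# Old: second precision, identical for exports within the same second.
print(now.strftime("%Y%m%d-%H%M%S"))      # e.g. 20190704-101530
# New: %f appends microseconds, giving each export a finer-grained name.
print(now.strftime("%Y%m%d-%H%M%S.%f"))   # e.g. 20190704-101530.123456
```

Together with the uuid4 suffix the tests already append, this makes each exported key effectively unique per run.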

2 files changed: +27 -7 lines changed

h2o-hadoop-2/tests/python/pyunit_s3_import_export.py (+13 -3)
```diff
@@ -12,16 +12,26 @@
 def s3_import_export():
     local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
     for scheme in ["s3n", "s3a"]:
-        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
+        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S.%f")
         unique_suffix = str(uuid.uuid4())
         s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                   timestamp + "." + unique_suffix + ".csv.zip"
         h2o.export_file(local_frame, s3_path)
+
+        s3 = boto3.resource('s3')
+        client = boto3.client('s3')
+        # S3 might have a delay in indexing the file (usually milliseconds or hundreds of milliseconds).
+        # Wait for the file to be available; if it is not there at the beginning, retry every 2 seconds, up to 10 times.
+        client.get_waiter('object_exists').wait(Bucket='test.0xdata.com',
+                                                Key="h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
+                                                    timestamp + "." + unique_suffix + ".csv.zip",
+                                                WaiterConfig={
+                                                    'Delay': 2,
+                                                    'MaxAttempts': 10
+                                                })
         s3_frame = h2o.import_file(s3_path)
         assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
 
-        # Delete the file afterwards
-        s3 = boto3.resource('s3')
         s3.Object(bucket_name='test.0xdata.com', key="h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                   timestamp + "." + unique_suffix + ".csv.zip").delete()
 
```
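The second half of the fix is waiting for the object to become visible. The in-code comment attributes the flakiness to a short delay before S3 indexes the newly exported file, so the test now blocks on boto3's built-in object_exists waiter, which repeatedly issues HeadObject until the key is found. A standalone sketch of the same call, with an illustrative bucket and key (the values here are placeholders, not the test's real output):

```python
import boto3

# Placeholder names for illustration; the test builds these from the scheme,
# the timestamp, and a uuid4 suffix.
bucket = "test.0xdata.com"
key = "h2o-hadoop-tests/test-export/s3a/exported.20190704-101530.123456.csv.zip"

client = boto3.client("s3")
# Polls HeadObject every 2 seconds, at most 10 attempts (up to ~20 s);
# raises botocore.exceptions.WaiterError if the key never appears.
client.get_waiter("object_exists").wait(
    Bucket=bucket,
    Key=key,
    WaiterConfig={"Delay": 2, "MaxAttempts": 10},
)
```

A failed wait surfaces as a WaiterError, a much clearer test failure than the nondeterministic import error it replaces. Note also that the `s3 = boto3.resource('s3')` handle is now created up front and reused by the cleanup `delete()` at the end of the loop, which is why the old "Delete the file afterwards" block disappears.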
h2o-hadoop-3/tests/python/pyunit_s3_import_export.py (+14 -4)
```diff
@@ -11,17 +11,27 @@
 
 def s3_import_export():
     local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
-    for scheme in ["s3a"]:  # s3n is deprecated since HDP3/CDH6
-        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
+    for scheme in ["s3a"]:  # s3n is deprecated since HDP3/CDH6
+        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S.%f")
         unique_suffix = str(uuid.uuid4())
         s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                   timestamp + "." + unique_suffix + ".csv.zip"
         h2o.export_file(local_frame, s3_path)
+
+        s3 = boto3.resource('s3')
+        client = boto3.client('s3')
+        # S3 might have a delay in indexing the file (usually milliseconds or hundreds of milliseconds).
+        # Wait for the file to be available; if it is not there at the beginning, retry every 2 seconds, up to 10 times.
+        client.get_waiter('object_exists').wait(Bucket='test.0xdata.com',
+                                                Key="h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
+                                                    timestamp + "." + unique_suffix + ".csv.zip",
+                                                WaiterConfig={
+                                                    'Delay': 2,
+                                                    'MaxAttempts': 10
+                                                })
         s3_frame = h2o.import_file(s3_path)
         assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
 
-        # Delete the file afterwards
-        s3 = boto3.resource('s3')
         s3.Object(bucket_name='test.0xdata.com', key="h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                   timestamp + "." + unique_suffix + ".csv.zip").delete()
 
```
