From d6521058addaf558660c091c8eb8e8d21f869056 Mon Sep 17 00:00:00 2001
From: Eric Arellano <ericarellano@me.com>
Date: Thu, 21 Nov 2019 16:48:37 -0700
Subject: [PATCH] Improve stability of CI through retries and tweaked timeouts
 (#8677)

CI is painfully flaky. While the most robust solution would be to fix the flakes themselves, we can provide some improved stability now through these changes:

* Install the `pytest-rerunfailures` plugin to allow us to mark tests as `@pytest.mark.flaky(retries=1)`
    * We should be careful to not abuse this. This plugin means that we are normalizing flakes even more than we have already.
    * We should never use `retries>1`. If a test is so flaky that it needs 3 runs to work, then it should be skipped, deleted, or fixed.
* Mark 6 tests as flaky.
* Bump the Rust queue_buffer_time from 150 s to 160 s. Yesterday, I saw the unit test shard fail over 10 tests due to time out, even though those tests only take 1 second to run each.
* Bump the max timeout from 540 seconds to 590 seconds for V1. Travis caps us at 600 seconds, so we just need to time out before that. The extra 50 seconds should reduce the number of time outs we get.
* Lower the timeout for `exception_sink_integration` so that fail eagerly when it hangs.
---
 pants.ini                                             | 11 ++++++++---
 pants.travis-ci.ini                                   |  4 ++--
 src/rust/engine/process_executor/src/main.rs          |  2 +-
 src/rust/engine/src/context.rs                        |  2 +-
 .../backend/graph_info/tasks/test_list_targets.py     |  3 +++
 tests/python/pants_test/base/BUILD                    |  4 +++-
 tests/python/pants_test/cache/test_artifact_cache.py  |  5 +++++
 .../pants_test/pantsd/test_pantsd_integration.py      | 10 +++++-----
 .../python/pants_test/pantsd/test_process_manager.py  |  2 ++
 9 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/pants.ini b/pants.ini
index 6c96454ce8c..72194d92698 100644
--- a/pants.ini
+++ b/pants.ini
@@ -400,9 +400,14 @@ resolver_cache_dir: %(pants_bootstrapdir)s/python_cache/requirements
 
 [pytest]
 version: pytest>=5.2.4
-# TODO(#8651): We need this until we switch to implicit namespace packages so that pytest-cov
-#  understands our __init__ files. NB: this version matches 3rdparty/python/requirements.txt.
-pytest_plugins: +["setuptools==40.6.3"]
+pytest_plugins: +[
+    # TODO(8528): implement a generic retry mechanism for the V2 test runner instead of using this
+    #  plugin.
+    "pytest-rerunfailures>=8.0",
+    # TODO(#8651): We need this until we switch to implicit namespace packages so that pytest-cov
+    #  understands our __init__ files. NB: this version matches 3rdparty/python/requirements.txt.
+    "setuptools==40.6.3",
+  ]
 
 
 [test.pytest]
diff --git a/pants.travis-ci.ini b/pants.travis-ci.ini
index 1118910e750..6ee4c68f9bb 100644
--- a/pants.travis-ci.ini
+++ b/pants.travis-ci.ini
@@ -11,9 +11,9 @@ execution_strategy: subprocess
 worker_count: 4
 
 [test.pytest]
-# NB: We set a maximum timeout of 9 minutes to fail before hitting Travis' 10 minute timeout (which
+# NB: We set a maximum timeout of 9.8 minutes to fail before hitting Travis' 10 minute timeout (which
 # doesn't give us useful debug info).
-timeout_maximum: 540
+timeout_maximum: 590
 
 
 [test.junit]
diff --git a/src/rust/engine/process_executor/src/main.rs b/src/rust/engine/process_executor/src/main.rs
index 226dc3d93af..1b8d3eeb1b8 100644
--- a/src/rust/engine/process_executor/src/main.rs
+++ b/src/rust/engine/process_executor/src/main.rs
@@ -361,7 +361,7 @@ fn main() {
           store.clone(),
           Platform::Linux,
           executor.clone(),
-          std::time::Duration::from_secs(150),
+          std::time::Duration::from_secs(160),
           std::time::Duration::from_millis(500),
           std::time::Duration::from_secs(5),
         )
diff --git a/src/rust/engine/src/context.rs b/src/rust/engine/src/context.rs
index adc4db8b48f..a1d5659a8a5 100644
--- a/src/rust/engine/src/context.rs
+++ b/src/rust/engine/src/context.rs
@@ -182,7 +182,7 @@ impl Core {
             // need to take an option all the way down here and into the remote::CommandRunner struct.
             Platform::Linux,
             executor.clone(),
-            std::time::Duration::from_secs(150),
+            std::time::Duration::from_secs(160),
             std::time::Duration::from_millis(500),
             std::time::Duration::from_secs(5),
           )?),
diff --git a/tests/python/pants_test/backend/graph_info/tasks/test_list_targets.py b/tests/python/pants_test/backend/graph_info/tasks/test_list_targets.py
index 2ffcd97533f..d8d6d34010b 100644
--- a/tests/python/pants_test/backend/graph_info/tasks/test_list_targets.py
+++ b/tests/python/pants_test/backend/graph_info/tasks/test_list_targets.py
@@ -4,6 +4,8 @@
 import os
 from textwrap import dedent
 
+import pytest
+
 from pants.backend.jvm.artifact import Artifact
 from pants.backend.jvm.repository import Repository
 from pants.backend.jvm.scala_artifact import ScalaArtifact
@@ -106,6 +108,7 @@ def test_list_descendants(self):
         'a/b/e:e1',
         args=['a/b::'])
 
+  @pytest.mark.flaky(retries=1)  # https://github.com/pantsbuild/pants/issues/8678
   def test_list_all(self):
     self.assert_entries('\n',
         'a:a',
diff --git a/tests/python/pants_test/base/BUILD b/tests/python/pants_test/base/BUILD
index 151ca5eb4d4..80b26178354 100644
--- a/tests/python/pants_test/base/BUILD
+++ b/tests/python/pants_test/base/BUILD
@@ -212,7 +212,9 @@ python_tests(
     'testprojects:pants_plugins_directory',
   ],
   tags = {'platform_specific_behavior', 'integration'},
-  timeout = 540,
+  # NB: This frequently times out, but due to hanging. So, we want to fail eagerly. See
+  # https://github.com/pantsbuild/pants/issues/8127.
+  timeout = 200,
 )
 
 python_tests(
diff --git a/tests/python/pants_test/cache/test_artifact_cache.py b/tests/python/pants_test/cache/test_artifact_cache.py
index e89fd24e7c8..75d26edd779 100644
--- a/tests/python/pants_test/cache/test_artifact_cache.py
+++ b/tests/python/pants_test/cache/test_artifact_cache.py
@@ -4,6 +4,8 @@
 import os
 from contextlib import contextmanager
 
+import pytest
+
 from pants.cache.artifact import TarballArtifact
 from pants.cache.artifact_cache import (
   NonfatalArtifactCacheError,
@@ -62,6 +64,7 @@ def test_local_cache(self):
     with self.setup_local_cache() as artifact_cache:
       self.do_test_artifact_cache(artifact_cache)
 
+  @pytest.mark.flaky(retries=1)  # https://github.com/pantsbuild/pants/issues/6838
   def test_restful_cache(self):
     with self.assertRaises(InvalidRESTfulCacheProtoError):
       RESTfulArtifactCache('foo', BestUrlSelector(['ftp://localhost/bar']), 'foo')
@@ -69,6 +72,7 @@ def test_restful_cache(self):
     with self.setup_rest_cache() as artifact_cache:
       self.do_test_artifact_cache(artifact_cache)
 
+  @pytest.mark.flaky(retries=1)  # https://github.com/pantsbuild/pants/issues/6838
   def test_restful_cache_failover(self):
     bad_url = 'http://badhost:123'
 
@@ -185,6 +189,7 @@ def test_local_backed_remote_cache_corrupt_artifact(self):
             self.assertTrue(os.path.exists(results_dir))
             self.assertTrue(len(os.listdir(results_dir)) == 0)
 
+  @pytest.mark.flaky(retries=1)  # https://github.com/pantsbuild/pants/issues/6838
   def test_multiproc(self):
     key = CacheKey('muppet_key', 'fake_hash')
 
diff --git a/tests/python/pants_test/pantsd/test_pantsd_integration.py b/tests/python/pants_test/pantsd/test_pantsd_integration.py
index 0e3ec0d880b..896fec815e4 100644
--- a/tests/python/pants_test/pantsd/test_pantsd_integration.py
+++ b/tests/python/pants_test/pantsd/test_pantsd_integration.py
@@ -6,12 +6,13 @@
 import os
 import re
 import signal
-import sys
 import threading
 import time
 import unittest
 from textwrap import dedent
 
+import pytest
+
 from pants.testutil.pants_run_integration_test import read_pantsd_log
 from pants.testutil.process_test_util import no_lingering_process_by_command
 from pants.util.contextutil import environment_as, temporary_dir, temporary_file
@@ -120,6 +121,7 @@ def test_pantsd_pantsd_runner_doesnt_die_after_failed_run(self):
         self.assert_success(self.run_pants_with_workdir(['help'], workdir, pantsd_config))
         checker.assert_running()
 
+  @pytest.mark.flaky(retries=1)  # https://github.com/pantsbuild/pants/issues/6114
   def test_pantsd_lifecycle_invalidation(self):
     """Runs pants commands with pantsd enabled, in a loop, alternating between options that
     should invalidate pantsd and incur a restart and then asserts for pid consistency.
@@ -392,13 +394,11 @@ def test_pantsd_pid_change(self):
       # Remove the pidfile so that the teardown script doesn't try to kill process 9.
       os.unlink(pidpath)
 
-  @unittest.skipIf(sys.version_info[0] == 2,
-                   reason='Increases by ~0.52 under python 2 as described in '
-                          'https://github.com/pantsbuild/pants/issues/7761.')
+  @pytest.mark.flaky(retries=1)  # https://github.com/pantsbuild/pants/issues/8193
   def test_pantsd_memory_usage(self):
     """Validates that after N runs, memory usage has increased by no more than X percent."""
     number_of_runs = 10
-    max_memory_increase_fraction = 0.40 # TODO https://github.com/pantsbuild/pants/issues/7647
+    max_memory_increase_fraction = 0.40  # TODO https://github.com/pantsbuild/pants/issues/7647
     with self.pantsd_successful_run_context() as (pantsd_run, checker, workdir, config):
       # NB: This doesn't actually run against all testprojects, only those that are in the chroot,
       # i.e. explicitly declared in this test file's BUILD.
diff --git a/tests/python/pants_test/pantsd/test_process_manager.py b/tests/python/pants_test/pantsd/test_process_manager.py
index 86d21c06a19..44cf6f9432b 100644
--- a/tests/python/pants_test/pantsd/test_process_manager.py
+++ b/tests/python/pants_test/pantsd/test_process_manager.py
@@ -10,6 +10,7 @@
 from contextlib import contextmanager
 
 import psutil
+import pytest
 
 from pants.pantsd.process_manager import (
   ProcessGroup,
@@ -140,6 +141,7 @@ def test_readwrite_metadata_by_name(self):
         self.TEST_VALUE_INT
       )
 
+  @pytest.mark.flaky(retries=1)  # https://github.com/pantsbuild/pants/issues/6836
   def test_deadline_until(self):
     with self.assertRaises(ProcessMetadataManager.Timeout):
       with self.captured_logging(logging.INFO) as captured: