Skip to content

Commit

Permalink
Improve stability of CI through retries and tweaked timeouts (pantsbu…
Browse files Browse the repository at this point in the history
…ild#8677)

CI is painfully flaky. While the most robust solution would be to fix the flakes themselves, we can provide some improved stability now through these changes:

* Install the `pytest-rerunfailures` plugin to allow us to mark tests as `@pytest.mark.flaky(retries=1)`
    * We should be careful to not abuse this. This plugin means that we are normalizing flakes even more than we have already.
    * We should never use `retries>1`. If a test is so flaky that it needs 3 runs to work, then it should be skipped, deleted, or fixed.
* Mark 6 tests as flaky.
* Bump the Rust queue_buffer_time from 150 s to 160 s. Yesterday, I saw the unit test shard fail over 10 tests due to time out, even though those tests only take 1 second to run each.  
* Bump the max timeout from 540 seconds to 590 seconds for V1. Travis caps us at 600 seconds, so we just need to time out before that. The extra 50 seconds should reduce the number of time outs we get.
* Lower the timeout for `exception_sink_integration` so that fail eagerly when it hangs.
  • Loading branch information
Eric-Arellano authored Nov 21, 2019
1 parent fc3fbd5 commit d652105
Show file tree
Hide file tree
Showing 9 changed files with 30 additions and 13 deletions.
11 changes: 8 additions & 3 deletions pants.ini
Original file line number Diff line number Diff line change
Expand Up @@ -400,9 +400,14 @@ resolver_cache_dir: %(pants_bootstrapdir)s/python_cache/requirements

[pytest]
version: pytest>=5.2.4
# TODO(#8651): We need this until we switch to implicit namespace packages so that pytest-cov
# understands our __init__ files. NB: this version matches 3rdparty/python/requirements.txt.
pytest_plugins: +["setuptools==40.6.3"]
pytest_plugins: +[
# TODO(8528): implement a generic retry mechanism for the V2 test runner instead of using this
# plugin.
"pytest-rerunfailures>=8.0",
# TODO(#8651): We need this until we switch to implicit namespace packages so that pytest-cov
# understands our __init__ files. NB: this version matches 3rdparty/python/requirements.txt.
"setuptools==40.6.3",
]


[test.pytest]
Expand Down
4 changes: 2 additions & 2 deletions pants.travis-ci.ini
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ execution_strategy: subprocess
worker_count: 4

[test.pytest]
# NB: We set a maximum timeout of 9 minutes to fail before hitting Travis' 10 minute timeout (which
# NB: We set a maximum timeout of 9.8 minutes to fail before hitting Travis' 10 minute timeout (which
# doesn't give us useful debug info).
timeout_maximum: 540
timeout_maximum: 590


[test.junit]
Expand Down
2 changes: 1 addition & 1 deletion src/rust/engine/process_executor/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ fn main() {
store.clone(),
Platform::Linux,
executor.clone(),
std::time::Duration::from_secs(150),
std::time::Duration::from_secs(160),
std::time::Duration::from_millis(500),
std::time::Duration::from_secs(5),
)
Expand Down
2 changes: 1 addition & 1 deletion src/rust/engine/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ impl Core {
// need to take an option all the way down here and into the remote::CommandRunner struct.
Platform::Linux,
executor.clone(),
std::time::Duration::from_secs(150),
std::time::Duration::from_secs(160),
std::time::Duration::from_millis(500),
std::time::Duration::from_secs(5),
)?),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import os
from textwrap import dedent

import pytest

from pants.backend.jvm.artifact import Artifact
from pants.backend.jvm.repository import Repository
from pants.backend.jvm.scala_artifact import ScalaArtifact
Expand Down Expand Up @@ -106,6 +108,7 @@ def test_list_descendants(self):
'a/b/e:e1',
args=['a/b::'])

@pytest.mark.flaky(retries=1) # https://github.com/pantsbuild/pants/issues/8678
def test_list_all(self):
self.assert_entries('\n',
'a:a',
Expand Down
4 changes: 3 additions & 1 deletion tests/python/pants_test/base/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,9 @@ python_tests(
'testprojects:pants_plugins_directory',
],
tags = {'platform_specific_behavior', 'integration'},
timeout = 540,
# NB: This frequently times out, but due to hanging. So, we want to fail eagerly. See
# https://github.com/pantsbuild/pants/issues/8127.
timeout = 200,
)

python_tests(
Expand Down
5 changes: 5 additions & 0 deletions tests/python/pants_test/cache/test_artifact_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import os
from contextlib import contextmanager

import pytest

from pants.cache.artifact import TarballArtifact
from pants.cache.artifact_cache import (
NonfatalArtifactCacheError,
Expand Down Expand Up @@ -62,13 +64,15 @@ def test_local_cache(self):
with self.setup_local_cache() as artifact_cache:
self.do_test_artifact_cache(artifact_cache)

@pytest.mark.flaky(retries=1) # https://github.com/pantsbuild/pants/issues/6838
def test_restful_cache(self):
with self.assertRaises(InvalidRESTfulCacheProtoError):
RESTfulArtifactCache('foo', BestUrlSelector(['ftp://localhost/bar']), 'foo')

with self.setup_rest_cache() as artifact_cache:
self.do_test_artifact_cache(artifact_cache)

@pytest.mark.flaky(retries=1) # https://github.com/pantsbuild/pants/issues/6838
def test_restful_cache_failover(self):
bad_url = 'http://badhost:123'

Expand Down Expand Up @@ -185,6 +189,7 @@ def test_local_backed_remote_cache_corrupt_artifact(self):
self.assertTrue(os.path.exists(results_dir))
self.assertTrue(len(os.listdir(results_dir)) == 0)

@pytest.mark.flaky(retries=1) # https://github.com/pantsbuild/pants/issues/6838
def test_multiproc(self):
key = CacheKey('muppet_key', 'fake_hash')

Expand Down
10 changes: 5 additions & 5 deletions tests/python/pants_test/pantsd/test_pantsd_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
import os
import re
import signal
import sys
import threading
import time
import unittest
from textwrap import dedent

import pytest

from pants.testutil.pants_run_integration_test import read_pantsd_log
from pants.testutil.process_test_util import no_lingering_process_by_command
from pants.util.contextutil import environment_as, temporary_dir, temporary_file
Expand Down Expand Up @@ -120,6 +121,7 @@ def test_pantsd_pantsd_runner_doesnt_die_after_failed_run(self):
self.assert_success(self.run_pants_with_workdir(['help'], workdir, pantsd_config))
checker.assert_running()

@pytest.mark.flaky(retries=1) # https://github.com/pantsbuild/pants/issues/6114
def test_pantsd_lifecycle_invalidation(self):
"""Runs pants commands with pantsd enabled, in a loop, alternating between options that
should invalidate pantsd and incur a restart and then asserts for pid consistency.
Expand Down Expand Up @@ -392,13 +394,11 @@ def test_pantsd_pid_change(self):
# Remove the pidfile so that the teardown script doesn't try to kill process 9.
os.unlink(pidpath)

@unittest.skipIf(sys.version_info[0] == 2,
reason='Increases by ~0.52 under python 2 as described in '
'https://github.com/pantsbuild/pants/issues/7761.')
@pytest.mark.flaky(retries=1) # https://github.com/pantsbuild/pants/issues/8193
def test_pantsd_memory_usage(self):
"""Validates that after N runs, memory usage has increased by no more than X percent."""
number_of_runs = 10
max_memory_increase_fraction = 0.40 # TODO https://github.com/pantsbuild/pants/issues/7647
max_memory_increase_fraction = 0.40 # TODO https://github.com/pantsbuild/pants/issues/7647
with self.pantsd_successful_run_context() as (pantsd_run, checker, workdir, config):
# NB: This doesn't actually run against all testprojects, only those that are in the chroot,
# i.e. explicitly declared in this test file's BUILD.
Expand Down
2 changes: 2 additions & 0 deletions tests/python/pants_test/pantsd/test_process_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from contextlib import contextmanager

import psutil
import pytest

from pants.pantsd.process_manager import (
ProcessGroup,
Expand Down Expand Up @@ -140,6 +141,7 @@ def test_readwrite_metadata_by_name(self):
self.TEST_VALUE_INT
)

@pytest.mark.flaky(retries=1) # https://github.com/pantsbuild/pants/issues/6836
def test_deadline_until(self):
with self.assertRaises(ProcessMetadataManager.Timeout):
with self.captured_logging(logging.INFO) as captured:
Expand Down

0 comments on commit d652105

Please sign in to comment.