Revert "Reduce pytest prints (pytorch#117069)"
This reverts commit 2f89ef2.

Reverted pytorch#117069 on behalf of https://github.com/clee2000 due to distributed tests are not printing items ([comment](pytorch#117069 (comment)))
pytorchmergebot committed Jan 19, 2024
1 parent a468b9f commit 77cfaca
Showing 7 changed files with 46 additions and 122 deletions.
5 changes: 5 additions & 0 deletions .ci/docker/requirements-ci.txt
@@ -162,6 +162,11 @@ pytest-xdist==3.3.1
#Pinned versions:
#test that import:

pytest-shard==0.1.2
#Description: plugin splitting up tests in pytest
#Pinned versions:
#test that import:

pytest-flakefinder==1.1.0
#Description: plugin for rerunning tests a fixed number of times in pytest
#Pinned versions: 1.1.0
1 change: 1 addition & 0 deletions .github/requirements/pip-requirements-macOS.txt
@@ -16,6 +16,7 @@ pytest==7.3.2
pytest-xdist==3.3.1
pytest-rerunfailures==10.3
pytest-flakefinder==1.1.0
pytest-shard==0.1.2
scipy==1.10.1
sympy==1.11.1
unittest-xml-reporting<=3.2.0,>=2.0.0
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -225,7 +225,7 @@ jobs:
cache: pip
- name: Install dependencies
run: |
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.*
pip install pytest-rerunfailures==11.1.* pytest-shard==0.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.*
pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
- name: Run run_test.py (nonretryable)
run: |
2 changes: 0 additions & 2 deletions pytest.ini
@@ -8,8 +8,6 @@ addopts =
--capture=sys
# don't suppress warnings, but don't shove them all to the end either
-p no:warnings
# Use custom pytest shard located in test/pytest_shard_custom.py instead
-p no:pytest-shard
# don't rewrite assertions (usually not a problem in CI due to differences in imports, see #95844)
--assert=plain
testpaths =
4 changes: 0 additions & 4 deletions test/conftest.py
@@ -19,7 +19,6 @@
import json
import re
from collections import defaultdict
from pytest_shard_custom import PytestShardPlugin, pytest_addoptions as shard_addoptions

# a lot of this file is copied from _pytest.junitxml and modified to get rerun info

@@ -85,7 +84,6 @@ def pytest_addoption(parser: Parser) -> None:
"Emit XML for schema: one of legacy|xunit1|xunit2",
default="xunit2",
)
shard_addoptions(parser)


def pytest_configure(config: Config) -> None:
@@ -107,8 +105,6 @@ def pytest_configure(config: Config) -> None:
config.option.stepcurrent = config.getoption("stepcurrent_skip")
if config.getoption("stepcurrent"):
config.pluginmanager.register(StepcurrentPlugin(config), "stepcurrentplugin")
if config.getoption("num_shards"):
config.pluginmanager.register(PytestShardPlugin(config), "pytestshardplugin")


def pytest_unconfigure(config: Config) -> None:
67 changes: 0 additions & 67 deletions test/pytest_shard_custom.py

This file was deleted.
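
For context, a sharding plugin of this kind splits the collected test items across N shards and deselects everything outside the requested shard. The sketch below is only a generic illustration of that idea, not the contents of the deleted pytest_shard_custom.py; the option names, the 1-indexed shard convention, and the round-robin split are assumptions.

```python
# Generic sketch of pytest test sharding (illustrative only, not the deleted
# pytest_shard_custom.py). A conftest.py or plugin module defining these two
# hooks keeps only the items that belong to the requested shard.
def pytest_addoption(parser):
    parser.addoption("--shard-id", type=int, default=1, help="shard to run (assumed 1-indexed)")
    parser.addoption("--num-shards", type=int, default=1, help="total number of shards")


def pytest_collection_modifyitems(config, items):
    num_shards = config.getoption("--num-shards")
    shard_id = config.getoption("--shard-id")
    if num_shards <= 1:
        return
    keep, drop = [], []
    for index, item in enumerate(items):
        # Round-robin assignment of collected tests to shards (an assumption;
        # real plugins may hash test ids instead).
        (keep if index % num_shards == shard_id - 1 else drop).append(item)
    items[:] = keep
    if drop:
        config.hook.pytest_deselected(items=drop)
```

The pytest-shard package restored by this revert takes a 0-indexed `--shard-id`, which is consistent with the `test_module.shard - 1` change in the run_test.py hunk below.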

87 changes: 39 additions & 48 deletions test/run_test.py
@@ -517,11 +517,11 @@ def run_test(
else:
unittest_args.extend(
[
f"--shard-id={test_module.shard}",
f"--shard-id={test_module.shard - 1}",
f"--num-shards={test_module.num_shards}",
]
)
stepcurrent_key = f"{test_file}_{test_module.shard}_{os.urandom(8).hex()}"
stepcurrent_key = f"{test_file}_{test_module.shard - 1}_{os.urandom(8).hex()}"

if options.verbose:
unittest_args.append(f'-{"v"*options.verbose}') # in case of pytest
@@ -540,6 +540,7 @@
unittest_args.extend(
get_pytest_args(
options,
stepcurrent_key,
is_cpp_test=is_cpp_test,
is_distributed_test=is_distributed_test,
)
@@ -602,15 +603,15 @@ def run_test(
os.close(log_fd)

command = (launcher_cmd or []) + executable + argv
should_retry = "--subprocess" not in command and not RERUN_DISABLED_TESTS
num_retries = 0 if "--subprocess" in command or RERUN_DISABLED_TESTS else 2
is_slow = "slow" in os.environ.get("TEST_CONFIG", "") or "slow" in os.environ.get(
"BUILD_ENVRIONMENT", ""
)
timeout = (
THRESHOLD * 6
if is_slow
else THRESHOLD * 3
if should_retry
if num_retries > 0
and isinstance(test_module, ShardedTest)
and test_module.time is not None
else None
@@ -622,25 +623,20 @@
if IS_CI:
output = stack.enter_context(open(log_path, "w"))

if should_retry:
ret_code, was_rerun = run_test_retries(
command,
test_directory,
env,
timeout,
stepcurrent_key,
output,
options.continue_through_error,
if options.continue_through_error and "--subprocess" not in command:
# I think subprocess is better handled by common_utils? but it's not working atm
ret_code, was_rerun = run_test_continue_through_error(
command, test_directory, env, timeout, stepcurrent_key, output
)
else:
command.append(f"--sc={stepcurrent_key}")
ret_code, was_rerun = retry_shell(
command,
test_directory,
stdout=output,
stderr=output,
env=env,
timeout=timeout,
retries=num_retries,
)

# Pytest return code 5 means no test is collected. This is needed
@@ -659,37 +655,23 @@
return ret_code


def run_test_retries(
command,
test_directory,
env,
timeout,
stepcurrent_key,
output,
continue_through_error,
def run_test_continue_through_error(
command, test_directory, env, timeout, stepcurrent_key, output
):
# Run the test with -x to stop at first failure. Try again, skipping the
# previously run tests, repeating this until there is a test that fails 3
# times (same number of retries we typically give).
#
# If continue through error is not set, then we fail fast.
#
# If continue through error is set, then we skip that test, and keep going.
# Basically if the same test fails 3 times in a row, skip the test on the
# next run, but still fail in the end. I take advantage of the value saved
# in stepcurrent to keep track of the most recently run test (which is the
# one that failed if there was a failure).

def print_to_file(s):
print(s, file=output, flush=True)
# times (same number of retries we typically give). Then we skip that
# test, and keep going. Basically if the same test fails 3 times in a row,
# skip the test on the next run, but still fail in the end. I take advantage
# of the value saved in stepcurrent to keep track of the most recently run
# test (which is the one that failed if there was a failure).

num_failures = defaultdict(int)

print_items = ["--print-items"]
sc_command = f"--sc={stepcurrent_key}"
while True:
ret_code = shell(
command + [sc_command] + print_items,
command + [sc_command],
test_directory,
stdout=output,
stderr=output,
@@ -700,7 +682,11 @@ def print_to_file(s):
if ret_code == 0:
break # Got to the end of the test suite successfully
signal_name = f" ({SIGNALS_TO_NAMES_DICT[-ret_code]})" if ret_code < 0 else ""
print_to_file(f"Got exit code {ret_code}{signal_name}")
print(
f"Got exit code {ret_code}{signal_name}, retrying...",
file=output,
flush=True,
)

# Read what just failed
with open(
@@ -710,24 +696,25 @@

num_failures[current_failure] += 1
if num_failures[current_failure] >= 3:
if not continue_through_error:
print_to_file("Stopping at first consistent failure")
break
sc_command = f"--scs={stepcurrent_key}"
else:
sc_command = f"--sc={stepcurrent_key}"
print_to_file("Retrying...")
print_items = [] # do not continue printing them, massive waste of space

consistent_failures = [x[1:-1] for x in num_failures.keys() if num_failures[x] >= 3]
flaky_failures = [x[1:-1] for x in num_failures.keys() if 0 < num_failures[x] < 3]
if len(flaky_failures) > 0:
print_to_file(
"The following tests failed and then succeeded when run in a new process"
+ f"{flaky_failures}",
print(
"The following tests failed flakily (had to be rerun in a new process,"
+ f" doesn't include reruns froms same process): {flaky_failures}",
file=output,
flush=True,
)
if len(consistent_failures) > 0:
print_to_file(f"The following tests failed consistently: {consistent_failures}")
print(
f"The following tests failed consistently: {consistent_failures}",
file=output,
flush=True,
)
return 1, True
return 0, any(x > 0 for x in num_failures.values())

@@ -1053,7 +1040,9 @@ def handle_log_file(
os.remove(file_path)


def get_pytest_args(options, is_cpp_test=False, is_distributed_test=False):
def get_pytest_args(
options, stepcurrent_key, is_cpp_test=False, is_distributed_test=False
):
if RERUN_DISABLED_TESTS:
# Distributed tests are too slow, so running them x50 will cause the jobs to timeout after
# 3+ hours. So, let's opt for less number of reruns. We need at least 150 instances of the
@@ -1074,8 +1063,9 @@ def get_pytest_args(options, is_cpp_test=False, is_distributed_test=False):
]
if not is_cpp_test:
# C++ tests need to be run with pytest directly, not via python
# We have a custom pytest shard that conflicts with the normal plugin
pytest_args.extend(["-p", "no:xdist", "--use-pytest"])
if not options.continue_through_error and IS_CI:
pytest_args.append(f"--sc={stepcurrent_key}")
else:
# Use pytest-xdist to run C++ tests in parallel as running them sequentially using run_test
# is much slower than running them directly
@@ -1648,6 +1638,7 @@ def parallel_test_completion_callback(failure):
def check_pip_packages() -> None:
packages = [
"pytest-rerunfailures",
"pytest-shard",
"pytest-flakefinder",
"pytest-xdist",
]
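
For context, the retry flow that this revert restores in run_test.py (run_test_continue_through_error plus the --sc handling in get_pytest_args) can be summarized with the minimal sketch below. It is an illustration, not the actual code: `shell()` stands in for the repo's subprocess wrapper, `cache_file` and `rerun_until_done` are illustrative names, and `--sc`/`--scs` are custom stepcurrent options registered in test/conftest.py (resume from / skip past the saved position), not stock pytest flags.

```python
import subprocess
from collections import defaultdict


def shell(command, cwd):
    # Stand-in for run_test.py's subprocess wrapper.
    return subprocess.run(command, cwd=cwd).returncode


def rerun_until_done(command, test_directory, stepcurrent_key, cache_file):
    """Illustrative version of the restored continue-through-error loop."""
    num_failures = defaultdict(int)
    sc_flag = f"--sc={stepcurrent_key}"  # resume from the most recently run test
    while True:
        ret_code = shell(command + [sc_flag], test_directory)
        if ret_code == 0:
            break  # reached the end of the test suite
        # The stepcurrent plugin records the last test it ran; after a
        # failure, that is the test that failed.
        with open(cache_file) as f:
            current_failure = f.read().strip()
        num_failures[current_failure] += 1
        if num_failures[current_failure] >= 3:
            # Same test failed three times in a row: skip it on the next run
            # (--scs), but still report an overall failure at the end.
            sc_flag = f"--scs={stepcurrent_key}"
        else:
            sc_flag = f"--sc={stepcurrent_key}"
    consistent_failures = [t for t, n in num_failures.items() if n >= 3]
    return 1 if consistent_failures else 0
```

Because pytest is invoked with -x (stop at first failure), each iteration either finishes the suite or stops at exactly one failing test, which is what makes the per-test failure counting above work.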