Implement test filtering (Lightning-AI#742)
carmocca authored Nov 17, 2023
1 parent f475bd5 commit 0a202f6
Showing 13 changed files with 205 additions and 83 deletions.
12 changes: 11 additions & 1 deletion .github/azure-gpu-test.yml
@@ -53,4 +53,14 @@ jobs:
displayName: "Env details"
- bash: pytest -v --disable-pytest-warnings --strict-markers --color=yes
displayName: 'Testing'
displayName: 'Ordinary tests'
env:
PL_RUN_CUDA_TESTS: "1"
timeoutInMinutes: "5"

- bash: bash run_standalone_tests.sh
workingDirectory: tests
env:
PL_RUN_CUDA_TESTS: "1"
displayName: "Standalone tests"
timeoutInMinutes: "5"
10 changes: 6 additions & 4 deletions .github/workflows/cpu-tests.yml
@@ -56,15 +56,17 @@ jobs:
echo "$modules"
python -c "$modules"
- name: Run tests without the package installed
- name: Install all dependencies
run: |
pip install -r requirements-all.txt pytest pytest-rerunfailures transformers einops protobuf
pip install -r requirements-all.txt pytest pytest-rerunfailures pytest-timeout transformers einops protobuf
pip list
pytest -v --disable-pytest-warnings --strict-markers --color=yes
- name: Run tests without the package installed
run: |
pytest -v --disable-pytest-warnings --strict-markers --color=yes --timeout 60
- name: Run tests
run: |
pip install . --no-deps
pytest -v --disable-pytest-warnings --strict-markers --color=yes
pytest -v --disable-pytest-warnings --strict-markers --color=yes --timeout 60
15 changes: 7 additions & 8 deletions pretrain/tinyllama.py
@@ -161,14 +161,13 @@ def train(fabric, state, train_dataloader, val_dataloader, resume):
if curr_iter < initial_iter:
curr_iter += 1
continue
else:
resume = False
curr_iter = -1
fabric.barrier()
fabric.print(
"Resuming data loader finished."
f"Took {time.perf_counter() - total_t0:.1f} seconds to reach iteration {initial_iter}."
)
resume = False
curr_iter = -1
fabric.barrier()
fabric.print(
"Resuming data loader finished."
f"Took {time.perf_counter() - total_t0:.1f} seconds to reach iteration {initial_iter}."
)

if state["iter_num"] >= max_iters:
break
52 changes: 52 additions & 0 deletions tests/conftest.py
@@ -1,9 +1,11 @@
import os
import sys
from pathlib import Path
from typing import List

import pytest
import torch
from lightning.fabric.utilities.testing import _runif_reasons

wd = Path(__file__).parent.parent.absolute()

@@ -40,3 +42,53 @@ def tensor_like():
def restore_default_dtype():
# just in case
torch.set_default_dtype(torch.float32)


def RunIf(**kwargs):
reasons, marker_kwargs = _runif_reasons(**kwargs)
return pytest.mark.skipif(condition=len(reasons) > 0, reason=f"Requires: [{' + '.join(reasons)}]", **marker_kwargs)


# https://github.com/Lightning-AI/lightning/blob/6e517bd55b50166138ce6ab915abd4547702994b/tests/tests_fabric/conftest.py#L140
def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config) -> None:
initial_size = len(items)
conditions = []
filtered, skipped = 0, 0

options = {"standalone": "PL_RUN_STANDALONE_TESTS", "min_cuda_gpus": "PL_RUN_CUDA_TESTS"}
if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1":
# special case: we don't have a CPU job for standalone tests, so we shouldn't run only cuda tests.
# by deleting the key, we avoid filtering out the CPU tests
del options["min_cuda_gpus"]

for kwarg, env_var in options.items():
# this will compute the intersection of all tests selected per environment variable
if os.getenv(env_var, "0") == "1":
conditions.append(env_var)
for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items
already_skipped = any(marker.name == "skip" for marker in test.own_markers)
if already_skipped:
# the test was going to be skipped anyway, filter it out
items.pop(i)
skipped += 1
continue
has_runif_with_kwarg = any(
marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
)
if not has_runif_with_kwarg:
# the test doesn't have `@RunIf(kwarg=True)`, filter it out
items.pop(i)
filtered += 1

if config.option.verbose >= 0 and (filtered or skipped):
writer = config.get_terminal_writer()
writer.write(
(
f"\nThe number of tests has been filtered from {initial_size} to {initial_size - filtered} after the"
f" filters {conditions}.\n{skipped} tests are marked as unconditional skips.\nIn total,"
f" {len(items)} tests will run.\n"
),
flush=True,
bold=True,
purple=True, # oh yeah, branded pytest messages
)
78 changes: 78 additions & 0 deletions tests/run_standalone_tests.sh
@@ -0,0 +1,78 @@
#!/bin/bash
set -e

# Batch size for testing: Determines how many standalone test invocations run in parallel
# It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE
test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-1}"

# this environment variable allows special tests to run
export PL_RUN_STANDALONE_TESTS=1
# python arguments
defaults="-m pytest --no-header -v --disable-pytest-warnings --strict-markers --color=yes -s --timeout 120"
echo "Using defaults: ${defaults}"

# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')

# file paths, remove duplicates
files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq)

# get the list of parametrizations. we need to call them separately. the last two lines are removed.
# note: if there's a syntax error, this will fail with some garbled output
if [[ "$OSTYPE" == "darwin"* ]]; then
parametrizations=$(python3 -m pytest $files --collect-only --quiet --disable-pytest-warnings "$@" | tail -r | sed -e '1,3d' | tail -r)
else
parametrizations=$(python3 -m pytest $files --collect-only --quiet --disable-pytest-warnings "$@" | head -n -2)
fi
# remove the "tests/" path suffix
path_suffix=$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
parametrizations=${parametrizations//$path_suffix/}
parametrizations_arr=($parametrizations)

report=''

rm -f standalone_test_output.txt # in case it exists, remove it
function show_batched_output {
if [ -f standalone_test_output.txt ]; then # if exists
cat standalone_test_output.txt
# heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail
if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qvE 'on_exception|xfailed'; then
echo "Potential error! Stopping."
rm standalone_test_output.txt
exit 1
fi
rm standalone_test_output.txt
fi
}
trap show_batched_output EXIT # show the output on exit

for i in "${!parametrizations_arr[@]}"; do
parametrization=${parametrizations_arr[$i]}
prefix="$((i+1))/${#parametrizations_arr[@]}"

echo "$prefix: Running $parametrization"
# execute the test in the background
# redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
# output to std{out,err} because the outputs would be garbled together
python3 ${defaults} "$parametrization" &>> standalone_test_output.txt &
# save the PID in an array
pids[${i}]=$!
# add row to the final report
report+="Ran\t$parametrization\n"

if ((($i + 1) % $test_batch_size == 0)); then
# wait for running tests
for pid in ${pids[*]}; do wait $pid; done
unset pids # empty the array
show_batched_output
fi
done
# wait for leftover tests
for pid in ${pids[*]}; do wait $pid; done
show_batched_output

# echo test report
printf '=%.s' {1..80}
printf "\n$report"
printf '=%.s' {1..80}
printf '\n'
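
The "Standalone tests" step in azure-gpu-test.yml runs this script from the tests directory with PL_RUN_CUDA_TESTS set; a rough local equivalent (the batch size of 2 is an illustrative assumption, not taken from the commit):

cd tests
# PL_STANDALONE_TESTS_BATCH_SIZE controls how many standalone pytest invocations run in parallel (default 1)
PL_RUN_CUDA_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=2 bash run_standalone_tests.sh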
5 changes: 2 additions & 3 deletions tests/test_adapter.py
@@ -1,11 +1,10 @@
import sys
from contextlib import redirect_stdout
from dataclasses import asdict
from io import StringIO
from unittest.mock import Mock

import pytest
import torch
from conftest import RunIf
from lightning import Fabric


@@ -106,7 +105,7 @@ def test_adapter_gpt_init_weights():
assert (param == 0).all()


@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform")
@RunIf(dynamo=True)
@torch.inference_mode()
def test_adapter_compile():
from lit_gpt.adapter import GPT
5 changes: 2 additions & 3 deletions tests/test_adapter_v2.py
@@ -1,10 +1,9 @@
import sys
from contextlib import redirect_stdout
from io import StringIO
from unittest.mock import Mock

import pytest
import torch
from conftest import RunIf
from lightning import Fabric


@@ -135,7 +134,7 @@ def test_base_model_can_be_adapter_v2_loaded():
assert adapter_filter(k, None)


@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform")
@RunIf(dynamo=True)
@torch.inference_mode()
def test_adapter_v2_compile():
from lit_gpt.adapter_v2 import GPT
17 changes: 17 additions & 0 deletions tests/test_ci.py
@@ -0,0 +1,17 @@
# this file is just to validate on the CI logs that these tests were run
from conftest import RunIf


@RunIf(min_cuda_gpus=1)
def test_runif_min_cuda_gpus():
assert True


@RunIf(min_cuda_gpus=1, standalone=True)
def test_runif_min_cuda_gpus_standalone():
assert True


@RunIf(standalone=True)
def test_runif_standalone():
assert True
18 changes: 3 additions & 15 deletions tests/test_convert_lit_checkpoint.py
@@ -1,5 +1,4 @@
import json
import operator
import os
from dataclasses import asdict
from pathlib import Path
@@ -8,7 +7,6 @@

import pytest
import torch
from lightning_utilities import compare_version

wd = Path(__file__).parent.parent.absolute()

@@ -115,18 +113,7 @@ def test_against_original_gpt_neox():

@torch.inference_mode()
@pytest.mark.parametrize(
"ours_kwargs",
[
{"name": "Llama-2-7b-hf"},
pytest.param(
{"name": "CodeLlama-7b-hf"},
marks=pytest.mark.skipif(
compare_version("transformers", operator.lt, "4.33.0", use_base_version=True),
reason="requires rope_theta",
),
),
{"name": "Llama-2-70b-chat-hf"},
],
"ours_kwargs", [{"name": "Llama-2-7b-hf"}, {"name": "CodeLlama-7b-hf"}, {"name": "Llama-2-70b-chat-hf"}]
)
def test_against_hf_llama2(ours_kwargs):
from transformers.models.llama.configuration_llama import LlamaConfig
@@ -207,9 +194,10 @@ def test_against_hf_phi():
if not file_path.is_file():
urlretrieve(url=url, filename=file_path)

from original_phi_1_5 import MixFormerSequentialConfig, MixFormerSequentialForCausalLM

from lit_gpt import GPT, Config
from scripts.convert_lit_checkpoint import copy_weights_phi
from tests.original_phi_1_5 import MixFormerSequentialConfig, MixFormerSequentialForCausalLM

ours_config = Config.from_name(
"phi-1_5", padded_vocab_size=10000, n_layer=2, n_head=4, n_embd=256, rotary_percentage=0.5
4 changes: 2 additions & 2 deletions tests/test_gptq.py
@@ -1,10 +1,10 @@
import lightning as L
import pytest
import torch
from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2
from conftest import RunIf


@pytest.mark.skipif(_TORCH_GREATER_EQUAL_2_2, reason="Core dumped")
@RunIf(max_torch="2.2") # TODO: core dumped
def test_gptq_blockwise_quantization():
from quantize.gptq import _TRITON_AVAILABLE

6 changes: 3 additions & 3 deletions tests/test_lora.py
@@ -1,11 +1,11 @@
import sys
from contextlib import redirect_stdout
from io import StringIO
from itertools import product
from unittest.mock import Mock

import pytest
import torch
from conftest import RunIf
from lightning import Fabric


@@ -351,7 +351,7 @@ def test_lora_qkv_linear_weights_merged_status(rank, enable_lora, expected_merged):
assert layer.merged == expected_merged


@pytest.mark.skipif(not torch.cuda.is_available(), reason="8bit requires CUDA")
@RunIf(min_cuda_gpus=1)
# platform dependent cuda issue: libbitsandbytes_cpu.so: undefined symbol: cquantize_blockwise_fp16_nf4
@pytest.mark.xfail(raises=AttributeError, strict=False)
# https://github.com/Lightning-AI/lit-gpt/issues/513
@@ -456,7 +456,7 @@ def test_base_model_can_be_lora_loaded():
assert lora_filter(k, None)


@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform")
@RunIf(dynamo=True)
@torch.inference_mode()
def test_lora_compile():
from lit_gpt.lora import GPT