Skip to content

Commit

Permalink
Separate GPU test jobs into general and Ampere-specific (iree-org#14251)
Browse files Browse the repository at this point in the history
This will be followed by switching the generic runners to T4
machines. I think we need to roll things out this way, with the extra
job also running on A100.

Part of iree-org#14169
  • Loading branch information
GMNGeoffrey authored Jun 29, 2023
1 parent 7b28a41 commit 465492e
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 42 deletions.
59 changes: 40 additions & 19 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -274,16 +274,28 @@ jobs:
test_gpu:
needs: [setup, build_all]
if: fromJson(needs.setup.outputs.should-run)
env:
BUILD_DIR: ${{ needs.build_all.outputs.build-dir }}
BUILD_DIR_ARCHIVE: ${{ needs.build_all.outputs.build-dir-archive }}
BUILD_DIR_GCS_ARTIFACT: ${{ needs.build_all.outputs.build-dir-gcs-artifact }}
strategy:
matrix:
target:
- runner-type: gpu
iree-ctest-label-regex: ^requires-gpu|^driver=vulkan$|^driver=cuda$
iree-nvidia-sm80-tests-disable: 1
- runner-type: a100
iree-ctest-label-regex: ^requires-gpu-sm80$
iree-nvidia-sm80-tests-disable: 0
# Run other jobs even if one fails.
fail-fast: false
name: test_${{ matrix.target.runner-type }}
runs-on:
- self-hosted # must come first
- runner-group=${{ needs.setup.outputs.runner-group }}
- environment=${{ needs.setup.outputs.runner-env }}
- gpu
- ${{ matrix.target.runner-type }}
- os-family=Linux
env:
BUILD_DIR: ${{ needs.build_all.outputs.build-dir }}
BUILD_DIR_ARCHIVE: ${{ needs.build_all.outputs.build-dir-archive }}
BUILD_DIR_GCS_ARTIFACT: ${{ needs.build_all.outputs.build-dir-gcs-artifact }}
steps:
- name: "Checking out repository"
uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
Expand All @@ -297,20 +309,25 @@ jobs:
run: gcloud storage cp "${BUILD_DIR_GCS_ARTIFACT}" "${BUILD_DIR_ARCHIVE}"
- name: "Extracting build dir archive"
run: tar -xf "${BUILD_DIR_ARCHIVE}"
- name: "Testing with GPU"
- name: "Testing all"
env:
IREE_NVIDIA_SM80_TESTS_DISABLE: ${{ matrix.target.iree-nvidia-sm80-tests-disable }}
IREE_CTEST_LABEL_REGEX: ${{ matrix.target.iree-ctest-label-regex }}
run: |
./build_tools/github_actions/docker_run.sh \
--env IREE_VULKAN_F16_DISABLE=0 \
--env IREE_CUDA_DISABLE=0 \
--env IREE_NVIDIA_GPU_TESTS_DISABLE=0 \
--env CTEST_PARALLEL_LEVEL=2 \
--gpus all \
--env NVIDIA_DRIVER_CAPABILITIES=all \
gcr.io/iree-oss/nvidia@sha256:de6e4453614aa48059fd611d7e7255f4d6ac27ac29a47aabdc04191ec1758533 \
bash -euo pipefail -c \
"./build_tools/scripts/check_cuda.sh
./build_tools/scripts/check_vulkan.sh
./build_tools/cmake/ctest_all.sh ${BUILD_DIR}"
--env IREE_NVIDIA_SM80_TESTS_DISABLE \
--env IREE_CTEST_LABEL_REGEX \
--env IREE_VULKAN_F16_DISABLE=0 \
--env IREE_CUDA_DISABLE=0 \
--env IREE_NVIDIA_GPU_TESTS_DISABLE=0 \
--env CTEST_PARALLEL_LEVEL=2 \
--env NVIDIA_DRIVER_CAPABILITIES=all \
--gpus all \
gcr.io/iree-oss/nvidia@sha256:1717431fd46b8b1e96d95fa72508e3e3eacb5c95f1245b9b7dbeec23ae823d02 \
bash -euo pipefail -c \
"./build_tools/scripts/check_cuda.sh
./build_tools/scripts/check_vulkan.sh
./build_tools/cmake/ctest_all.sh ${BUILD_DIR}"
################################## Subsets ###################################
# Jobs that build some subset of IREE
Expand Down Expand Up @@ -1003,11 +1020,15 @@ jobs:

# Basic
- build_all
- test_all
- build_test_all_bazel

# Platforms
- build_test_all_windows
- build_test_all_macos_arm64
- build_test_all_macos_x86_64
- build_test_all_bazel
- test_all

# Accelerators
- test_gpu

# Subsets
Expand Down
4 changes: 2 additions & 2 deletions build_tools/bazel/build_core.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@ default_test_tag_filters+=("-vulkan_uses_vk_khr_shader_float16_int8")
# CUDA CI testing disabled until we set up a target for it.
default_test_tag_filters+=("-driver=cuda")

if [[ "${IREE_VULKAN_DISABLE?}" == 1 ]]; then
if (( IREE_VULKAN_DISABLE == 1 )); then
default_test_tag_filters+=("-driver=vulkan")
fi
if [[ "${IREE_NVIDIA_GPU_TESTS_DISABLE?}" == 1 ]]; then
if (( IREE_NVIDIA_GPU_TESTS_DISABLE == 1 )); then
default_test_tag_filters+=("-requires-gpu-nvidia" "-requires-gpu-sm80")
fi

Expand Down
8 changes: 4 additions & 4 deletions build_tools/cmake/build_and_test_asan.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,13 @@ for asan_in_bytecode_modules_ON_OFF in OFF ON; do

# IREE_VULKAN_DISABLE is handled separately as we run Vulkan and non-Vulkan
# tests in separate ctest commands anyway.
if [[ "${IREE_CUDA_DISABLE?}" == 1 ]]; then
if (( IREE_CUDA_DISABLE == 1 )); then
label_exclude_args+=("^driver=cuda$")
fi
if [[ "${IREE_VULKAN_F16_DISABLE?}" == 1 ]]; then
if (( IREE_VULKAN_F16_DISABLE == 1 )); then
label_exclude_args+=("^vulkan_uses_vk_khr_shader_float16_int8$")
fi
if [[ "${IREE_NVIDIA_GPU_TESTS_DISABLE}" == 1 ]]; then
if (( IREE_NVIDIA_GPU_TESTS_DISABLE == 1 )); then
label_exclude_args+=("^requires-gpu")
fi

Expand All @@ -138,7 +138,7 @@ for asan_in_bytecode_modules_ON_OFF in OFF ON; do
echo "------------------"
cmake --build . --target check-iree-dialects -- -k 0

if [[ "${IREE_VULKAN_DISABLE?}" == 0 ]]; then
if (( IREE_VULKAN_DISABLE == 0 )); then
echo "*** Running ctests that use the Vulkan driver, with LSAN disabled (IREE_BYTECODE_MODULE_ENABLE_ASAN=${asan_in_bytecode_modules_ON_OFF}) ***"
echo "------------------"
# Disable LeakSanitizer (LSAN) because of a history of issues with Swiftshader
Expand Down
41 changes: 27 additions & 14 deletions build_tools/cmake/ctest_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,25 @@ get_default_parallel_level() {
}

# Respect the user setting, but default to as many jobs as we have cores.
export CTEST_PARALLEL_LEVEL=${CTEST_PARALLEL_LEVEL:-$(get_default_parallel_level)}
export CTEST_PARALLEL_LEVEL="${CTEST_PARALLEL_LEVEL:-$(get_default_parallel_level)}"

# Respect the user setting, but default to turning on Vulkan.
export IREE_VULKAN_DISABLE=${IREE_VULKAN_DISABLE:-0}
export IREE_VULKAN_DISABLE="${IREE_VULKAN_DISABLE:-0}"
# Respect the user setting, but default to turning off CUDA.
export IREE_CUDA_DISABLE=${IREE_CUDA_DISABLE:-1}
export IREE_CUDA_DISABLE="${IREE_CUDA_DISABLE:-1}"
# The VK_KHR_shader_float16_int8 extension is optional prior to Vulkan 1.2.
# We test on SwiftShader as a baseline, which does not support this extension.
export IREE_VULKAN_F16_DISABLE=${IREE_VULKAN_F16_DISABLE:-1}
export IREE_VULKAN_F16_DISABLE="${IREE_VULKAN_F16_DISABLE:-1}"
# Respect the user setting, but default to skipping tests that require an Nvidia GPU.
export IREE_NVIDIA_GPU_TESTS_DISABLE=${IREE_NVIDIA_GPU_TESTS_DISABLE:-1}
export IREE_NVIDIA_GPU_TESTS_DISABLE="${IREE_NVIDIA_GPU_TESTS_DISABLE:-1}"
# Respect the user setting, but default to skipping tests that require an SM80 Nvidia GPU.
export IREE_NVIDIA_SM80_TESTS_DISABLE="${IREE_NVIDIA_SM80_TESTS_DISABLE:-1}"
# Respect the user setting, but default to no --repeat-until-fail.
export IREE_CTEST_REPEAT_UNTIL_FAIL_COUNT=${IREE_CTEST_REPEAT_UNTIL_FAIL_COUNT:-}
export IREE_CTEST_REPEAT_UNTIL_FAIL_COUNT="${IREE_CTEST_REPEAT_UNTIL_FAIL_COUNT:-}"
# Respect the user setting, but default to no --tests-regex.
export IREE_CTEST_TESTS_REGEX=${IREE_CTEST_TESTS_REGEX:-}
export IREE_CTEST_TESTS_REGEX="${IREE_CTEST_TESTS_REGEX:-}"
# Respect the user setting, but default to no --label-regex.
export IREE_CTEST_LABEL_REGEX="${IREE_CTEST_LABEL_REGEX:-}"

# Tests to exclude by label. In addition to any custom labels (which are carried
# over from Bazel tags), every test should be labeled with its directory.
Expand All @@ -62,19 +66,24 @@ declare -a label_exclude_args=(
# ^bindings/
)

if [[ "${IREE_VULKAN_DISABLE}" == 1 ]]; then

if (( IREE_VULKAN_DISABLE == 1 )); then
label_exclude_args+=("^driver=vulkan$")
fi
if [[ "${IREE_CUDA_DISABLE}" == 1 ]]; then
if (( IREE_CUDA_DISABLE == 1 )); then
label_exclude_args+=("^driver=cuda$")
fi
if [[ "${IREE_VULKAN_F16_DISABLE}" == 1 ]]; then
if (( IREE_VULKAN_F16_DISABLE == 1 )); then
label_exclude_args+=("^vulkan_uses_vk_khr_shader_float16_int8$")
fi
if [[ "${IREE_NVIDIA_GPU_TESTS_DISABLE}" == 1 ]]; then
label_exclude_args+=("^requires-gpu$")
if (( IREE_NVIDIA_GPU_TESTS_DISABLE == 1 )); then
label_exclude_args+=("^requires-gpu")
fi
if (( IREE_NVIDIA_SM80_TESTS_DISABLE == 1 )); then
label_exclude_args+=("^requires-gpu-sm80$")
fi


IFS=',' read -ra extra_label_exclude_args <<< "${IREE_EXTRA_COMMA_SEPARATED_CTEST_LABELS_TO_EXCLUDE:-}"
label_exclude_args+=(${extra_label_exclude_args[@]})

Expand All @@ -83,7 +92,7 @@ label_exclude_args+=(${extra_label_exclude_args[@]})
# platforms it doesn't support, but that would require editing through layers
# of CMake functions. Hopefully this list stays very short.
declare -a excluded_tests=()
if [[ "$OSTYPE" =~ ^msys ]]; then
if [[ "${OSTYPE}" =~ ^msys ]]; then
# These tests are failing on Windows.
excluded_tests+=(
# TODO(#11077): INVALID_ARGUMENT: argument/result signature mismatch
Expand All @@ -99,7 +108,7 @@ if [[ "$OSTYPE" =~ ^msys ]]; then
# TODO(#11070): Fix argument/result signature mismatch
"iree/tests/e2e/tosa_ops/check_vmvx_local-sync_microkernels_fully_connected.mlir"
)
elif [[ "$OSTYPE" =~ ^darwin ]]; then
elif [[ "${OSTYPE}" =~ ^darwin ]]; then
excluded_tests+=(
# TODO(#12496): Remove after fixing the test on macOS
"iree/compiler/bindings/c/loader_test"
Expand Down Expand Up @@ -129,6 +138,10 @@ if [[ -n "${IREE_CTEST_TESTS_REGEX}" ]]; then
ctest_args+=("--tests-regex ${IREE_CTEST_TESTS_REGEX}")
fi

if [[ -n "${IREE_CTEST_LABEL_REGEX}" ]]; then
ctest_args+=("--label-regex ${IREE_CTEST_LABEL_REGEX}")
fi

if [[ -n "${IREE_CTEST_REPEAT_UNTIL_FAIL_COUNT}" ]]; then
ctest_args+=("--repeat-until-fail ${IREE_CTEST_REPEAT_UNTIL_FAIL_COUNT}")
fi
Expand Down
6 changes: 3 additions & 3 deletions build_tools/cmake/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@ declare -a label_exclude_args=(
# ^bindings/
)

if [[ "${IREE_VULKAN_DISABLE?}" == 1 ]]; then
if (( IREE_VULKAN_DISABLE == 1 )); then
label_exclude_args+=("^driver=vulkan$")
fi
if [[ "${IREE_CUDA_DISABLE?}" == 1 ]]; then
if (( IREE_CUDA_DISABLE == 1 )); then
label_exclude_args+=("^driver=cuda$")
fi
if [[ "${IREE_VULKAN_F16_DISABLE?}" == 1 ]]; then
if (( IREE_VULKAN_F16_DISABLE == 1 )); then
label_exclude_args+=("^vulkan_uses_vk_khr_shader_float16_int8$")
fi

Expand Down

0 comments on commit 465492e

Please sign in to comment.