Enabled high-performance Automatic Tensor Parallelism (auto TP) for the MoE models on multiple GPUs/HPUs #6964

Open · gyou2021 wants to merge 64 commits into master from autoTP_Qwen2Moe_DeepSeekv2
Changes from 1 commit · Commits (64):
c9b12af
Reduced the experts allreduce number per layer to ONCE for the Qwen2-…
gyou2021 Jan 21, 2025
590ea36
Fixed format
gyou2021 Jan 21, 2025
889c275
Removed print
gyou2021 Jan 21, 2025
2ec6c34
Fix a bug about set.
gyou2021 Jan 21, 2025
504d696
Add the missing view operations from sequence parallel(async). (#6750)
inkcherry Jan 21, 2025
c266dc9
Update `torch.norm` to `torch.linalg.norm` and `torch.linalg.vector_n…
loadams Jan 21, 2025
ae12993
Using explicit GPU upcast for ZeRO-Offload (#6962)
xylian86 Jan 21, 2025
deb09a3
Update version.txt after 0.16.3 release (#6965)
loadams Jan 21, 2025
128d436
Precisely track nvme optimizer offload (#6963)
tjruwase Jan 23, 2025
864472b
Update build_win.bat script to exclude GDS op as it lacks Windows supp…
loadams Jan 24, 2025
1ac398c
Add CUDA 12.8 support and comment on CUDA 12.7 (#6975)
loadams Jan 28, 2025
eda53d8
Update torch versions to support 2.6 (#6977)
loadams Jan 29, 2025
112a7c6
generalize deepspeed linear and implement it for non cuda systems (#6…
oelayan7 Jan 29, 2025
7d2c5fe
Update recommended Windows whl building versions (#6983)
loadams Jan 30, 2025
f1d326c
Title: Fix setup_env_ranks to Properly Set Environment Variables Inst…
fabiosanger Jan 30, 2025
46545d7
Specify torchvision in nv-ds-chat workflow (prevents errors with torc…
loadams Jan 30, 2025
af1ba94
Remove assumption that padding only occurs on last rank (#6974)
xylian86 Jan 31, 2025
e235921
Use ds-specific module id to avoid conflicts (#6847)
tjruwase Jan 31, 2025
f5e9796
Update A6000 workflows to use newer docker container - 24.09 vs 24.03…
loadams Jan 31, 2025
07634b9
Allow NVIDIA Blackwell (#6991)
fabiendupont Feb 4, 2025
0e57fa0
Update GH org references (#6998)
tjruwase Feb 5, 2025
e86c0c3
Update CNAME
loadams Feb 5, 2025
0d7f0eb
Update CNAME
loadams Feb 5, 2025
cd8a988
[XPU] max1100 workflow update for docker and softwares (#7003)
Liangliang-Ma Feb 5, 2025
18c712f
autotp training (fix dco) (#7004)
inkcherry Feb 5, 2025
c5bf6f6
import triton files when triton is supported and installed (#6989)
oelayan7 Feb 6, 2025
590de5f
Update A6000 tests transformers version (#7016)
loadams Feb 8, 2025
693c39f
Fix ds-chat CI regression (#7015)
tjruwase Feb 10, 2025
322a05a
[Ulysses tutorial] typos (#7024)
stas00 Feb 11, 2025
8869d78
fix hostname -I for macOS #6497 (#6990)
fitzjalen Feb 12, 2025
e4d03af
Update workflows to cuda 12.4 (#7000)
loadams Feb 12, 2025
8c6251d
[ROCm] Enable fp_quantizer on ROCm (#7027)
rraminen Feb 13, 2025
e3e179c
add gds chinese blog (#7034)
GuanhuaWang Feb 13, 2025
fd2787b
Add chinese blog for deepspeed windows, and fix format (#7035)
hwchen2017 Feb 14, 2025
ba8ef57
AIO on ROCM (#7023)
jomayeri Feb 14, 2025
f4b0f58
Control trace cache warnings (#7039)
tjruwase Feb 18, 2025
3ca3e2f
Update CUDA compute capability to support Blackwell (#7047)
hwchen2017 Feb 18, 2025
5612778
Update setup.py handling of ROCm cupy (#7051)
loadams Feb 19, 2025
af8c190
nv-ds-chat breaks with latest transformers (#7052)
loadams Feb 19, 2025
225471a
Rename aio_thread_count to intra_op_parallelism (#7056)
tjruwase Feb 19, 2025
1df293a
add autoTP training zero2 tests (#7049)
inkcherry Feb 19, 2025
94abf68
Fix, bf16 optimizer remove dup loop (#7054)
wukong1992 Feb 20, 2025
4a4ff9b
Update version.txt after 0.16.4 release (#7063)
loadams Feb 20, 2025
e5eda47
fix an outdated doc wrt CUDA_VISIBLE_DEVICES (#7058)
stas00 Feb 20, 2025
675ec9a
Tecorigin sdaa accelerator (#6903)
siqi654321 Feb 20, 2025
81c1fee
Handle special case of libuv for Windows (#7064)
loadams Feb 20, 2025
17f544c
Update README with info on newest accelerator (#7065)
loadams Feb 21, 2025
20fd872
Bug Fix for offload_states API (#7050)
U-rara Feb 21, 2025
0b289a2
Fix TOCTOU issues, switch to fstat (#7067)
loadams Feb 24, 2025
4a86d02
config torch to avoid graph breaks caused by logger (#6999)
ShellyNR Feb 24, 2025
594b5bb
Fix meta load tensor incompatible issue (#7073)
Yejing-Lai Feb 24, 2025
a843e39
Replace calls to `python setup.py sdist` with `python -m build --sdis…
loadams Feb 24, 2025
4cbc52c
Revert "Handle special case of libuv for Windows (#7064)" (#7076)
loadams Feb 25, 2025
586e436
Add DeepseekV3 AutoTP. (#7045)
Yejing-Lai Feb 26, 2025
5e379ad
Improve inference tutorial docs (#7083)
loadams Feb 26, 2025
13bf866
Added support for the environment variable DS_MOE_EXPERTS_REDUCE_ONCE…
gyou2021 Feb 27, 2025
d5115be
Changed env variable name to 'DS_MOE_TP_SINGLE_ALLREDUCE'
gyou2021 Feb 28, 2025
f0044cb
Pin transformers version on tests that use latest. (#7085)
loadams Feb 27, 2025
16ad5fd
Update README.md with ICS '23 MoE paper link (#7087)
siddharth9820 Feb 27, 2025
47d4420
Update parallelism for nv-torch-latest/nightly tests due to more GPUs…
loadams Feb 27, 2025
b3c64dd
Remove workflows for very old torch versions (#7090)
loadams Feb 28, 2025
9b1fe98
Fixed conflicts
gyou2021 Feb 28, 2025
6b96dd9
Update auto_tp.py
gyou2021 Mar 5, 2025
e7883e7
Merge branch 'master' into autoTP_Qwen2Moe_DeepSeekv2
hwchen2017 Mar 5, 2025
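
For context on what this branch adds: commits 13bf866 and d5115be gate the reduced-allreduce MoE path behind the DS_MOE_TP_SINGLE_ALLREDUCE environment variable. The sketch below shows how that toggle would plausibly combine with DeepSpeed's existing auto-TP entry point; the model id, the tp_size source, and the assumption that the flag is read at engine init are illustrative, not taken from this PR.

import os
# Opt in to one allreduce per MoE layer; env var name from commit d5115be.
os.environ["DS_MOE_TP_SINGLE_ALLREDUCE"] = "1"

import torch
import deepspeed
from transformers import AutoModelForCausalLM

# Illustrative MoE checkpoint; any MoE model supported by auto TP would do.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-MoE-A2.7B", torch_dtype=torch.bfloat16
)

# Standard automatic tensor parallelism: with kernel injection disabled,
# DeepSpeed shards supported layers (including expert weights) across ranks.
engine = deepspeed.init_inference(
    model,
    tensor_parallel={"tp_size": int(os.getenv("WORLD_SIZE", "1"))},
    dtype=torch.bfloat16,
    replace_with_kernel_inject=False,
)

Launched per rank in the usual way, e.g. deepspeed --num_gpus 2 script.py, the launcher sets WORLD_SIZE for each process.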
Update A6000 workflows to use newer docker container - 24.09 vs 24.03 (#6967)

- Issues with nv-sd updates; will follow up with a subsequent PR

Signed-off-by: gyou2021 <[email protected]>
loadams authored and gyou2021 committed Feb 28, 2025
commit f5e97963fd52e717ee7aa3c149c5ac123846c6f3
6 changes: 3 additions & 3 deletions .github/workflows/nv-a6000.yml
@@ -23,7 +23,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.03-py3
+      image: nvcr.io/nvidia/pytorch:24.09-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"
@@ -57,8 +57,8 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.3" --cuda_ver="12"
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.3" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12"
       - name: MII unit tests
         run: |
           BRANCH="main"
4 changes: 2 additions & 2 deletions .github/workflows/nv-flash-attn.yml
@@ -18,7 +18,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.03-py3
+      image: nvcr.io/nvidia/pytorch:24.09-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"
@@ -53,7 +53,7 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.3" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12"
       - name: Open GitHub issue if nightly CI fails
         if: ${{ failure() && (github.event_name == 'schedule') }}
         uses: JasonEtco/create-an-issue@v2
4 changes: 2 additions & 2 deletions .github/workflows/nv-human-eval.yml
@@ -11,7 +11,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.03-py3
+      image: nvcr.io/nvidia/pytorch:24.09-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"
@@ -50,4 +50,4 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.3" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12"
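
The --torch_ver/--cuda_ver flags passed to pytest must match what the 24.09 container actually ships (the diff pairs the image bump with torch 2.3 → 2.5). A quick sanity check, using only standard PyTorch attributes, nothing PR-specific:

# Verify the container's torch/CUDA versions match the pytest flags above.
import torch

print(torch.__version__)         # expect 2.5.x inside nvcr.io/nvidia/pytorch:24.09-py3
print(torch.version.cuda)        # expect a 12.x CUDA runtime
print(torch.cuda.is_available()) # expect True on the self-hosted A6000 runner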