Skip to content

Commit

Permalink
Add support for CUDA 12.4 (pytorch#2565)
Browse files — browse the repository at this point in the history
Summary:
- Add support for CUDA 12.4

Pull Request resolved: pytorch#2565

Reviewed By: spcyppt

Differential Revision: D57027676

Pulled By: q10

fbshipit-source-id: e8b32e101c385fe6317ddc1ebc019cba7ae2bf20
  • Loading branch information
q10 authored and facebook-github-bot committed May 7, 2024
1 parent b83460f commit f2b1b50
Show file tree
Hide file tree
Showing 10 changed files with 23 additions and 14 deletions.
4 changes: 2 additions & 2 deletions .github/scripts/fbgemm_gpu_install.bash
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ install_fbgemm_gpu_pip () {
echo "Usage: ${FUNCNAME[0]} ENV_NAME FBGEMM_GPU_CHANNEL[/VERSION] FBGEMM_GPU_VARIANT_TYPE[/VARIANT_VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env 0.5.0 cpu # Install the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env release cuda 12.1.1 # Install the CUDA variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env test/0.6.0rc0 cuda 12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env release cuda 12.4.1 # Install the CUDA variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env test/0.6.0rc0 cuda 12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env nightly rocm 5.3 # Install the ROCM 5.3 variant, latest version from nightly channel"
return 1
else
Expand Down
1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,7 @@ test_fbgemm_gpu_setup_and_pip_install () {
local variant_versions=(
11.8.0
12.1.1
12.4.1
)
elif [ "$variant_type" == "rocm" ]; then
local variant_versions=(
Expand Down
3 changes: 2 additions & 1 deletion .github/scripts/utils_cuda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ install_cudnn () {
["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-${PLATFORM_NAME_LC}-8.3.2.44_cuda11.5-archive.tar.xz"
["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-${PLATFORM_NAME_LC}-8.5.0.96_cuda11-archive.tar.xz"
["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-${PLATFORM_NAME_LC}-8.7.0.84_cuda11-archive.tar.xz"
["121"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.8.1.3_cuda12-archive.tar.xz"
["121"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz"
["124"]="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz"
)

# Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ jobs:
{ arch: x86, instance: "linux.24xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
compiler: [ "gcc", "clang" ]

steps:
Expand Down Expand Up @@ -145,7 +145,7 @@ jobs:
# { arch: x86, instance: "linux.gcp.a100" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "12.1.1" ]
compiler: [ "gcc", "clang" ]
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ jobs:
{ instance: "linux.g5.4xlarge.nvidia.gpu" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]

steps:
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ on:
description: CUDA Version to Use for Building Artifact
type: choice
required: false
options: [ "11.8.0", "12.1.1" ]
options: [ "11.8.0", "12.1.1", "12.4.1" ]
default: "12.1.1"
publish_to_pypi:
description: Publish Artifact to PyPI
Expand Down Expand Up @@ -69,7 +69,7 @@ jobs:
{ arch: x86, instance: "linux.24xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]

steps:
- name: Setup Build Container
Expand Down Expand Up @@ -139,7 +139,7 @@ jobs:
{ arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
needs: build_artifact

steps:
Expand Down
2 changes: 1 addition & 1 deletion fbgemm_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ if(NOT FBGEMM_CPU_ONLY)
if(NOT USE_ROCM)
# CUTLASS currently doesn't build on ROCm:
#
# /__w/FBGEMM/FBGEMM/fbgemm_gpu/../third_party/cutlass/include/cutlass/half.h:73:10: fatal error: 'cuda_fp16.h' file not found
# 2024-05-06T23:09:35.5730483Z /__w/FBGEMM/FBGEMM/fbgemm_gpu/../third_party/cutlass/include/cutlass/half.h:73:10: fatal error: 'cuda_fp16.h' file not found
# #include <cuda_fp16.h>
#
add_subdirectory(experimental/gen_ai)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
((defined(CUDA_VERSION) && CUDA_VERSION < 11000) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))))
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#elif (defined(USE_ROCM))
#include <hip/hip_bfloat16.h> // @manual
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
#endif

#ifndef USE_ROCM
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@
defined(USE_ROCM) || \
((defined(CUDA_VERSION) && CUDA_VERSION < 11000) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))))
#include <cublasLt.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda/atomic>
#elif (defined(USE_ROCM))
#include <hip/hip_bfloat16.h>
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
#include <hipblaslt/hipblaslt.h>
#endif
#include <c10/core/ScalarType.h>
#include <c10/cuda/CUDAGuard.h>
#include <cublasLt.h>
#include <cutlass/core_io.h>
#include <cutlass/cutlass.h>
#include <cutlass/gemm/device/gemm.h>
Expand Down
4 changes: 3 additions & 1 deletion fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
((defined(CUDA_VERSION) && CUDA_VERSION < 11000) || \
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))))
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#elif (defined(USE_ROCM))
#include <hip/hip_bfloat16.h>
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
#endif

#ifndef USE_ROCM
Expand Down

0 comments on commit f2b1b50

Please sign in to comment.