Revert "[BE] [cuDNN] Always build assuming cuDNN >= 8.0 (pytorch#95722)"

This reverts commit df4f0b3. Reverted pytorch#95722 on behalf of https://github.com/PaliC due to is breaking a bunch of internal pytorch users ([comment](pytorch#95722 (comment)))
corey-lambda · Nov 10, 2023 · 3c9a59c · 3c9a59c
1 parent 2a271a3
commit 3c9a59c
Show file tree

Hide file tree

Showing 24 changed files with 170 additions and 55 deletions.
diff --git a/BUILD.bazel b/BUILD.bazel
@@ -413,7 +413,6 @@ cc_library(
         "@cuda//:cusolver",
         "@cuda//:nvrtc",
         "@cudnn",
-        "@cudnn_frontend",
     ],
     alwayslink = True,
 )

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -213,6 +213,9 @@ cmake_dependent_option(
 cmake_dependent_option(
     USE_CUSPARSELT "Use cuSPARSELt" ON
     "USE_CUDA" OFF)
+cmake_dependent_option(
+    USE_EXPERIMENTAL_CUDNN_V8_API "Use experimental cuDNN v8 API" ON
+    "USE_CUDNN" OFF)
 option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
 option(USE_KINETO "Use Kineto profiling library" ON)
 option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)

diff --git a/WORKSPACE b/WORKSPACE
@@ -246,12 +246,6 @@ new_local_repository(
     path = "/usr/",
 )
 
-new_local_repository(
-    name = "cudnn_frontend",
-    build_file = "@//third_party:cudnn_frontend.BUILD",
-    path = "third_party/cudnn_frontend/",
-)
-
 local_repository(
     name = "com_github_google_flatbuffers",
     path = "third_party/flatbuffers",

diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h
@@ -305,13 +305,15 @@ struct TORCH_CUDA_CPP_API CTCLossDescriptor
   void set(cudnnDataType_t datatype) {
     AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype));
   }
+#if CUDNN_VERSION >= 7600
   void setEx(
       cudnnDataType_t datatype,
       cudnnLossNormalizationMode_t normMode,
       cudnnNanPropagation_t gradMode) {
     AT_CUDNN_CHECK(
         cudnnSetCTCLossDescriptorEx(mut_desc(), datatype, normMode, gradMode));
   }
+#endif
 };
 
 struct TORCH_CUDA_CPP_API ActivationDescriptor

diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp
@@ -59,7 +59,11 @@ cudnnBatchNormMode_t getCudnnBatchNormMode(bool training, at::MemoryFormat memor
     return CUDNN_BATCHNORM_PER_ACTIVATION;
   } else if (training && memory_format == at::MemoryFormat::ChannelsLast) {
 
+#if CUDNN_VERSION >= 7400
     return CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+#else
+    return CUDNN_BATCHNORM_SPATIAL;
+#endif // CUDNN_VERSION >= 7400
 
   } else if (training && memory_format == at::MemoryFormat::ChannelsLast3d) {
 
@@ -148,6 +152,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
     save_mean = at::empty({ num_features }, weight_t.options());
     save_var = at::empty({ num_features }, weight_t.options());
 
+#if CUDNN_VERSION >= 7400
     auto op = CUDNN_BATCHNORM_OPS_BN;
     size_t workspace_size;
     AT_CUDNN_CHECK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
@@ -199,6 +204,22 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
         workspace_size,
         reserve.mutable_data_ptr(),
         reserve_size));
+#else
+    reserve = at::empty({0}, input->options().dtype(kByte));
+    AT_CUDNN_CHECK(cudnnBatchNormalizationForwardTraining(
+      handle, mode, &one, &zero,
+      idesc.desc(), input->data_ptr(),
+      idesc.desc(), output->data_ptr(),
+      wdesc.desc(),
+      weight->data_ptr(),
+      bias->data_ptr(),
+      exponential_average_factor,
+      at::maybe_data_ptr(running_mean),
+      at::maybe_data_ptr(running_var),
+      epsilon,
+      save_mean.mutable_data_ptr(),
+      save_var.mutable_data_ptr()));
+#endif // CUDNN_VERSION >= 7400
   } else {
     reserve = at::empty({0}, input->options().dtype(kByte));
     // This keeps a consistent output with native_batch_norm
@@ -296,6 +317,7 @@ std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
   Constant one(dataType, 1);
   Constant zero(dataType, 0);
 
+#if CUDNN_VERSION >= 7400
   auto op = CUDNN_BATCHNORM_OPS_BN;
 
   size_t workspace_size;
@@ -332,6 +354,19 @@ std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
     workspace_size,
     reserve->data_ptr(),
     reserve->numel()));
+#else
+  AT_CUDNN_CHECK(cudnnBatchNormalizationBackward(
+    handle, mode, &one, &zero, &one, &zero,
+    idesc.desc(), input->data_ptr(),
+    odesc.desc(), grad_output->data_ptr(),
+    idesc.desc(), grad_input_t.data_ptr(),
+    wdesc.desc(), weight->data_ptr(),
+    grad_weight_t.data_ptr(),
+    grad_bias_t.data_ptr(),
+    epsilon,
+    save_mean->data_ptr(),
+    save_var->data_ptr()));
+#endif // CUDNN_VERSION >= 7400
 
   return std::tuple<Tensor,Tensor,Tensor>{grad_input_t, grad_weight_t, grad_bias_t};
 }

diff --git a/aten/src/ATen/native/cudnn/ConvShared.h b/aten/src/ATen/native/cudnn/ConvShared.h
@@ -109,7 +109,9 @@ void raw_cudnn_convolution_add_relu_fallback_out(
 
 
 #if AT_CUDNN_ENABLED()
+#include <ATen/native/cudnn/Macros.h>
 
+#if HAS_CUDNN_V8()
 // v7 functions are preserved here to allow for runtime switching to v7
 // (e.g., TORCH_CUDNN_V8_API_DISABLED=1).
 // Note that v7 forward/backward out can have different behavior from the v8
@@ -147,4 +149,5 @@ void raw_cudnn_convolution_add_relu_out_v7(
     bool deterministic,
     bool allow_tf32);
 #endif
+#endif
 }}
diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp
@@ -3,6 +3,7 @@
 
 #if AT_CUDNN_ENABLED()
 
+#include <ATen/native/cudnn/Macros.h>
 #include <ATen/core/Tensor.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -59,6 +60,10 @@
 // with the best algo, under the hood, cudnn will run with the slower kernel
 // since it sees fastest algorithm combination with a sub optimal mathType.
 
+// Note [blocklist fft algorithms for strided dgrad]
+// This is a workaround for a CuDNN bug that gave wrong results in certain strided convolution
+// gradient setups. Check Issue #16610 for bug details. Bug is there for CUDNN version < 7.5 .
+
 constexpr size_t operator "" _TiB(unsigned long long n) {
   return size_t(n) * 1024 * 1024 * 1024 * 1024;
 }
@@ -220,6 +225,15 @@ size_t getMaxWorkspaceSize(
 template<typename perf_t>
 std::vector<perf_t> getValidAlgorithms(perf_t *perfResults, const ConvolutionArgs& args, int n_algo) {
 
+// See Note [blocklist fft algorithms for strided dgrad]
+#if CUDNN_VERSION < 7500
+  bool blocklist = std::is_same<decltype(perfResults[0].algo), cudnnConvolutionBwdDataAlgo_t>::value;
+  int stride_dim = args.input.dim() - 2;
+  blocklist &= std::any_of(std::begin(args.params.stride),
+                            std::begin(args.params.stride) + stride_dim,
+                            [=](int n){return n != 1;});
+#endif
+
   std::vector<perf_t> result;
   result.reserve(n_algo);
   for (const auto i : c10::irange(n_algo)) {
@@ -230,6 +244,16 @@ std::vector<perf_t> getValidAlgorithms(perf_t *perfResults, const ConvolutionArg
     if (perf.status == CUDNN_STATUS_SUCCESS) {
       if (!args.params.deterministic || perf.determinism == CUDNN_DETERMINISTIC) {
 
+        // See Note [blocklist fft algorithms for strided dgrad]
+#if CUDNN_VERSION < 7500
+        bool skip = blocklist;
+        skip &= (static_cast<cudnnConvolutionBwdDataAlgo_t>(perfResults[i].algo) == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
+                  static_cast<cudnnConvolutionBwdDataAlgo_t>(perfResults[i].algo) == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT);
+        if (skip) {
+          continue;
+        }
+#endif
+
         result.push_back(perf);
       }
     }
@@ -469,9 +493,11 @@ class AlgoIterator {
       perfResults[0].mathType = CUDNN_TENSOR_OP_MATH;
     } else {
       perfResults[0].mathType = CUDNN_DEFAULT_MATH;
+#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000
       if (args.params.dataType == CUDNN_DATA_FLOAT && !args.params.allow_tf32) {
         perfResults[0].mathType = CUDNN_FMA_MATH;
       }
+#endif
     }
     search::getWorkspaceSize(args, perfResults[0].algo, &(perfResults[0].memory));
     return perfResults;
@@ -584,10 +610,14 @@ static inline void split_batch_dim_to_32bit_out(
 }
 
 
+#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000
 #define ASSERT_CORRECT_PRECISION(math_type)                                     \
 if (args.params.dataType == CUDNN_DATA_FLOAT) {                                 \
   TORCH_INTERNAL_ASSERT(args.params.allow_tf32 || math_type == CUDNN_FMA_MATH); \
 }
+#else
+#define ASSERT_CORRECT_PRECISION(math_type)
+#endif  // CUDNN_VERSION >= 8000
 
 
 // ---------------------------------------------------------------------
@@ -642,7 +672,11 @@ void raw_cudnn_convolution_forward_out_32bit(
 }
 
 
+#if !HAS_CUDNN_V8()
+void raw_cudnn_convolution_forward_out(
+#else
 void raw_cudnn_convolution_forward_out_v7(
+#endif
     const Tensor& output, const Tensor& input, const Tensor& weight,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     bool benchmark, bool deterministic, bool allow_tf32) {
@@ -700,7 +734,11 @@ void raw_cudnn_convolution_backward_input_out_32bit(
   );
 }
 
+#if !HAS_CUDNN_V8()
+void raw_cudnn_convolution_backward_input_out(
+#else
 void raw_cudnn_convolution_backward_input_out_v7(
+#endif
     const at::Tensor& grad_input,
     const at::Tensor& grad_output,
     const at::Tensor& weight,
@@ -759,7 +797,11 @@ void raw_cudnn_convolution_backward_weight_out_32bit(
   );
 }
 
+#if !HAS_CUDNN_V8()
+void raw_cudnn_convolution_backward_weight_out(
+#else
 void raw_cudnn_convolution_backward_weight_out_v7(
+#endif
     const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     bool benchmark, bool deterministic, bool allow_tf32) {
@@ -811,7 +853,12 @@ void raw_cudnn_convolution_backward_weight_out_v7(
   TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN.");
 }
 
+#if !HAS_CUDNN_V8()
+void raw_cudnn_convolution_add_relu_out(
+#else
 void raw_cudnn_convolution_add_relu_out_v7(
+#endif
+
     const Tensor& output,
     const Tensor& input,
     const Tensor& weight,

diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp
@@ -4,6 +4,10 @@
 
 #if AT_CUDNN_ENABLED()
 
+#include <ATen/native/cudnn/Macros.h>
+
+#if HAS_CUDNN_V8()
+
 #include <ATen/cudnn/cudnn-wrapper.h>
 
 #include <c10/macros/Macros.h>
@@ -796,4 +800,5 @@ void raw_cudnn_convolution_add_relu_out(
 
 }} // at::native
 
+#endif  // HAS_CUDNN_V8
 #endif  // AT_CUDNN_ENABLED
diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp
@@ -18,7 +18,7 @@
 #include <ATen/ops/empty_like.h>
 #endif
 
-#if (!AT_CUDNN_ENABLED())
+#if (!AT_CUDNN_ENABLED()) || (CUDNN_VERSION < 7600)
 
 namespace at { namespace native {
 

diff --git a/aten/src/ATen/native/cudnn/Macros.h b/aten/src/ATen/native/cudnn/Macros.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <ATen/cudnn/cudnn-wrapper.h>
+
+// Note: The version below should not actually be 8000. Instead, it should
+// be whatever version of cuDNN that v8 API work with PyTorch correctly.
+// The version is set to 8000 today for convenience of debugging.
+#if defined(USE_EXPERIMENTAL_CUDNN_V8_API) && defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200
+#define HAS_CUDNN_V8() true
+#else
+#define HAS_CUDNN_V8() false
+#endif
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
@@ -28,6 +28,7 @@
 #include <utility>
 #endif
 
+int register_linear_params();
 int register_embedding_params();
 
 #ifdef USE_FBGEMM
@@ -436,9 +437,7 @@ TORCH_API int register_conv_params<2>();
 template
 TORCH_API int register_conv_params<3>();
 
-TORCH_API int register_linear_params();
-
-TORCH_API int register_linear_params() {
+int register_linear_params() {
   using SerializationType = std::tuple<at::Tensor, c10::optional<at::Tensor>>;
   static auto register_linear_params =
       torch::selective_class_<LinearPackedParamsBase>(

diff --git a/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp
@@ -2,6 +2,8 @@
 #include <ATen/cuda/CUDAConfig.h>  // for the definition of AT_CUDNN_ENABLED
 
 #if AT_CUDNN_ENABLED()
+#include <ATen/native/cudnn/Macros.h>
+#if HAS_CUDNN_V8()
 
 #include <ATen/core/TensorBase.h>
 #include <ATen/core/TensorBody.h>
@@ -257,5 +259,6 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
 } // namespace native
 } // namespace at
 
+#endif  // HAS_CUDNN_V8
 #endif  // AT_CUDNN_ENABLED
 #endif  // USE_CUDA
diff --git a/aten/src/ATen/native/quantized/cudnn/Conv.cpp b/aten/src/ATen/native/quantized/cudnn/Conv.cpp
@@ -3,8 +3,11 @@
 
 #if AT_CUDNN_ENABLED()
 
+#include <ATen/native/cudnn/Macros.h>
 #include <c10/util/ArrayRef.h>
 
+#if HAS_CUDNN_V8()
+
 #include <ATen/ATen.h>
 #include <ATen/cuda/Exceptions.h>
 #include <ATen/cudnn/Handle.h>
@@ -22,12 +25,6 @@
 #include <unordered_map>
 #include <vector>
 
-template <int kSpatialDim = 2>
-int register_conv_params();
-
-extern template int register_conv_params<2>();
-extern template int register_conv_params<3>();
-
 // TODO: there is a table from input dtype and weight dtype to operator qdtype,
 // we can derive the operator dtype based on input dtype
 cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, c10::IntArrayRef padding, c10::IntArrayRef stride, c10::IntArrayRef dilation) {
@@ -394,8 +391,6 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
   // this is inconsistent with what has been done for conv2d where new variants use packed weights, and
   // old variant does not. we adopt this inconsistency for now to be consistent with QuantizedCPU's conv1d
   // and will eventually deprecate the old variants
-  register_conv_params<2>();
-  register_conv_params<3>();
   m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d"), QConv1dInt8<false>::run);
   m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_relu"), QConv1dInt8<true>::run);
   m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d.new"), QConvInt8<2, false>::run);
@@ -406,5 +401,6 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
 } // namespace at::native
 
 
+#endif  // HAS_CUDNN_V8
 #endif  // AT_CUDNN_ENABLED
 #endif  // USE_CUDA