Fix occupancy calculation for grouped GEMM (NVIDIA#532)

codecat-he · Jun 18, 2022 · fa56763 · fa56763
1 parent 25e26a6
commit fa56763
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 64 deletions.
diff --git a/examples/24_gemm_grouped/gemm_grouped.cu b/examples/24_gemm_grouped/gemm_grouped.cu
@@ -756,12 +756,6 @@ public:
   /// Returns the number of threadblocks to launch if the kernel can run on the target
   /// device. Otherwise, returns zero.
   int sufficient() const {
-    //
-    // Determine SMEM requirements and waive if not satisfied
-    //
-
-    int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage));
-
     cudaDeviceProp properties;
     int device_idx;
     cudaError_t result = cudaGetDevice(&device_idx);
@@ -776,9 +770,10 @@ public:
       throw std::runtime_error("cudaGetDeviceProperties() failed");
     }
 
-    int occupancy = std::min(2, int(properties.sharedMemPerMultiprocessor / smem_size));
+    int occupancy = Gemm::maximum_active_blocks();
 
     return properties.multiProcessorCount * occupancy;
+
   }
 
 

diff --git a/include/cutlass/gemm/device/gemm_grouped.h b/include/cutlass/gemm/device/gemm_grouped.h
@@ -139,70 +139,40 @@ class GemmGrouped {
 
     CUTLASS_TRACE_HOST("GemmUniversalBase::maximum_active_blocks()");
 
-    int max_active_blocks = -1;
     int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
 
     CUTLASS_TRACE_HOST("  smem_size: " << smem_size << " bytes");
 
-    if (smem_size <= (48 << 10)) {
-
-      cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_active_blocks,
-        Kernel<GemmKernel>,
-        GemmKernel::kThreadCount,
-        smem_size);
-
-      if (result == cudaSuccess) {
-        CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
-        return max_active_blocks;
-      }
-    }
-    else {
-
-      // Query assuming zero shared memory then compute occupancy limit based on SMEM
-      cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_active_blocks,
-        Kernel<GemmKernel>,
-        GemmKernel::kThreadCount,
-        0);
+    cudaError_t result;
+    if (smem_size > (48 << 10)) {
+      result = cudaFuncSetAttribute(Kernel<GemmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
 
       if (result != cudaSuccess) {
-
         CUTLASS_TRACE_HOST(
-          "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error "
+          "  cudaFuncSetAttribute() returned error "
           << cudaGetErrorString(result));
-
         return -1;
       }
+    }
 
-      if (smem_capacity < 0) {
-        int device_idx = 0;
-        result = cudaGetDevice(&device_idx);
-
-        if (result != cudaSuccess) {
-          return -1;
-        }
-
-        cudaDeviceProp properties;
-        result = cudaGetDeviceProperties(&properties, device_idx);
-
-        if (result != cudaSuccess) {
-          return -1;
-        }
-
-        smem_capacity = static_cast<int>(properties.sharedMemPerMultiprocessor);
-      }
-
-      int occupancy = std::min(max_active_blocks, smem_capacity / smem_size);
-
-      CUTLASS_TRACE_HOST("  occupancy: " << occupancy);
+    int max_active_blocks = -1;
+    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        Kernel<GemmKernel>,
+        GemmKernel::kThreadCount,
+        smem_size);
 
-      return occupancy;
+    if (result != cudaSuccess) {
+      CUTLASS_TRACE_HOST(
+        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error "
+        << cudaGetErrorString(result));
+      return -1;
     }
 
-    CUTLASS_TRACE_HOST("  returning internal error");
-
-    return -1;
+    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
+    return max_active_blocks;
   }
 
   /// Initializes GEMM state from arguments.

diff --git a/test/unit/gemm/device/testbed_grouped.h b/test/unit/gemm/device/testbed_grouped.h
@@ -419,12 +419,6 @@ struct TestbedGrouped {
   /// Returns the number of threadblocks to launch if the kernel can run on the target
   /// device. Otherwise, returns zero.
   int sufficient() const {
-    //
-    // Determine SMEM requirements and waive if not satisfied
-    //
-
-    int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage));
-
     cudaDeviceProp properties;
     int device_idx;
     cudaError_t result = cudaGetDevice(&device_idx);
@@ -439,7 +433,7 @@ struct TestbedGrouped {
       throw std::runtime_error("cudaGetDeviceProperties() failed");
     }
 
-    int occupancy = std::min(2, int(properties.sharedMemPerMultiprocessor / smem_size));
+    int occupancy = Gemm::maximum_active_blocks();
 
     return properties.multiProcessorCount * occupancy;
   }