Skip to content

Commit

Permalink
[gpu] Drop subgroup size query callback (iree-org#18020)
Browse files Browse the repository at this point in the history
Now that we have a unified `#iree_gpu.target` attached to the dispatch, we
can just query the subgroup size from it instead of using C++ pipeline callbacks.
  • Loading branch information
antiagainst authored Jul 30, 2024
1 parent 45323df commit 20e2719
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 43 deletions.
10 changes: 2 additions & 8 deletions compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,9 @@ createGPUTensorAlloc(GPUPromoteSharedMemPattern promoteSharedMemPattern =
GPUPromoteSharedMemPattern::ContractionOpPattern);

// Distributes vector ops to all threads/warps in a GPU workgroup.
// `getWarpSize` is for deciding the warp size to use; it takes the
// current function containing those vector ops as the argument.
// If nullptr, warp size 32 will be used.
// TODO: This kind of call back function is a really really bad idea
// This should be easier to resolve than doing this.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createConvertVectorReductionToGPUPass(
bool expandSubgroupReduction = true,
std::function<int(mlir::FunctionOpInterface)> getWarpSize = nullptr);
createConvertVectorReductionToGPUPass(bool expandSubgroupReduction = true,
bool pickLargestSubroupSize = false);

enum class ReorderWorkgroupsStrategy { None, Swizzle, Transpose };

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,11 +193,10 @@ static Value simpleWarpShuffleFunction(Location loc, OpBuilder &builder,

struct VectorReductionToGPUPass final
: impl::VectorReductionToGPUPassBase<VectorReductionToGPUPass> {
VectorReductionToGPUPass(
bool expandSubgroupReduction,
std::function<int(mlir::FunctionOpInterface)> getWarpSize)
VectorReductionToGPUPass(bool expandSubgroupReduction,
bool pickLargestSubroupSize)
: expandSubgroupReduction(expandSubgroupReduction),
getWarpSize(getWarpSize) {}
pickLargestSubroupSize(pickLargestSubroupSize) {}

void runOnOperation() override {
FunctionOpInterface funcOp = getOperation();
Expand Down Expand Up @@ -259,12 +258,17 @@ struct VectorReductionToGPUPass final
// 4. Distribute transfer write operations and propagate vector
// distribution.
{
int warpSize = this->getWarpSize ? this->getWarpSize(funcOp) : 32;
std::optional<int> subgroupSize =
getGPUSubgroupSize(funcOp, pickLargestSubroupSize);
if (!subgroupSize) {
funcOp->emitOpError("missing subgroup size");
return signalPassFailure();
}
auto groupReductionFn = [=](Location loc, OpBuilder &builder, Value input,
vector::CombiningKind kind,
uint32_t size) -> Value {
return emitGPUGroupReduction(loc, builder, input, kind, size, warpSize,
expandSubgroupReduction);
return emitGPUGroupReduction(loc, builder, input, kind, size,
*subgroupSize, expandSubgroupReduction);
};
auto distributionFn = [](Value val) {
auto vecType = llvm::dyn_cast<VectorType>(val.getType());
Expand Down Expand Up @@ -312,17 +316,16 @@ struct VectorReductionToGPUPass final

private:
bool expandSubgroupReduction;
std::function<int(mlir::FunctionOpInterface)> getWarpSize;
bool pickLargestSubroupSize;
};

} // namespace

std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createConvertVectorReductionToGPUPass(
bool expandSubgroupReduction,
std::function<int(mlir::FunctionOpInterface)> getWarpSize) {
createConvertVectorReductionToGPUPass(bool expandSubgroupReduction,
bool pickLargestSubroupSize) {
return std::make_unique<VectorReductionToGPUPass>(expandSubgroupReduction,
getWarpSize);
pickLargestSubroupSize);
}

} // namespace mlir::iree_compiler
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline='builtin.module(func.func(iree-codegen-vector-reduction-to-gpu, cse))' %s | FileCheck %s --check-prefix=CDNA3

#map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
module {
func.func @simple_reduce() attributes {translation_info = #translation_info} {
%c0 = arith.constant 0 : index
Expand Down Expand Up @@ -69,7 +69,7 @@ module {

// Make sure memref.load from uniform buffers are hoisted out as uniform code.

#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
#map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>
module {
func.func @reduce_uniform_buffer_offset() attributes {translation_info = #translation_info} {
Expand Down Expand Up @@ -125,7 +125,7 @@ module {


#map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
module {
func.func @reduce_storage_buffer_offset() attributes {translation_info = #translation_info} {
%c0 = arith.constant 0 : index
Expand Down Expand Up @@ -176,7 +176,7 @@ module {

// -----

#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
module {
func.func @shared_memory_copy() attributes {translation_info = #translation_info} {
%c0 = arith.constant 0 : index
Expand Down Expand Up @@ -209,7 +209,7 @@ module {

// Check that we multi-row matvec gets distributed across subgroup threads.

#translation_info = #iree_codegen.translation_info<None workgroup_size = [64, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [64, 1, 1] subgroup_size = 64>
#map = affine_map<()[s0] -> (s0 * 4)>
#map1 = affine_map<(d0, d1) -> (0, d1)>
module {
Expand Down Expand Up @@ -258,7 +258,7 @@ module {
// CDNA3-NEXT: return

// -----
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
module {
func.func @simple_nd_write() attributes {translation_info = #translation_info} {
%c0 = arith.constant 0 : index
Expand Down
11 changes: 1 addition & 10 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -845,18 +845,9 @@ void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createForOpCanonicalizationPass());
funcPassManager.addPass(createCanonicalizerPass());

auto getSubgroupSizeFn = [](mlir::FunctionOpInterface func) -> int {
// TODO: This kind of call back function is a really really bad idea
// This should be easier to resolve than doing this.
if (std::optional<int64_t> maybeSubgroupSize = getSubgroupSize(func)) {
return maybeSubgroupSize.value();
}
return kDefaultSubgroupSize;
};

// vector -> simt gpu + vector
funcPassManager.addPass(createConvertVectorReductionToGPUPass(
/*expandSubgroupReduction=*/true, getSubgroupSizeFn));
/*expandSubgroupReduction=*/true, /*pickLargestSubgroupSize=*/false));
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
}
Expand Down
8 changes: 1 addition & 7 deletions compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,15 +598,9 @@ void addSPIRVSubgroupReducePassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createForOpCanonicalizationPass());
funcPassManager.addPass(createCanonicalizerPass());

auto getWarpSize = [](mlir::FunctionOpInterface func) -> int {
// TODO: This kind of call back function is a really really bad idea
// This should be easier to resolve than doing this.
return *getGPUSubgroupSize(func, /*pickLargest=*/true);
};

// Handle vector reduction operations specifically.
funcPassManager.addPass(createConvertVectorReductionToGPUPass(
/*expandSubgroupReduction=*/false, getWarpSize));
/*expandSubgroupReduction=*/false, /*pickLargestSubgroupSize=*/true));
// Perform normal vector unrolling and lowering transformations. This breaks
// vectors down to native machine size.
addSPIRVVectorLoweringPasses(funcPassManager);
Expand Down

0 comments on commit 20e2719

Please sign in to comment.