Skip to content

Commit

Permalink
[gpu] Drop subgroup size query callback (iree-org#18020)
Browse files Browse the repository at this point in the history
Now that we have a unified `#iree_gpu.target` attached to the dispatch, we
can just query the subgroup size from it instead of using C++ pipeline callbacks.
  • Loading branch information
antiagainst authored Jul 30, 2024
1 parent 45323df commit 20e2719
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 43 deletions.
10 changes: 2 additions & 8 deletions compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,9 @@ createGPUTensorAlloc(GPUPromoteSharedMemPattern promoteSharedMemPattern =
GPUPromoteSharedMemPattern::ContractionOpPattern);

// Distributes vector ops to all threads/warps in a GPU workgroup.
// `getWarpSize` is for deciding the warp size to use; it takes the
// current function containing those vector ops as the argument.
// If nullptr, warp size 32 will be used.
// TODO: This kind of call back function is a really really bad idea
// This should be easier to resolve than doing this.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createConvertVectorReductionToGPUPass(
bool expandSubgroupReduction = true,
std::function<int(mlir::FunctionOpInterface)> getWarpSize = nullptr);
createConvertVectorReductionToGPUPass(bool expandSubgroupReduction = true,
bool pickLargestSubroupSize = false);

enum class ReorderWorkgroupsStrategy { None, Swizzle, Transpose };

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,11 +193,10 @@ static Value simpleWarpShuffleFunction(Location loc, OpBuilder &builder,

struct VectorReductionToGPUPass final
: impl::VectorReductionToGPUPassBase<VectorReductionToGPUPass> {
VectorReductionToGPUPass(
bool expandSubgroupReduction,
std::function<int(mlir::FunctionOpInterface)> getWarpSize)
VectorReductionToGPUPass(bool expandSubgroupReduction,
bool pickLargestSubroupSize)
: expandSubgroupReduction(expandSubgroupReduction),
getWarpSize(getWarpSize) {}
pickLargestSubroupSize(pickLargestSubroupSize) {}

void runOnOperation() override {
FunctionOpInterface funcOp = getOperation();
Expand Down Expand Up @@ -259,12 +258,17 @@ struct VectorReductionToGPUPass final
// 4. Distribute transfer write operations and propagate vector
// distribution.
{
int warpSize = this->getWarpSize ? this->getWarpSize(funcOp) : 32;
std::optional<int> subgroupSize =
getGPUSubgroupSize(funcOp, pickLargestSubroupSize);
if (!subgroupSize) {
funcOp->emitOpError("missing subgroup size");
return signalPassFailure();
}
auto groupReductionFn = [=](Location loc, OpBuilder &builder, Value input,
vector::CombiningKind kind,
uint32_t size) -> Value {
return emitGPUGroupReduction(loc, builder, input, kind, size, warpSize,
expandSubgroupReduction);
return emitGPUGroupReduction(loc, builder, input, kind, size,
*subgroupSize, expandSubgroupReduction);
};
auto distributionFn = [](Value val) {
auto vecType = llvm::dyn_cast<VectorType>(val.getType());
Expand Down Expand Up @@ -312,17 +316,16 @@ struct VectorReductionToGPUPass final

private:
bool expandSubgroupReduction;
std::function<int(mlir::FunctionOpInterface)> getWarpSize;
bool pickLargestSubroupSize;
};

} // namespace

std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createConvertVectorReductionToGPUPass(
bool expandSubgroupReduction,
std::function<int(mlir::FunctionOpInterface)> getWarpSize) {
createConvertVectorReductionToGPUPass(bool expandSubgroupReduction,
bool pickLargestSubroupSize) {
return std::make_unique<VectorReductionToGPUPass>(expandSubgroupReduction,
getWarpSize);
pickLargestSubroupSize);
}

} // namespace mlir::iree_compiler
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline='builtin.module(func.func(iree-codegen-vector-reduction-to-gpu, cse))' %s | FileCheck %s --check-prefix=CDNA3

#map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
module {
func.func @simple_reduce() attributes {translation_info = #translation_info} {
%c0 = arith.constant 0 : index
Expand Down Expand Up @@ -69,7 +69,7 @@ module {

// Make sure memref.load from uniform buffers are hoisted out as uniform code.

#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
#map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>
module {
func.func @reduce_uniform_buffer_offset() attributes {translation_info = #translation_info} {
Expand Down Expand Up @@ -125,7 +125,7 @@ module {


#map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
module {
func.func @reduce_storage_buffer_offset() attributes {translation_info = #translation_info} {
%c0 = arith.constant 0 : index
Expand Down Expand Up @@ -176,7 +176,7 @@ module {

// -----

#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
module {
func.func @shared_memory_copy() attributes {translation_info = #translation_info} {
%c0 = arith.constant 0 : index
Expand Down Expand Up @@ -209,7 +209,7 @@ module {

// Check that we multi-row matvec gets distributed across subgroup threads.

#translation_info = #iree_codegen.translation_info<None workgroup_size = [64, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [64, 1, 1] subgroup_size = 64>
#map = affine_map<()[s0] -> (s0 * 4)>
#map1 = affine_map<(d0, d1) -> (0, d1)>
module {
Expand Down Expand Up @@ -258,7 +258,7 @@ module {
// CDNA3-NEXT: return

// -----
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1] subgroup_size = 32>
module {
func.func @simple_nd_write() attributes {translation_info = #translation_info} {
%c0 = arith.constant 0 : index
Expand Down
11 changes: 1 addition & 10 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -845,18 +845,9 @@ void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createForOpCanonicalizationPass());
funcPassManager.addPass(createCanonicalizerPass());

auto getSubgroupSizeFn = [](mlir::FunctionOpInterface func) -> int {
// TODO: This kind of call back function is a really really bad idea
// This should be easier to resolve than doing this.
if (std::optional<int64_t> maybeSubgroupSize = getSubgroupSize(func)) {
return maybeSubgroupSize.value();
}
return kDefaultSubgroupSize;
};

// vector -> simt gpu + vector
funcPassManager.addPass(createConvertVectorReductionToGPUPass(
/*expandSubgroupReduction=*/true, getSubgroupSizeFn));
/*expandSubgroupReduction=*/true, /*pickLargestSubgroupSize=*/false));
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
}
Expand Down
8 changes: 1 addition & 7 deletions compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,15 +598,9 @@ void addSPIRVSubgroupReducePassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createForOpCanonicalizationPass());
funcPassManager.addPass(createCanonicalizerPass());

auto getWarpSize = [](mlir::FunctionOpInterface func) -> int {
// TODO: This kind of call back function is a really really bad idea
// This should be easier to resolve than doing this.
return *getGPUSubgroupSize(func, /*pickLargest=*/true);
};

// Handle vector reduction operations specifically.
funcPassManager.addPass(createConvertVectorReductionToGPUPass(
/*expandSubgroupReduction=*/false, getWarpSize));
/*expandSubgroupReduction=*/false, /*pickLargestSubgroupSize=*/true));
// Perform normal vector unrolling and lowering transformations. This breaks
// vectors down to native machine size.
addSPIRVVectorLoweringPasses(funcPassManager);
Expand Down

0 comments on commit 20e2719

Please sign in to comment.