[llvmgpu] check if operands can be distributable for shmem promotion (iree-org#13201)

When checking whether a value can be promoted to shared memory, we need to check that the innermost dimension of the value's shape is a multiple of the desired vector size, since that is what GPU shared memory tiling and distribution expects.
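As a quick illustration of that condition (a minimal sketch under the 128-bit copy assumption used in the patch below; the helper name is made up for illustration and is not IREE API):

#include <cstdint>

// Sketch: an operand is "tile-distributable" when its innermost dimension is a
// whole number of 128-bit copy vectors.
static bool innerDimIsDistributable(int64_t innerDimSize, unsigned elemBitWidth) {
  constexpr int copyVectorNumBits = 128;  // always copy 128 bits for best performance
  const int targetVectorSize = copyVectorNumBits / elemBitWidth;
  return innerDimSize % targetVectorSize == 0;
}

// For f32 (32-bit) elements the target vector size is 128 / 32 = 4, so an
// innermost dimension of 128 passes the check while 33 (33 % 4 == 1) does not.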
okkwon authored Apr 20, 2023
1 parent ab37989 commit 4dd4eac
Showing 2 changed files with 72 additions and 2 deletions.
27 changes: 25 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp
@@ -19,16 +19,40 @@
namespace mlir {
namespace iree_compiler {

// For optimal performance we always want to copy 128 bits
static constexpr int copyVectorNumBits = 128;

/// Filter to decide which contract ops need allocations.
static bool contractOpFilter(Operation *op) {
  auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
  if (!linalgOp) return false;

  if (!linalg::isaContractionOpInterface(linalgOp)) {
    return false;
  }

  // The workgroup specialization already makes static shapes available for the
  // main tile part and makes the partial tile computation small, so promoting
  // to shared memory for the partial tile actually hurts the performance.
  if (linalgOp.hasDynamicShape()) return false;

  // Check if the shape is tile-distributable. The leading dimension must be a
  // multiple of the target vector size, which is 128b / the element bit width.
  auto isTileDistributable = [&](OpOperand *v) {
    ShapedType ty = v->get().getType().cast<ShapedType>();
    unsigned bitWidth = ty.getElementTypeBitWidth();
    int targetVectorSize = copyVectorNumBits / bitWidth;
    return ty.getShape().back() % targetVectorSize == 0;
  };

  if (!llvm::all_of(linalgOp.getDpsInputOperands(), isTileDistributable)) {
    return false;
  }

  if (!llvm::all_of(linalgOp.getDpsInitOperands(), isTileDistributable)) {
    return false;
  }

  SmallVector<unsigned> dims;
  linalgOp.getParallelDims(dims);
  SmallVector<int64_t, 4> shapes = linalgOp.getStaticLoopRanges();
@@ -39,8 +63,7 @@ static bool contractOpFilter(Operation *op) {
      numNonUnitParallelLoop++;
    }
  }
-  return numNonUnitParallelLoop > 1 && linalg::isaContractionOpInterface(op) &&
-         linalgOp.getNumParallelLoops() >= 2 &&
+  return numNonUnitParallelLoop > 1 && linalgOp.getNumParallelLoops() >= 2 &&
         linalgOp.getNumParallelLoops() <= 3;
}

47 changes: 47 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensor_alloc.mlir
@@ -82,3 +82,50 @@ func.func @matmul_multi_uses() {
// CHECK: %[[PB:.*]] = bufferization.alloc_tensor() copy(%[[B]]) {bufferization.escape = [false]} : tensor<1024x128xf32>
// CHECK: %[[M:.*]] = linalg.matmul {{.*}} ins(%[[PA]], %[[PB]] : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%{{.*}} : tensor<32x128xf32>) -> tensor<32x128xf32>
// CHECK: "some_use"(%[[A]]) : (tensor<32x1024xf32>) -> ()

// -----

func.func @matmul_33x33x903168_f32() {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant 9.031680e+05 : f32
  %cst_1 = arith.constant 0.949999988 : f32
  %c32 = arith.constant 32 : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %0 = affine.min affine_map<()[s0] -> (s0 * -32 + 33, 32)>()[%workgroup_id_x]
  %1 = arith.cmpi eq, %0, %c32 : index
  scf.if %1 {
    %2 = hal.interface.constant.load[0] : i32
    %3 = hal.interface.constant.load[1] : i32
    %4 = hal.interface.constant.load[2] : i32
    %5 = arith.index_castui %2 {stream.alignment = 4096 : index, stream.values = [1240289280 : index, 1789415424 : index]} : i32 to index
    %6 = arith.index_castui %3 {stream.alignment = 8192 : index, stream.values = [633077760 : index, 752295936 : index]} : i32 to index
    %7 = arith.index_castui %4 {stream.alignment = 64 : index, stream.values = [1486349952 : index, 1486358464 : index]} : i32 to index
    %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<33x903168xf32>>
    %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<903168x33xf32>>
    %10 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<33x33xf32>>
    %11 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<33x33xf32>>
    %12 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
    %13 = flow.dispatch.tensor.load %11, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<33x33xf32>> -> tensor<32x33xf32>
    %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [903168, 33], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<903168x33xf32>> -> tensor<903168x33xf32>
    %15 = flow.dispatch.tensor.load %10, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<33x33xf32>> -> tensor<32x33xf32>
    %16 = flow.dispatch.tensor.load %8, offsets = [%12, 0], sizes = [32, 903168], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<33x903168xf32>> -> tensor<32x903168xf32>
    %17 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%13 : tensor<32x33xf32>) -> tensor<32x33xf32>
    %18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%16, %14 : tensor<32x903168xf32>, tensor<903168x33xf32>) outs(%17 : tensor<32x33xf32>) -> tensor<32x33xf32>
    %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<32x33xf32>) outs(%18 : tensor<32x33xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} {
    ^bb0(%in: f32, %out: f32):
      %20 = arith.divf %out, %cst_0 : f32
      %21 = arith.mulf %in, %cst_1 : f32
      %22 = arith.addf %21, %20 : f32
      linalg.yield %22 : f32
    } -> tensor<32x33xf32>
    flow.dispatch.tensor.store %19, %11, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : tensor<32x33xf32> -> !flow.dispatch.tensor<writeonly:tensor<33x33xf32>>
  }
  return
}

// The allocation should not happen when there is any unaligned size, e.g., 33 in this case.
//
// CHECK-LABEL: func.func @matmul_33x33x903168_f32
// CHECK-NOT: bufferization.alloc_tensor()
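For reference, the arithmetic behind this expectation, restated from the filter in the C++ change above (not something checked in the test file itself):

// targetVectorSize = copyVectorNumBits / bitwidth(f32) = 128 / 32 = 4
// innermost operand dims: 903168 % 4 == 0 passes, but 33 % 4 == 1 fails,
// so the 903168x33 input and the 32x33 init are not tile-distributable,
// the matmul is rejected by contractOpFilter, and no
// bufferization.alloc_tensor is created.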
