[llvmgpu] check if operands can be distributable for shmem promotion (iree-org#13201)

When checking whether a value can be promoted to shared memory, we need to check that the innermost dimension of the value's shape is a multiple of the desired vector size, since that is what GPU shared memory tiling and distribution expects.
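As a quick illustration of that condition (a minimal sketch under the 128-bit copy assumption used in the patch below; the helper name is made up for illustration and is not IREE API):

#include <cstdint>

// Sketch: an operand is "tile-distributable" when its innermost dimension is a
// whole number of 128-bit copy vectors.
static bool innerDimIsDistributable(int64_t innerDimSize, unsigned elemBitWidth) {
  constexpr int copyVectorNumBits = 128;  // always copy 128 bits for best performance
  const int targetVectorSize = copyVectorNumBits / elemBitWidth;
  return innerDimSize % targetVectorSize == 0;
}

// For f32 (32-bit) elements the target vector size is 128 / 32 = 4, so an
// innermost dimension of 128 passes the check while 33 (33 % 4 == 1) does not.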
okkwon authored Apr 20, 2023
1 parent ab37989 commit 4dd4eac
Showing 2 changed files with 72 additions and 2 deletions.
27 changes: 25 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorAlloc.cpp
@@ -19,16 +19,40 @@
namespace mlir {
namespace iree_compiler {

// For optimal performance we always want to copy 128 bits
static constexpr int copyVectorNumBits = 128;

/// Filter to decide which contract ops need allocations.
static bool contractOpFilter(Operation *op) {
  auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
  if (!linalgOp) return false;

  if (!linalg::isaContractionOpInterface(linalgOp)) {
    return false;
  }

  // The workgroup specialization already makes static shapes available for the
  // main tile part and makes the partial tile computation small, so promoting
  // to shared memory for the partial tile actually hurts the performance.
  if (linalgOp.hasDynamicShape()) return false;

  // Check if the shape is tile-distributable. The leading dimension must be a
  // multiple of the target vector size, which is 128b / the element bit width.
  auto isTileDistributable = [&](OpOperand *v) {
    ShapedType ty = v->get().getType().cast<ShapedType>();
    unsigned bitWidth = ty.getElementTypeBitWidth();
    int targetVectorSize = copyVectorNumBits / bitWidth;
    return ty.getShape().back() % targetVectorSize == 0;
  };

  if (!llvm::all_of(linalgOp.getDpsInputOperands(), isTileDistributable)) {
    return false;
  }

  if (!llvm::all_of(linalgOp.getDpsInitOperands(), isTileDistributable)) {
    return false;
  }

  SmallVector<unsigned> dims;
  linalgOp.getParallelDims(dims);
  SmallVector<int64_t, 4> shapes = linalgOp.getStaticLoopRanges();
@@ -39,8 +63,7 @@ static bool contractOpFilter(Operation *op) {
      numNonUnitParallelLoop++;
    }
  }
-  return numNonUnitParallelLoop > 1 && linalg::isaContractionOpInterface(op) &&
-         linalgOp.getNumParallelLoops() >= 2 &&
+  return numNonUnitParallelLoop > 1 && linalgOp.getNumParallelLoops() >= 2 &&
         linalgOp.getNumParallelLoops() <= 3;
}

47 changes: 47 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensor_alloc.mlir
@@ -82,3 +82,50 @@ func.func @matmul_multi_uses() {
// CHECK: %[[PB:.*]] = bufferization.alloc_tensor() copy(%[[B]]) {bufferization.escape = [false]} : tensor<1024x128xf32>
// CHECK: %[[M:.*]] = linalg.matmul {{.*}} ins(%[[PA]], %[[PB]] : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%{{.*}} : tensor<32x128xf32>) -> tensor<32x128xf32>
// CHECK: "some_use"(%[[A]]) : (tensor<32x1024xf32>) -> ()

// -----

func.func @matmul_33x33x903168_f32() {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant 9.031680e+05 : f32
  %cst_1 = arith.constant 0.949999988 : f32
  %c32 = arith.constant 32 : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %0 = affine.min affine_map<()[s0] -> (s0 * -32 + 33, 32)>()[%workgroup_id_x]
  %1 = arith.cmpi eq, %0, %c32 : index
  scf.if %1 {
    %2 = hal.interface.constant.load[0] : i32
    %3 = hal.interface.constant.load[1] : i32
    %4 = hal.interface.constant.load[2] : i32
    %5 = arith.index_castui %2 {stream.alignment = 4096 : index, stream.values = [1240289280 : index, 1789415424 : index]} : i32 to index
    %6 = arith.index_castui %3 {stream.alignment = 8192 : index, stream.values = [633077760 : index, 752295936 : index]} : i32 to index
    %7 = arith.index_castui %4 {stream.alignment = 64 : index, stream.values = [1486349952 : index, 1486358464 : index]} : i32 to index
    %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<33x903168xf32>>
    %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<903168x33xf32>>
    %10 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<33x33xf32>>
    %11 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<33x33xf32>>
    %12 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
    %13 = flow.dispatch.tensor.load %11, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<33x33xf32>> -> tensor<32x33xf32>
    %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [903168, 33], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<903168x33xf32>> -> tensor<903168x33xf32>
    %15 = flow.dispatch.tensor.load %10, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<33x33xf32>> -> tensor<32x33xf32>
    %16 = flow.dispatch.tensor.load %8, offsets = [%12, 0], sizes = [32, 903168], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<33x903168xf32>> -> tensor<32x903168xf32>
    %17 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%13 : tensor<32x33xf32>) -> tensor<32x33xf32>
    %18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%16, %14 : tensor<32x903168xf32>, tensor<903168x33xf32>) outs(%17 : tensor<32x33xf32>) -> tensor<32x33xf32>
    %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<32x33xf32>) outs(%18 : tensor<32x33xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} {
    ^bb0(%in: f32, %out: f32):
      %20 = arith.divf %out, %cst_0 : f32
      %21 = arith.mulf %in, %cst_1 : f32
      %22 = arith.addf %21, %20 : f32
      linalg.yield %22 : f32
    } -> tensor<32x33xf32>
    flow.dispatch.tensor.store %19, %11, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : tensor<32x33xf32> -> !flow.dispatch.tensor<writeonly:tensor<33x33xf32>>
  }
  return
}

// The allocation should not happen when there is any unaligned size, e.g., 33 in this case.
//
// CHECK-LABEL: func.func @matmul_33x33x903168_f32
// CHECK-NOT: bufferization.alloc_tensor()
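For reference, the arithmetic behind this expectation, restated from the filter in the C++ change above (not something checked in the test file itself):

// targetVectorSize = copyVectorNumBits / bitwidth(f32) = 128 / 32 = 4
// innermost operand dims: 903168 % 4 == 0 passes, but 33 % 4 == 1 fails,
// so the 903168x33 input and the 32x33 init are not tile-distributable,
// the matmul is rejected by contractOpFilter, and no
// bufferization.alloc_tensor is created.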
