[OSPP] Add MatMul and Conv2d optimization pass. (buddy-compiler#78)

wzclly · Oct 30, 2022 · 2737425 · 2737425
1 parent ff35d75
commit 2737425
Show file tree

Hide file tree

Showing 10 changed files with 650 additions and 37 deletions.
diff --git a/examples/MLIRLinalg/linalg-conv2d_nchw_fchw.mlir b/examples/MLIRLinalg/linalg-conv2d_nchw_fchw.mlir
@@ -0,0 +1,62 @@
+module {
+  func.func private @printMemrefF32(memref<*xf32>)
+  func.func @alloc_2d_filled_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: f32) -> memref<?x?x?x?xf32> { // Allocates a 4-D f32 memref of size %arg0 x %arg1 x %arg2 x %arg3 (despite the "2d" in the name) and stores %arg4 into every element.
+    %c0 = arith.constant 0 : index // Lower bound of all four loops.
+    %c1 = arith.constant 1 : index // Step of all four loops.
+    %0 = memref.alloc(%arg0, %arg1, %arg2, %arg3) : memref<?x?x?x?xf32>
+    scf.for %arg5 = %c0 to %arg0 step %c1 { // Index over dim 0.
+      scf.for %arg6 = %c0 to %arg1 step %c1 { // Index over dim 1.
+        scf.for %arg7 = %c0 to %arg2 step %c1 { // Index over dim 2.
+          scf.for %arg8 = %c0 to %arg3 step %c1 { // Index over dim 3.
+            memref.store %arg4, %0[%arg5, %arg6, %arg7, %arg8] : memref<?x?x?x?xf32>
+          }
+        }
+      }
+    }
+    return %0 : memref<?x?x?x?xf32> // The buffer is not deallocated here; it is handed to the caller.
+  }
+  func.func @conv_2d_nchw_fchw(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) { // Wrapper around linalg.conv_2d_nchw_fchw: %arg0 and %arg1 are the ins operands, %arg2 is the outs operand.
+    linalg.conv_2d_nchw_fchw ins(%arg0, %arg1 : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) outs(%arg2 : memref<?x?x?x?xf32>)
+    return
+  }
+  func.func @main() {
+    // Fill values: %cst (1.0) for the image and filter, %cst_0 (0.0) for the output.
+    %cst = arith.constant 1.000000e+00 : f32
+    %cst_0 = arith.constant 0.000000e+00 : f32
+
+    %current_image_n = arith.constant 2 : index
+    %current_image_c = arith.constant 2 : index
+    %current_image_h = arith.constant 7 : index
+    %current_image_w = arith.constant 7 : index
+
+    %current_filter_n = arith.constant 2 : index
+    %current_filter_c = arith.constant 2 : index
+    %current_filter_h = arith.constant 4 : index
+    %current_filter_w = arith.constant 4 : index
+
+    %current_output_n = arith.constant 2 : index
+    %current_output_c = arith.constant 2 : index
+    %current_output_h = arith.constant 4 : index
+    %current_output_w = arith.constant 4 : index
+
+    // Image: 2x2x7x7, every element set to %cst.
+    %image = call @alloc_2d_filled_f32(%current_image_n, %current_image_c, %current_image_h, %current_image_w, %cst) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    // Filter: 2x2x4x4, every element set to %cst.
+    %filter = call @alloc_2d_filled_f32(%current_filter_n, %current_filter_c, %current_filter_h, %current_filter_w, %cst) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    // Output: 2x2x4x4, every element set to %cst_0.
+    %output = call @alloc_2d_filled_f32(%current_output_n, %current_output_c, %current_output_h, %current_output_w, %cst_0) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+
+    call @conv_2d_nchw_fchw(%image, %filter, %output) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
+
+    %3 = memref.cast %output : memref<?x?x?x?xf32> to memref<*xf32> // Cast to the unranked type that @printMemrefF32 is declared with.
+
+    // Print the output buffer.
+    call @printMemrefF32(%3) : (memref<*xf32>) -> ()
+
+    memref.dealloc %output : memref<?x?x?x?xf32> // Release the three buffers obtained from @alloc_2d_filled_f32.
+    memref.dealloc %image : memref<?x?x?x?xf32>
+    memref.dealloc %filter : memref<?x?x?x?xf32>
+    return
+  }
+}
+
diff --git a/examples/MLIRLinalg/linalg-matmul.mlir b/examples/MLIRLinalg/linalg-matmul.mlir
@@ -1,47 +1,47 @@
-module {
-  func.func private @printMemrefF32(memref<*xf32>)
-
-  func.func @alloc_2d_filled_f32(%arg0: index, %arg1: index, %arg2: f32) -> memref<?x?xf32> {
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %0 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
-    scf.for %arg3 = %c0 to %arg0 step %c1 {
-      scf.for %arg4 = %c0 to %arg1 step %c1 {
-        memref.store %arg2, %0[%arg3, %arg4] : memref<?x?xf32>
-      }
+module{
+    func.func private @printMemrefF32(memref<*xf32>)
+
+    func.func @matmul(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>) { // Wrapper around linalg.matmul on dynamically shaped f32 memrefs.
+      linalg.matmul 
+        ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>) // The two input operands.
+       outs(%c:memref<?x?xf32>) // The output operand.
+      return
     }
-    return %0 : memref<?x?xf32>
-  }
 
-  func.func @matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
-    linalg.matmul ins (%arg0, %arg1: memref<?x?xf32>, memref<?x?xf32>)
-                  outs (%arg2: memref<?x?xf32>)
-    return
-  }
+    func.func @main(){
+       // Set up dims: A is MxK, B is KxN, C is MxN (all three sizes are 4 here).
+       %cM = arith.constant 4 : index
+       %cN = arith.constant 4 : index
+       %cK = arith.constant 4 : index
 
-  func.func @main() {
-    %c2 = arith.constant 2 : index
-    %c3 = arith.constant 3 : index
-    %c5 = arith.constant 5 : index
+       // Fill value used for A, B, and C.
+       %cf1 = arith.constant 1.0 : f32
 
-    // Initial data of input and output.
-    %cst = arith.constant 1.000000e+00 : f32
-    %cst_0 = arith.constant 0.000000e+00 : f32
+       %A = memref.alloc(%cM, %cK) : memref<?x?xf32>
+       %B = memref.alloc(%cK, %cN) : memref<?x?xf32>
+       %C = memref.alloc(%cM, %cN) : memref<?x?xf32>
 
-    %input1 = call @alloc_2d_filled_f32(%c5, %c3, %cst) : (index, index, f32) -> memref<?x?xf32>
-    %input2 = call @alloc_2d_filled_f32(%c3, %c2, %cst) : (index, index, f32) -> memref<?x?xf32>
-    %output = call @alloc_2d_filled_f32(%c5, %c2, %cst_0) : (index, index, f32) -> memref<?x?xf32>
+       linalg.fill
+        ins(%cf1 : f32)
+       outs(%A:memref<?x?xf32>)
 
-    call @matmul(%input1, %input2, %output) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
+       linalg.fill
+        ins(%cf1 : f32)
+       outs(%B:memref<?x?xf32>)
 
-    // Print output.
-    %print_output = memref.cast %output : memref<?x?xf32> to memref<*xf32>
-    call @printMemrefF32(%print_output) : (memref<*xf32>) -> ()
+       linalg.fill
+        ins(%cf1 : f32)
+       outs(%C:memref<?x?xf32>) // NOTE(review): %C starts at 1.0, while the removed version zero-filled its output; if linalg.matmul accumulates into its outs operand, the printed values include this 1.0 - confirm intended.
 
-    memref.dealloc %input1 : memref<?x?xf32>
-    memref.dealloc %input2 : memref<?x?xf32>
-    memref.dealloc %output : memref<?x?xf32>
+       call @matmul(%A, %B, %C) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
 
-    return
-  }
+       // Print the output buffer; cast to the unranked type that @printMemrefF32 is declared with.
+       %print_C = memref.cast %C : memref<?x?xf32> to memref<*xf32>
+       call @printMemrefF32(%print_C) : (memref<*xf32>) -> ()
+
+       memref.dealloc %C : memref<?x?xf32> // Release the three buffers allocated above.
+       memref.dealloc %B : memref<?x?xf32>
+       memref.dealloc %A : memref<?x?xf32>
+       return 
+    }
 }
diff --git a/examples/MLIRLinalg/makefile b/examples/MLIRLinalg/makefile
@@ -101,3 +101,64 @@ linalg-matmul-run:
 		-convert-vector-to-llvm -convert-memref-to-llvm -convert-arith-to-llvm \
 		-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+linalg-matmul-optimize-lower: # Run --matmul-optimize on linalg-matmul.mlir and write the result to ./log.mlir.
+	@${BUDDY_OPT} linalg-matmul.mlir ${MLIR_OPT_OPTIONS} \
+		--matmul-optimize="vec-size=16 kernel-m=2 kernel-n=4" \
+		-o ./log.mlir
+
+linalg-matmul-optimize-translate: # Run --matmul-optimize plus the conversion passes below, then pipe into ${MLIR_TRANSLATE} to write LLVM IR to log.ll.
+	@${BUDDY_OPT} linalg-matmul.mlir ${MLIR_OPT_OPTIONS} \
+		--matmul-optimize="vec-size=16 kernel-m=2 kernel-n=4" -convert-linalg-to-loops \
+		-lower-affine -convert-scf-to-cf -convert-vector-to-llvm \
+		-convert-memref-to-llvm -convert-arith-to-llvm \
+		-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+linalg-matmul-optimize-run: # Run --matmul-optimize plus the conversion passes below, then pipe into ${MLIR_CPU_RUNNER} with entry point main.
+	@${BUDDY_OPT} linalg-matmul.mlir ${MLIR_OPT_OPTIONS} \
+		--matmul-optimize="vec-size=16 kernel-m=2 kernel-n=4" -convert-linalg-to-loops \
+		-lower-affine -convert-scf-to-cf -convert-vector-to-llvm \
+		-convert-memref-to-llvm -convert-arith-to-llvm \
+		-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+linalg-conv2d_nchw_fchw-lower: # Run -convert-linalg-to-loops on the conv2d example and write the result to ./log.mlir.
+	@${MLIR_OPT} ./linalg-conv2d_nchw_fchw.mlir \
+		-convert-linalg-to-loops -o ./log.mlir
+
+linalg-conv2d_nchw_fchw-translate: # Run the conversion passes below with ${MLIR_OPT}, then pipe into ${MLIR_TRANSLATE} to write LLVM IR to log.ll.
+	@${MLIR_OPT} ./linalg-conv2d_nchw_fchw.mlir \
+		-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+		-convert-vector-to-llvm -convert-memref-to-llvm -convert-arith-to-llvm \
+		-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+linalg-conv2d_nchw_fchw-run: # Run the conversion passes below with ${MLIR_OPT}, then pipe into ${MLIR_CPU_RUNNER} with entry point main.
+	@${MLIR_OPT} linalg-conv2d_nchw_fchw.mlir ${MLIR_OPT_OPTIONS} \
+		-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+		-convert-vector-to-llvm -convert-memref-to-llvm -convert-arith-to-llvm \
+		-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+linalg-conv2d_nchw_fchw-optimize-lower: # Run --conv-optimize on the conv2d example and write the result to ./log.mlir; options kept identical to the -optimize-translate and -optimize-run targets.
+	@${BUDDY_OPT} ./linalg-conv2d_nchw_fchw.mlir \
+		--conv-optimize="kernel-m=2 kernel-n=3 vec-size=16" \
+		-o ./log.mlir
+
+linalg-conv2d_nchw_fchw-optimize-translate: # Run --conv-optimize plus the conversion passes below, then pipe into ${MLIR_TRANSLATE} to write LLVM IR to log.ll.
+	@${BUDDY_OPT} ./linalg-conv2d_nchw_fchw.mlir \
+		--conv-optimize="kernel-m=2 kernel-n=3 vec-size=16" \
+		-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+		-convert-vector-to-llvm -convert-memref-to-llvm -convert-arith-to-llvm \
+		-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+linalg-conv2d_nchw_fchw-optimize-run: # Run --conv-optimize plus the conversion passes below, then pipe into ${MLIR_CPU_RUNNER} with entry point main.
+	@${BUDDY_OPT} ./linalg-conv2d_nchw_fchw.mlir ${MLIR_OPT_OPTIONS} \
+		--conv-optimize="kernel-m=2 kernel-n=3 vec-size=16" \
+		-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+		-convert-vector-to-llvm -convert-memref-to-llvm -convert-arith-to-llvm \
+		-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
@@ -3,4 +3,6 @@ add_subdirectory(LowerBud)
 add_subdirectory(LowerDIP)
 add_subdirectory(LowerRVV)
 add_subdirectory(LowerDAP)
+add_subdirectory(MatMulOptimization)
+add_subdirectory(ConvOptimization)
 add_subdirectory(LowerVectorExp)
diff --git a/lib/Conversion/ConvOptimization/CMakeLists.txt b/lib/Conversion/ConvOptimization/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_mlir_library(ConvOptimization  # Library target compiled from ConvOptimize.cpp.
+  ConvOptimize.cpp
+)