Add code to evaluate different options
harsh-nod committed Jun 29, 2021
1 parent d572b54 commit 8827332
Showing 8 changed files with 657 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
*.S
*.dump
*.out
*.o
20 changes: 19 additions & 1 deletion README.md
@@ -1,4 +1,22 @@
MLIR Conv-1d Vectorization Experiments
# MLIR Conv-1d Vectorization Experiments

This repo contains all the experiments used to evaluate options
for direct vectorization of 1d convolutions.


The code in this repo takes an MLIR file, lowers it to LLVM IR,
extracts the assembly for the conv1d function, and runs it through
llvm-mca.

## How to run

```
./run.py -m [path to mlir build dir] -o [option name]
```

Currently, the supported options are listed below (a sketch for sweeping all of them follows the list):
- scalar
- scalar_unrolled
- multi_reduction
- unrolled_contraction
- shuffled_contraction
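
The sketch below is not part of the repo; it shows one way to sweep all of the supported options from Python. The build-directory path is a placeholder you would replace with your own MLIR build.

```python
# Hypothetical helper, not part of this repo: sweep every supported option.
import subprocess

MLIR_BUILD_DIR = "/path/to/mlir/build"  # placeholder for your MLIR build dir
OPTIONS = ["scalar", "scalar_unrolled", "multi_reduction",
           "unrolled_contraction", "shuffled_contraction"]

for opt in OPTIONS:
    print(f"=== {opt} ===")
    subprocess.run(["./run.py", "-m", MLIR_BUILD_DIR, "-o", opt], check=True)
```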
69 changes: 69 additions & 0 deletions multi_reduction.mlir
@@ -0,0 +1,69 @@
func @conv1d_multi_reduction(%input : memref<16xf32>, %filter : memref<3xf32>, %output : memref<14xf32>)
attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
%f0 = constant 0.0 : f32
%z0 = vector.broadcast %f0 : f32 to vector<16xf32>
%v0 = vector.broadcast %f0 : f32 to vector<2x3x16xf32>
%1 = vector.transfer_read %filter[%c0], %f0 : memref<3xf32>, vector<3xf32>
%2 = vector.constant_mask [14] : vector<16xi1>
%3 = vector.expandload %input[%c0], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
%v1 = vector.insert %3, %v0[0, 0] : vector<16xf32> into vector<2x3x16xf32>
%4 = vector.expandload %input[%c1], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
%v2 = vector.insert %4, %v1[0, 1] : vector<16xf32> into vector<2x3x16xf32>
%5 = vector.expandload %input[%c2], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
%v3 = vector.insert %5, %v2[0, 2] : vector<16xf32> into vector<2x3x16xf32>
%6 = vector.broadcast %1 : vector<3xf32> to vector<16x3xf32>
%7 = vector.transpose %6, [1, 0] : vector<16x3xf32> to vector<3x16xf32>
%v4 = vector.insert %7, %v3[1] : vector<3x16xf32> into vector<2x3x16xf32>
%v5 = vector.multi_reduction #vector.kind<mul>, %v4 [0] : vector<2x3x16xf32> to vector<3x16xf32>
%v6 = vector.multi_reduction #vector.kind<add>, %v5 [0] : vector<3x16xf32> to vector<16xf32>
%v7 = vector.extract_strided_slice %v6 {offsets = [0], sizes=[14], strides=[1]} : vector<16xf32> to vector<14xf32>
//vector.print %v7 : vector<14xf32>
vector.transfer_write %v7, %output[%c0] : vector<14xf32> , memref<14xf32>
return
}

func @print_perf(%iters: index, %total_time: f64) {
%cF = constant 3 : index
%cO = constant 14 : index
%flops_per_iter = muli %cF, %cO : index
%flops = muli %iters, %flops_per_iter : index
%flops_i64 = index_cast %flops : index to i64
%flops_f = sitofp %flops_i64 : i64 to f64
%flops_per_s = divf %flops_f, %total_time : f64
vector.print %flops_per_s : f64
return
}

func @main() {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
%f1 = constant 0.02914738655090332 : f32
%f2 = constant 0.8740115165710449 : f32
%f3 = constant -0.858701229095459 : f32
%f4 = constant 1.0533758 : f32
%iters = constant 1 : index
%input = memref.alloc() : memref<16xf32>
%filter = memref.alloc() : memref<3xf32>
%output = memref.alloc() : memref<14xf32>
memref.store %f1, %filter[%c0] : memref<3xf32>
memref.store %f2, %filter[%c1] : memref<3xf32>
memref.store %f3, %filter[%c2] : memref<3xf32>
linalg.fill(%f4, %input) : f32, memref<16xf32>
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_multi_reduction(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
}
%t_start = call @rtclock() : () -> f64
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_multi_reduction(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
}
%t_end = call @rtclock() : () -> f64
%t_conv = subf %t_end, %t_start : f64
call @print_perf(%iters, %t_conv) : (index, f64) -> ()
return
}

func private @rtclock() -> f64
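
For reference, below is a minimal NumPy sketch (not part of the commit) of the math @conv1d_multi_reduction computes: each of the 14 outputs is the dot product of the 3-tap filter with a sliding window of the 16-element input. It mirrors the broadcast-multiply-then-reduce structure, not the vector dialect ops themselves.

```python
import numpy as np

def conv1d_reference(inp: np.ndarray, filt: np.ndarray) -> np.ndarray:
    """Reference 1-D convolution for the 16 -> 14 shapes used above."""
    n_out = inp.shape[0] - filt.shape[0] + 1      # 16 - 3 + 1 = 14
    # Three shifted views of the input (analogous to the expandloads),
    # scaled by the broadcast filter, then summed over the filter taps
    # (analogous to the mul/add multi_reductions).
    windows = np.stack([inp[j:j + n_out] for j in range(filt.shape[0])])
    return (windows * filt[:, None]).sum(axis=0)

inp = np.full(16, 1.0533758, dtype=np.float32)
filt = np.array([0.02914738655090332, 0.8740115165710449,
                 -0.858701229095459], dtype=np.float32)
out = conv1d_reference(inp, filt)                 # shape (14,)
```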
99 changes: 99 additions & 0 deletions run.py
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
import argparse
import os
import subprocess

objdump_binary = 'objdump'
option_choices = ['scalar', 'scalar_unrolled', 'multi_reduction',
'unrolled_contraction', 'shuffled_contraction']
mlir_opt_flags = [
'-test-vector-multi-reduction-lowering-patterns',
'-canonicalize',
'-convert-linalg-to-loops',
'-convert-vector-to-llvm',
'-convert-scf-to-std',
'-convert-std-to-llvm',
]
mlir_cpu_runner_flags = lambda build_dir, object_filename : [
'-O3',
'-entry-point-result=void',
f'-shared-libs={build_dir}/lib/libmlir_c_runner_utils.so',
'-dump-object-file',
f'-object-filename={object_filename}',
]
objdump_flags = ['-D']
llvm_mca_flags = []

def profile(args, obj_name):
# Run objdump to get asm
dumpfile = args.o + '.dump'
f = open(dumpfile, 'w')
subprocess.run([objdump_binary] + objdump_flags + [obj_name], stdout=f)
f.close()

# Extract asm of relevant section
with open(dumpfile, 'r') as f:
data = f.readlines()
captured = []
capturing = False
for line in data:
if 'conv1d_' + args.o in line:
capturing = True
if capturing:
captured.append(line)
if capturing and 'retq' in line:
break

asm = []
for line in captured:
splits = line.split('\t')
if len(splits) == 3:
asm.append(splits[-1])
asm_file = args.o + '.S'
with open(asm_file, 'w') as f:
for line in asm:
f.write(line)

# Run llvm-mca on asm
llvm_mca_out_file = args.o + '_llvm_mca.out'
f = open(llvm_mca_out_file, 'w')
res = subprocess.run([args.llvm_mca] + llvm_mca_flags + [asm_file], stdout=f)
f.close()

with open(llvm_mca_out_file, 'r') as f:
count = 0
for line in f.readlines():
if count < 10:
print(line.strip())
count += 1

def compile_and_run(args):
# Run mlir-opt
mlir_opt = os.path.join(args.m, 'bin/mlir-opt')
mlir_file = args.o + '.mlir'
mlir_outfile = args.o + 'mlir.out'
f = open(mlir_outfile, 'w')
subprocess.run([mlir_opt] + mlir_opt_flags + [mlir_file], stdout=f)
f.close()

# Run mlir-cpu-runner
mlir_cpu_runner = os.path.join(args.m, 'bin/mlir-cpu-runner')
obj_name = args.o + '.o'
res = subprocess.run([mlir_cpu_runner] + mlir_cpu_runner_flags(args.m, obj_name) + [mlir_outfile],
capture_output=True)
return obj_name

def run(args):
print(f"Evaluating ... {args.o}.mlir")
obj_name = compile_and_run(args)
profile(args, obj_name)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Utility to evaluate conv1d vectorization options')
parser.add_argument('-m', '-mlir_build_dir', help='path to mlir build dir', required=True)
parser.add_argument('-o', '-option', default='scalar', choices=option_choices,
help='which conv1d vectorization strategy to evaluate')
parser.add_argument('-llvm_mca', default='llvm-mca', help='llvm-mca binary to use for profiling')
args = parser.parse_args()
run(args)
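
A side note on the asm extraction above: it assumes the usual three-field objdump layout (address, encoded bytes, mnemonic, separated by tabs) and keeps only the mnemonic column. The snippet below illustrates that heuristic on a fabricated sample line.

```python
# Illustration only; the sample line is fabricated, but it follows the
# typical "address:\tencoded bytes\tmnemonic" layout of objdump -D output.
sample = "  401130:\t62 f1 7c 48 10 07    \tvmovups (%rdi), %zmm0\n"
fields = sample.split("\t")
if len(fields) == 3:
    print(fields[-1].strip())   # -> vmovups (%rdi), %zmm0
```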

66 changes: 66 additions & 0 deletions scalar.mlir
@@ -0,0 +1,66 @@
func @conv1d_scalar(%input : memref<16xf32>, %filter : memref<3xf32>, %output : memref<14xf32>)
attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c14 = constant 14 : index
scf.for %i = %c0 to %c14 step %c1 {
%y = constant 0.0 : f32
%x = scf.for %j = %c0 to %c3 step %c1 iter_args(%acc = %y) -> (f32) {
%idx = addi %i, %j : index
%0 = memref.load %input[%idx] : memref<16xf32>
%1 = memref.load %filter[%j] : memref<3xf32>
%3 = mulf %0, %1 : f32
%4 = addf %acc, %3 : f32
scf.yield %4 : f32
}
memref.store %x, %output[%i] : memref<14xf32>
// vector.print %x : f32
}
return
}

func @print_perf(%iters: index, %total_time: f64) {
%cF = constant 3 : index
%cO = constant 14 : index
%flops_per_iter = muli %cF, %cO : index
%flops = muli %iters, %flops_per_iter : index
%flops_i64 = index_cast %flops : index to i64
%flops_f = sitofp %flops_i64 : i64 to f64
%flops_per_s = divf %flops_f, %total_time : f64
vector.print %flops_per_s : f64
return
}

func @main() {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
%f1 = constant 0.02914738655090332 : f32
%f2 = constant 0.8740115165710449 : f32
%f3 = constant -0.858701229095459 : f32
%f4 = constant 1.0533758 : f32
%iters = constant 1 : index
%input = memref.alloc() : memref<16xf32>
%filter = memref.alloc() : memref<3xf32>
%output = memref.alloc() : memref<14xf32>
memref.store %f1, %filter[%c0] : memref<3xf32>
memref.store %f2, %filter[%c1] : memref<3xf32>
memref.store %f3, %filter[%c2] : memref<3xf32>
linalg.fill(%f4, %input) : f32, memref<16xf32>
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_scalar(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
}
%t_start = call @rtclock() : () -> f64
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_scalar(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
}
%t_end = call @rtclock() : () -> f64
%t_conv = subf %t_end, %t_start : f64
call @print_perf(%iters, %t_conv) : (index, f64) -> ()
return
}

func private @rtclock() -> f64
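
For comparison with the vectorized variants, here is a minimal plain-Python sketch (not part of the commit) of what @conv1d_scalar does: a doubly nested loop with three multiply-accumulates per output element.

```python
# Plain-Python equivalent of @conv1d_scalar above, for illustration only.
def conv1d_scalar(inp, filt, out):
    for i in range(14):            # output positions
        acc = 0.0
        for j in range(3):         # filter taps
            acc += inp[i + j] * filt[j]
        out[i] = acc

inp = [1.0533758] * 16
filt = [0.02914738655090332, 0.8740115165710449, -0.858701229095459]
out = [0.0] * 14
conv1d_scalar(inp, filt, out)
```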


