Add code to evaluate different options
harsh-nod committed Jun 29, 2021
1 parent d572b54 commit 8827332
Showing 8 changed files with 657 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
*.S
*.dump
*.out
*.o
20 changes: 19 additions & 1 deletion README.md
@@ -1,4 +1,22 @@
MLIR Conv-1d Vectorization Experiments
# MLIR Conv-1d Vectorization Experiments

This repo contains all the experiments used to evaluate options
for direct vectorization of 1d convolutions.


The code in this repo takes an MLIR file, lowers it to LLVM IR,
extracts the assembly for the conv1d function, and runs it through
llvm-mca.

## How to run

```
./run.py -m [path to mlir build dir] -o [option name]
```

Currently, the supported options are listed below (a sketch for sweeping all of them follows the list):
- scalar
- scalar_unrolled
- multi_reduction
- unrolled_contraction
- shuffled_contraction
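
The sketch below is not part of the repo; it shows one way to sweep all of the supported options from Python. The build-directory path is a placeholder you would replace with your own MLIR build.

```python
# Hypothetical helper, not part of this repo: sweep every supported option.
import subprocess

MLIR_BUILD_DIR = "/path/to/mlir/build"  # placeholder for your MLIR build dir
OPTIONS = ["scalar", "scalar_unrolled", "multi_reduction",
           "unrolled_contraction", "shuffled_contraction"]

for opt in OPTIONS:
    print(f"=== {opt} ===")
    subprocess.run(["./run.py", "-m", MLIR_BUILD_DIR, "-o", opt], check=True)
```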
69 changes: 69 additions & 0 deletions multi_reduction.mlir
@@ -0,0 +1,69 @@
func @conv1d_multi_reduction(%input : memref<16xf32>, %filter : memref<3xf32>, %output : memref<14xf32>)
attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
%f0 = constant 0.0 : f32
%z0 = vector.broadcast %f0 : f32 to vector<16xf32>
%v0 = vector.broadcast %f0 : f32 to vector<2x3x16xf32>
%1 = vector.transfer_read %filter[%c0], %f0 : memref<3xf32>, vector<3xf32>
%2 = vector.constant_mask [14] : vector<16xi1>
%3 = vector.expandload %input[%c0], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
%v1 = vector.insert %3, %v0[0, 0] : vector<16xf32> into vector<2x3x16xf32>
%4 = vector.expandload %input[%c1], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
%v2 = vector.insert %4, %v1[0, 1] : vector<16xf32> into vector<2x3x16xf32>
%5 = vector.expandload %input[%c2], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
%v3 = vector.insert %5, %v2[0, 2] : vector<16xf32> into vector<2x3x16xf32>
%6 = vector.broadcast %1 : vector<3xf32> to vector<16x3xf32>
%7 = vector.transpose %6, [1, 0] : vector<16x3xf32> to vector<3x16xf32>
%v4 = vector.insert %7, %v3[1] : vector<3x16xf32> into vector<2x3x16xf32>
%v5 = vector.multi_reduction #vector.kind<mul>, %v4 [0] : vector<2x3x16xf32> to vector<3x16xf32>
%v6 = vector.multi_reduction #vector.kind<add>, %v5 [0] : vector<3x16xf32> to vector<16xf32>
%v7 = vector.extract_strided_slice %v6 {offsets = [0], sizes=[14], strides=[1]} : vector<16xf32> to vector<14xf32>
//vector.print %v7 : vector<14xf32>
vector.transfer_write %v7, %output[%c0] : vector<14xf32> , memref<14xf32>
return
}

func @print_perf(%iters: index, %total_time: f64) {
%cF = constant 3 : index
%cO = constant 14 : index
%flops_per_iter = muli %cF, %cO : index
%flops = muli %iters, %flops_per_iter : index
%flops_i64 = index_cast %flops : index to i64
%flops_f = sitofp %flops_i64 : i64 to f64
%flops_per_s = divf %flops_f, %total_time : f64
vector.print %flops_per_s : f64
return
}

func @main() {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
%f1 = constant 0.02914738655090332 : f32
%f2 = constant 0.8740115165710449 : f32
%f3 = constant -0.858701229095459 : f32
%f4 = constant 1.0533758 : f32
%iters = constant 1 : index
%input = memref.alloc() : memref<16xf32>
%filter = memref.alloc() : memref<3xf32>
%output = memref.alloc() : memref<14xf32>
memref.store %f1, %filter[%c0] : memref<3xf32>
memref.store %f2, %filter[%c1] : memref<3xf32>
memref.store %f3, %filter[%c2] : memref<3xf32>
linalg.fill(%f4, %input) : f32, memref<16xf32>
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_multi_reduction(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
}
%t_start = call @rtclock() : () -> f64
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_multi_reduction(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
}
%t_end = call @rtclock() : () -> f64
%t_conv = subf %t_end, %t_start : f64
call @print_perf(%iters, %t_conv) : (index, f64) -> ()
return
}

func private @rtclock() -> f64
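
For reference, below is a minimal NumPy sketch (not part of the commit) of the math @conv1d_multi_reduction computes: each of the 14 outputs is the dot product of the 3-tap filter with a sliding window of the 16-element input. It mirrors the broadcast-multiply-then-reduce structure, not the vector dialect ops themselves.

```python
import numpy as np

def conv1d_reference(inp: np.ndarray, filt: np.ndarray) -> np.ndarray:
    """Reference 1-D convolution for the 16 -> 14 shapes used above."""
    n_out = inp.shape[0] - filt.shape[0] + 1      # 16 - 3 + 1 = 14
    # Three shifted views of the input (analogous to the expandloads),
    # scaled by the broadcast filter, then summed over the filter taps
    # (analogous to the mul/add multi_reductions).
    windows = np.stack([inp[j:j + n_out] for j in range(filt.shape[0])])
    return (windows * filt[:, None]).sum(axis=0)

inp = np.full(16, 1.0533758, dtype=np.float32)
filt = np.array([0.02914738655090332, 0.8740115165710449,
                 -0.858701229095459], dtype=np.float32)
out = conv1d_reference(inp, filt)                 # shape (14,)
```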
99 changes: 99 additions & 0 deletions run.py
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
import argparse
import os
import subprocess

objdump_binary = 'objdump'
option_choices = ['scalar', 'scalar_unrolled', 'multi_reduction',
'unrolled_contraction', 'shuffled_contraction']
mlir_opt_flags = [
'-test-vector-multi-reduction-lowering-patterns',
'-canonicalize',
'-convert-linalg-to-loops',
'-convert-vector-to-llvm',
'-convert-scf-to-std',
'-convert-std-to-llvm',
]
mlir_cpu_runner_flags = lambda build_dir, object_filename : [
'-O3',
'-entry-point-result=void',
f'-shared-libs={build_dir}/lib/libmlir_c_runner_utils.so',
'-dump-object-file',
f'-object-filename={object_filename}',
]
objdump_flags = ['-D']
llvm_mca_flags = []

def profile(args, obj_name):
# Run objdump to get asm
dumpfile = args.o + '.dump'
f = open(dumpfile, 'w')
subprocess.run([objdump_binary] + objdump_flags + [obj_name], stdout=f)
f.close()

# Extract asm of relevant section
with open(dumpfile, 'r') as f:
data = f.readlines()
captured = []
capturing = False
for line in data:
if 'conv1d_' + args.o in line:
capturing = True
if capturing:
captured.append(line)
if capturing and 'retq' in line:
break

asm = []
for line in captured:
splits = line.split('\t')
if len(splits) == 3:
asm.append(splits[-1])
asm_file = args.o + '.S'
with open(asm_file, 'w') as f:
for line in asm:
f.write(line)

# Run llvm-mca on asm
llvm_mca_out_file = args.o + '_llvm_mca.out'
f = open(llvm_mca_out_file, 'w')
res = subprocess.run([args.llvm_mca] + llvm_mca_flags + [asm_file], stdout=f)
f.close()

with open(llvm_mca_out_file, 'r') as f:
count = 0
for line in f.readlines():
if count < 10:
print(line.strip())
count += 1

def compile_and_run(args):
# Run mlir-opt
mlir_opt = os.path.join(args.m, 'bin/mlir-opt')
mlir_file = args.o + '.mlir'
mlir_outfile = args.o + 'mlir.out'
f = open(mlir_outfile, 'w')
subprocess.run([mlir_opt] + mlir_opt_flags + [mlir_file], stdout=f)
f.close()

# Run mlir-cpu-runner
mlir_cpu_runner = os.path.join(args.m, 'bin/mlir-cpu-runner')
obj_name = args.o + '.o'
res = subprocess.run([mlir_cpu_runner] + mlir_cpu_runner_flags(args.m, obj_name) + [mlir_outfile],
capture_output=True)
return obj_name

def run(args):
print(f"Evaluating ... {args.o}.mlir")
obj_name = compile_and_run(args)
profile(args, obj_name)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Utility to evaluate conv1d vectorization options')
parser.add_argument('-m', '-mlir_build_dir', help='path to mlir build dir', required=True)
parser.add_argument('-o', '-option', default='scalar', choices=option_choices,
help='which conv1d vectorization strategy to evaluate')
parser.add_argument('-llvm_mca', default='llvm-mca', help='llvm-mca binary to use for profiling')
args = parser.parse_args()
run(args)
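
A side note on the asm extraction above: it assumes the usual three-field objdump layout (address, encoded bytes, mnemonic, separated by tabs) and keeps only the mnemonic column. The snippet below illustrates that heuristic on a fabricated sample line.

```python
# Illustration only; the sample line is fabricated, but it follows the
# typical "address:\tencoded bytes\tmnemonic" layout of objdump -D output.
sample = "  401130:\t62 f1 7c 48 10 07    \tvmovups (%rdi), %zmm0\n"
fields = sample.split("\t")
if len(fields) == 3:
    print(fields[-1].strip())   # -> vmovups (%rdi), %zmm0
```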

66 changes: 66 additions & 0 deletions scalar.mlir
@@ -0,0 +1,66 @@
func @conv1d_scalar(%input : memref<16xf32>, %filter : memref<3xf32>, %output : memref<14xf32>)
attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c14 = constant 14 : index
scf.for %i = %c0 to %c14 step %c1 {
%y = constant 0.0 : f32
%x = scf.for %j = %c0 to %c3 step %c1 iter_args(%acc = %y) -> (f32) {
%idx = addi %i, %j : index
%0 = memref.load %input[%idx] : memref<16xf32>
%1 = memref.load %filter[%j] : memref<3xf32>
%3 = mulf %0, %1 : f32
%4 = addf %acc, %3 : f32
scf.yield %4 : f32
}
memref.store %x, %output[%i] : memref<14xf32>
// vector.print %x : f32
}
return
}

func @print_perf(%iters: index, %total_time: f64) {
%cF = constant 3 : index
%cO = constant 14 : index
%flops_per_iter = muli %cF, %cO : index
%flops = muli %iters, %flops_per_iter : index
%flops_i64 = index_cast %flops : index to i64
%flops_f = sitofp %flops_i64 : i64 to f64
%flops_per_s = divf %flops_f, %total_time : f64
vector.print %flops_per_s : f64
return
}

func @main() {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
%f1 = constant 0.02914738655090332 : f32
%f2 = constant 0.8740115165710449 : f32
%f3 = constant -0.858701229095459 : f32
%f4 = constant 1.0533758 : f32
%iters = constant 1 : index
%input = memref.alloc() : memref<16xf32>
%filter = memref.alloc() : memref<3xf32>
%output = memref.alloc() : memref<14xf32>
memref.store %f1, %filter[%c0] : memref<3xf32>
memref.store %f2, %filter[%c1] : memref<3xf32>
memref.store %f3, %filter[%c2] : memref<3xf32>
linalg.fill(%f4, %input) : f32, memref<16xf32>
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_scalar(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
}
%t_start = call @rtclock() : () -> f64
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_scalar(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
}
%t_end = call @rtclock() : () -> f64
%t_conv = subf %t_end, %t_start : f64
call @print_perf(%iters, %t_conv) : (index, f64) -> ()
return
}

func private @rtclock() -> f64
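
For comparison with the vectorized variants, here is a minimal plain-Python sketch (not part of the commit) of what @conv1d_scalar does: a doubly nested loop with three multiply-accumulates per output element.

```python
# Plain-Python equivalent of @conv1d_scalar above, for illustration only.
def conv1d_scalar(inp, filt, out):
    for i in range(14):            # output positions
        acc = 0.0
        for j in range(3):         # filter taps
            acc += inp[i + j] * filt[j]
        out[i] = acc

inp = [1.0533758] * 16
filt = [0.02914738655090332, 0.8740115165710449, -0.858701229095459]
out = [0.0] * 14
conv1d_scalar(inp, filt, out)
```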


