-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add code to evaluate different options
- Loading branch information
Showing
8 changed files
with
657 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
*.S | ||
*.dump | ||
*.out | ||
*.o |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,22 @@ | ||
MLIR Conv-1d Vectorization Experiments | ||
# MLIR Conv-1d Vectorization Experiments | ||
|
||
This repo contains all the experiments used to evaluate options | ||
for direct vectorization of 1d convolutions. | ||
|
||
|
||
The code in this repo takes an MLIR file, lowers it to LLVM IR, | ||
extracts the assembly for the conv1d function and runs it through | ||
llvm-mca. | ||
|
||
## How to run | ||
|
||
``` | ||
./run.py -m [path to mlir build dir] -o [option name] | ||
``` | ||
|
||
Currently, the supported options are: | ||
- scalar | ||
- scalar_unrolled | ||
- multi_reduction | ||
- unrolled_contraction | ||
- shuffled_contraction |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
// 1-D convolution of a 16-element input with a 3-tap filter, producing 14
// outputs: output[i] = sum_j input[i + j] * filter[j].
//
// The computation is expressed as two vector.multi_reduction ops over a
// 2x3x16 operand:
//   slice [0] holds three copies of the input, shifted by 0, 1 and 2 elements;
//   slice [1] holds the filter, with tap j broadcast across row j.
// A `mul` reduction over dim 0 multiplies the two slices elementwise and an
// `add` reduction over the tap dimension sums the three products.
func @conv1d_multi_reduction(%input : memref<16xf32>, %filter : memref<3xf32>, %output : memref<14xf32>)
    attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %c2 = constant 2 : index
  %f0 = constant 0.0 : f32
  // Zero pass-through for masked-off lanes and zero-initialised accumulator.
  %z0 = vector.broadcast %f0 : f32 to vector<16xf32>
  %v0 = vector.broadcast %f0 : f32 to vector<2x3x16xf32>
  %1 = vector.transfer_read %filter[%c0], %f0 : memref<3xf32>, vector<3xf32>
  // Only the first 14 lanes are loaded; lanes 14 and 15 take the zero
  // pass-through value, so the loads at offsets 1 and 2 stay inside the
  // 16-element input.
  %2 = vector.constant_mask [14] : vector<16xi1>
  // Row j of slice [0] is the input window starting at offset j.
  %3 = vector.expandload %input[%c0], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
  %v1 = vector.insert %3, %v0[0, 0] : vector<16xf32> into vector<2x3x16xf32>
  %4 = vector.expandload %input[%c1], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
  // Insert the window loaded at offset 1 (previously %3 was inserted again,
  // leaving %4 unused and making every row the unshifted input).
  %v2 = vector.insert %4, %v1[0, 1] : vector<16xf32> into vector<2x3x16xf32>
  %5 = vector.expandload %input[%c2], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
  // Insert the window loaded at offset 2 (same fix as above for %5).
  %v3 = vector.insert %5, %v2[0, 2] : vector<16xf32> into vector<2x3x16xf32>
  // Build slice [1]: broadcast the filter to 16x3 and transpose so that row j
  // contains filter[j] in every lane.
  %6 = vector.broadcast %1 : vector<3xf32> to vector<16x3xf32>
  %7 = vector.transpose %6, [1, 0] : vector<16x3xf32> to vector<3x16xf32>
  %v4 = vector.insert %7, %v3[1] : vector<3x16xf32> into vector<2x3x16xf32>
  // input-window * filter-tap, elementwise.
  %v5 = vector.multi_reduction #vector.kind<mul>, %v4 [0] : vector<2x3x16xf32> to vector<3x16xf32>
  // Sum over the three taps.
  %v6 = vector.multi_reduction #vector.kind<add>, %v5 [0] : vector<3x16xf32> to vector<16xf32>
  // Drop the two masked-off lanes before storing the 14 results.
  %v7 = vector.extract_strided_slice %v6 {offsets = [0], sizes=[14], strides=[1]} : vector<16xf32> to vector<14xf32>
  //vector.print %v7 : vector<14xf32>
  vector.transfer_write %v7, %output[%c0] : vector<14xf32> , memref<14xf32>
  return
}
|
||
// Prints (iters * 3 * 14) / total_time as an f64.
// The work count is one unit per (filter tap, output element) pair; 3 and 14
// match the filter and output memref sizes used by the kernel in this file.
func @print_perf(%iters: index, %total_time: f64) {
  %num_taps = constant 3 : index
  %num_outputs = constant 14 : index
  %work_per_call = muli %num_taps, %num_outputs : index
  %work_total = muli %iters, %work_per_call : index
  // index -> i64 -> f64 so the count can be divided by the elapsed time.
  %work_total_i64 = index_cast %work_total : index to i64
  %work_total_f64 = sitofp %work_total_i64 : i64 to f64
  %rate = divf %work_total_f64, %total_time : f64
  vector.print %rate : f64
  return
}
|
||
// Benchmark driver: fills the filter and input buffers, runs the kernel
// %iters times untimed, then %iters times between two @rtclock calls, and
// reports the result through @print_perf.
func @main() {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %c2 = constant 2 : index
  // Filter taps.
  %f1 = constant 0.02914738655090332 : f32
  %f2 = constant 0.8740115165710449 : f32
  %f3 = constant -0.858701229095459 : f32
  // Value every input element is filled with.
  %f4 = constant 1.0533758 : f32
  %iters = constant 1 : index
  %input = memref.alloc() : memref<16xf32>
  %filter = memref.alloc() : memref<3xf32>
  %output = memref.alloc() : memref<14xf32>
  memref.store %f1, %filter[%c0] : memref<3xf32>
  memref.store %f2, %filter[%c1] : memref<3xf32>
  memref.store %f3, %filter[%c2] : memref<3xf32>
  linalg.fill(%f4, %input) : f32, memref<16xf32>
  // Untimed pass over the kernel before measurement starts.
  scf.for %arg0 = %c0 to %iters step %c1 {
    call @conv1d_multi_reduction(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
  }
  // Timed pass.
  %t_start = call @rtclock() : () -> f64
  scf.for %arg0 = %c0 to %iters step %c1 {
    call @conv1d_multi_reduction(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
  }
  %t_end = call @rtclock() : () -> f64
  %t_conv = subf %t_end, %t_start: f64
  call @print_perf(%iters, %t_conv) : (index, f64) -> ()
  // Release the buffers allocated above; they were previously leaked.
  memref.dealloc %input : memref<16xf32>
  memref.dealloc %filter : memref<3xf32>
  memref.dealloc %output : memref<14xf32>
  return
}
|
||
func private @rtclock() -> f64 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#!/usr/bin/env python3 | ||
import argparse | ||
import os | ||
import subprocess | ||
|
||
# Disassembler executable and the flags passed to it.
objdump_binary = 'objdump'
objdump_flags = ['-D']

# Extra flags passed to llvm-mca (none at the moment).
llvm_mca_flags = []

# Values accepted by the -o command-line option; each one names an
# `<option>.mlir` input file.
option_choices = [
    'scalar',
    'scalar_unrolled',
    'multi_reduction',
    'unrolled_contraction',
    'shuffled_contraction',
]

# mlir-opt pass pipeline, applied in this order.
mlir_opt_flags = [
    '-test-vector-multi-reduction-lowering-patterns',
    '-canonicalize',
    '-convert-linalg-to-loops',
    '-convert-vector-to-llvm',
    '-convert-scf-to-std',
    '-convert-std-to-llvm',
]


def mlir_cpu_runner_flags(build_dir, object_filename):
    """Return the mlir-cpu-runner flags for one run.

    build_dir is the MLIR build directory that contains
    lib/libmlir_c_runner_utils.so; object_filename is where the runner is
    asked to dump the compiled object file.
    """
    shared_lib = f'-shared-libs={build_dir}/lib/libmlir_c_runner_utils.so'
    dump_target = f'-object-filename={object_filename}'
    return ['-O3', '-entry-point-result=void', shared_lib,
            '-dump-object-file', dump_target]
|
||
def _extract_function_asm(dump_lines, symbol):
    """Return the instruction text of the first function mentioning `symbol`.

    Capture starts at the first line containing `symbol` and stops after the
    first return instruction. Only lines that split into exactly three
    tab-separated fields are kept, and only their last field (the instruction
    text, including its trailing newline) is returned.
    """
    return_mnemonics = {'ret', 'retq'}
    instructions = []
    capturing = False
    for line in dump_lines:
        if not capturing:
            if symbol not in line:
                continue
            capturing = True
        fields = line.split('\t')
        if len(fields) != 3:
            continue
        instructions.append(fields[-1])
        # Accept both spellings of the return instruction: objdump builds
        # differ in whether they print the `q` suffix.
        if return_mnemonics.intersection(fields[-1].split()):
            break
    return instructions


def profile(args, obj_name):
    """Disassemble `obj_name`, isolate the conv1d kernel and run llvm-mca on it.

    Writes `<option>.dump` (full objdump output), `<option>.S` (kernel
    instructions only) and `<option>_llvm_mca.out`, then prints the first ten
    lines of the llvm-mca report.

    Raises:
        subprocess.CalledProcessError: if objdump or llvm-mca exits non-zero.
        RuntimeError: if no instructions for `conv1d_<option>` are found.
    """
    # Run objdump to get asm
    dumpfile = args.o + '.dump'
    with open(dumpfile, 'w') as f:
        subprocess.run([objdump_binary] + objdump_flags + [obj_name],
                       stdout=f, check=True)

    # Extract asm of relevant section
    symbol = 'conv1d_' + args.o
    with open(dumpfile, 'r') as f:
        asm = _extract_function_asm(f, symbol)
    if not asm:
        # Fail here with a clear message instead of handing llvm-mca an
        # empty file.
        raise RuntimeError(f"no instructions found for '{symbol}' in {dumpfile}")

    asm_file = args.o + '.S'
    with open(asm_file, 'w') as f:
        f.writelines(asm)

    # Run llvm-mca on asm
    llvm_mca_out_file = args.o + '_llvm_mca.out'
    with open(llvm_mca_out_file, 'w') as f:
        subprocess.run([args.llvm_mca] + llvm_mca_flags + [asm_file],
                       stdout=f, check=True)

    # Show only the first ten lines of the report.
    with open(llvm_mca_out_file, 'r') as f:
        for index, line in enumerate(f):
            if index >= 10:
                break
            print(line.strip())
|
||
def compile_and_run(args):
    """Lower `<option>.mlir` with mlir-opt and execute it with mlir-cpu-runner.

    The lowered IR is written to `<option>mlir.out`; the runner is asked to
    dump the compiled object to `<option>.o`. Both tools are looked up under
    `<args.m>/bin`.

    Returns:
        The object file name (`<option>.o`).

    Raises:
        subprocess.CalledProcessError: if mlir-opt exits non-zero.
        RuntimeError: if mlir-cpu-runner exits non-zero; the message carries
            the runner's captured stderr.
    """
    # Run mlir-opt
    mlir_opt = os.path.join(args.m, 'bin/mlir-opt')
    mlir_file = args.o + '.mlir'
    mlir_outfile = args.o + 'mlir.out'
    with open(mlir_outfile, 'w') as f:
        subprocess.run([mlir_opt] + mlir_opt_flags + [mlir_file],
                       stdout=f, check=True)

    # Run mlir-cpu-runner
    mlir_cpu_runner = os.path.join(args.m, 'bin/mlir-cpu-runner')
    obj_name = args.o + '.o'
    res = subprocess.run(
        [mlir_cpu_runner] + mlir_cpu_runner_flags(args.m, obj_name) + [mlir_outfile],
        capture_output=True)
    if res.returncode != 0:
        # The runner's output is captured, so a failure would otherwise be
        # invisible and a stale object file from an earlier run could be
        # profiled as if it were fresh.
        stderr_text = res.stderr.decode(errors='replace')
        raise RuntimeError(
            f"{mlir_cpu_runner} failed with exit code {res.returncode}:\n{stderr_text}")
    return obj_name
|
||
def run(args):
    """Evaluate one option: build and execute it, then profile the object file."""
    print(f"Evaluating ... {args.o}.mlir")
    # compile_and_run returns the object file name that profile disassembles.
    profile(args, compile_and_run(args))
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the options and evaluate the selected one.
    parser = argparse.ArgumentParser(
        description='Utility to evaluate conv1d vectorization options')
    parser.add_argument(
        '-m', '-mlir_build_dir',
        required=True,
        help='path to mlir build dir')
    parser.add_argument(
        '-o', '-option',
        choices=option_choices,
        default='scalar',
        help='which conv1d vectorization strategy to evaluate')
    parser.add_argument(
        '-llvm_mca',
        default='llvm-mca',
        help='llvm-mca binary to use for profiling')
    args = parser.parse_args()
    run(args)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Scalar 1-D convolution: for each of the 14 outputs, accumulate
// input[i + j] * filter[j] over the 3 filter taps and store the sum.
func @conv1d_scalar(%input : memref<16xf32>, %filter : memref<3xf32>, %output : memref<14xf32>)
    attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
  %lb = constant 0 : index
  %step = constant 1 : index
  %num_taps = constant 3 : index
  %num_outputs = constant 14 : index
  // Outer loop over output elements.
  scf.for %out_idx = %lb to %num_outputs step %step {
    %zero = constant 0.0 : f32
    // Inner loop over filter taps, carrying the running sum as an iter_arg.
    %sum = scf.for %tap = %lb to %num_taps step %step iter_args(%partial = %zero) -> (f32) {
      %in_idx = addi %out_idx, %tap : index
      %in_val = memref.load %input[%in_idx] : memref<16xf32>
      %tap_val = memref.load %filter[%tap] : memref<3xf32>
      %prod = mulf %in_val, %tap_val : f32
      %next = addf %partial, %prod : f32
      scf.yield %next : f32
    }
    memref.store %sum, %output[%out_idx] : memref<14xf32>
    // vector.print %sum : f32
  }
  return
}
|
||
// Prints (iters * 3 * 14) / total_time as an f64.
// The work count is one unit per (filter tap, output element) pair; 3 and 14
// match the loop bounds of @conv1d_scalar in this file.
func @print_perf(%iters: index, %total_time: f64) {
  %num_taps = constant 3 : index
  %num_outputs = constant 14 : index
  %work_per_call = muli %num_taps, %num_outputs : index
  %work_total = muli %iters, %work_per_call : index
  // index -> i64 -> f64 so the count can be divided by the elapsed time.
  %work_total_i64 = index_cast %work_total : index to i64
  %work_total_f64 = sitofp %work_total_i64 : i64 to f64
  %rate = divf %work_total_f64, %total_time : f64
  vector.print %rate : f64
  return
}
|
||
// Benchmark driver: fills the filter and input buffers, runs the kernel
// %iters times untimed, then %iters times between two @rtclock calls, and
// reports the result through @print_perf.
func @main() {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %c2 = constant 2 : index
  // Filter taps.
  %f1 = constant 0.02914738655090332 : f32
  %f2 = constant 0.8740115165710449 : f32
  %f3 = constant -0.858701229095459 : f32
  // Value every input element is filled with.
  %f4 = constant 1.0533758 : f32
  %iters = constant 1 : index
  %input = memref.alloc() : memref<16xf32>
  %filter = memref.alloc() : memref<3xf32>
  %output = memref.alloc() : memref<14xf32>
  memref.store %f1, %filter[%c0] : memref<3xf32>
  memref.store %f2, %filter[%c1] : memref<3xf32>
  memref.store %f3, %filter[%c2] : memref<3xf32>
  linalg.fill(%f4, %input) : f32, memref<16xf32>
  // Untimed pass over the kernel before measurement starts.
  scf.for %arg0 = %c0 to %iters step %c1 {
    call @conv1d_scalar(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
  }
  // Timed pass.
  %t_start = call @rtclock() : () -> f64
  scf.for %arg0 = %c0 to %iters step %c1 {
    call @conv1d_scalar(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
  }
  %t_end = call @rtclock() : () -> f64
  %t_conv = subf %t_end, %t_start: f64
  call @print_perf(%iters, %t_conv) : (index, f64) -> ()
  // Release the buffers allocated above; they were previously leaked.
  memref.dealloc %input : memref<16xf32>
  memref.dealloc %filter : memref<3xf32>
  memref.dealloc %output : memref<14xf32>
  return
}
|
||
func private @rtclock() -> f64 | ||
|
||
|
Oops, something went wrong.