Skip to content

Commit

Permalink
Refactor and implement 2 variations of shuffling.
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolasvasilache committed Jul 2, 2021
1 parent 8827332 commit 1958c19
Show file tree
Hide file tree
Showing 8 changed files with 380 additions and 258 deletions.
69 changes: 37 additions & 32 deletions multi_reduction.mlir
Original file line number Diff line number Diff line change
@@ -1,38 +1,39 @@
// 1-D convolution written as two vector.multi_reduction ops over a 2 x K x M
// tensor: slice [0] holds K shifted copies of the input, slice [1] holds the
// filter taps broadcast across all lanes. Reducing dim 0 with `mul` multiplies
// input by filter; reducing the resulting K rows with `add` sums over the taps.
//
// NOTE: this listing is a rendered diff. The lines with literal sizes
// (16 / 3 / 14) are the pre-commit version and are kept as recorded; the lines
// using ${M} / ${K} / ${N} are the post-commit template.
// NOTE: the three explicit loads/inserts below hard-code K == 3.
func @conv1d_multi_reduction(%input : memref<16xf32>, %filter : memref<3xf32>, %output : memref<14xf32>)
attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
func @conv1d_multi_reduction(%input : memref<${M}xf32>, %filter : memref<${K}xf32>, %output : memref<${N}xf32>)
attributes { passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
%f0 = constant 0.0 : f32
%z0 = vector.broadcast %f0 : f32 to vector<16xf32>
%v0 = vector.broadcast %f0 : f32 to vector<2x3x16xf32>
%1 = vector.transfer_read %filter[%c0], %f0 : memref<3xf32>, vector<3xf32>
%2 = vector.constant_mask [14] : vector<16xi1>
%3 = vector.expandload %input[%c0], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
%v1 = vector.insert %3, %v0[0, 0] : vector<16xf32> into vector<2x3x16xf32>
%4 = vector.expandload %input[%c1], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
%v2 = vector.insert %3, %v1[0, 1] : vector<16xf32> into vector<2x3x16xf32>
%5 = vector.expandload %input[%c2], %2, %z0 : memref<16xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
%v3 = vector.insert %3, %v2[0, 2] : vector<16xf32> into vector<2x3x16xf32>
%6 = vector.broadcast %1 : vector<3xf32> to vector<16x3xf32>
%7 = vector.transpose %6, [1, 0] : vector<16x3xf32> to vector<3x16xf32>
%v4 = vector.insert %7, %v3[1] : vector<3x16xf32> into vector<2x3x16xf32>
%v5 = vector.multi_reduction #vector.kind<mul>, %v4 [0] : vector<2x3x16xf32> to vector<3x16xf32>
%v6 = vector.multi_reduction #vector.kind<add>, %v5 [0] : vector<3x16xf32> to vector<16xf32>
%v7 = vector.extract_strided_slice %v6 {offsets = [0], sizes=[14], strides=[1]} : vector<16xf32> to vector<14xf32>
//vector.print %v7 : vector<14xf32>
vector.transfer_write %v7, %output[%c0] : vector<14xf32> , memref<14xf32>
// %z0: all-zero vector supplied as the third operand of every expandload below.
%z0 = vector.broadcast %f0 : f32 to vector<${M}xf32>
// %v0: zero-initialised 2 x K x M tensor that the rows are inserted into.
%v0 = vector.broadcast %f0 : f32 to vector<2x${K}x${M}xf32>
// %1: the K filter taps.
%1 = vector.transfer_read %filter[%c0], %f0 : memref<${K}xf32>, vector<${K}xf32>
// %2: mask with the first N of M lanes enabled (N output elements).
%2 = vector.constant_mask [${N}] : vector<${M}xi1>
// Row j of slice [0] must hold the input shifted by j, i.e. the load that
// starts at offset %c<j>.
%3 = vector.expandload %input[%c0], %2, %z0 : memref<${M}xf32>, vector<${M}xi1>, vector<${M}xf32> into vector<${M}xf32>
%v1 = vector.insert %3, %v0[0, 0] : vector<${M}xf32> into vector<2x${K}x${M}xf32>
%4 = vector.expandload %input[%c1], %2, %z0 : memref<${M}xf32>, vector<${M}xi1>, vector<${M}xf32> into vector<${M}xf32>
// FIX: insert the offset-1 load (%4). The original inserted %3 again, leaving
// %4 unused and making row 1 a duplicate of row 0.
%v2 = vector.insert %4, %v1[0, 1] : vector<${M}xf32> into vector<2x${K}x${M}xf32>
%5 = vector.expandload %input[%c2], %2, %z0 : memref<${M}xf32>, vector<${M}xi1>, vector<${M}xf32> into vector<${M}xf32>
// FIX: insert the offset-2 load (%5) instead of %3, for the same reason.
%v3 = vector.insert %5, %v2[0, 2] : vector<${M}xf32> into vector<2x${K}x${M}xf32>
// Broadcast the taps to M x K and transpose to K x M so that row j carries
// filter[j] in every lane; this becomes slice [1].
%6 = vector.broadcast %1 : vector<${K}xf32> to vector<${M}x${K}xf32>
%7 = vector.transpose %6, [1, 0] : vector<${M}x${K}xf32> to vector<${K}x${M}xf32>
%v4 = vector.insert %7, %v3[1] : vector<${K}x${M}xf32> into vector<2x${K}x${M}xf32>
// Multiply slice [0] by slice [1] element-wise (mul-reduce over dim 0) ...
%v5 = vector.multi_reduction #vector.kind<mul>, %v4 [0] : vector<2x${K}x${M}xf32> to vector<${K}x${M}xf32>
// ... then sum the K per-tap rows (add-reduce over dim 0).
%v6 = vector.multi_reduction #vector.kind<add>, %v5 [0] : vector<${K}x${M}xf32> to vector<${M}xf32>
// Keep only the first N lanes and store them to %output.
%v7 = vector.extract_strided_slice %v6 {offsets = [0], sizes=[${N}], strides=[1]} : vector<${M}xf32> to vector<${N}xf32>
//vector.print %v7 : vector<${N}xf32>
vector.transfer_write %v7, %output[%c0] : vector<${N}xf32> , memref<${N}xf32>
return
}

// Prints the measured wall time and a derived throughput figure.
//   %iters      - number of timed kernel invocations.
//   %total_time - elapsed time for those invocations, as computed by the caller
//                 from two @rtclock samples.
// Output (via vector.print): %total_time, then flops / %total_time.
// NOTE: this listing is a rendered diff; the literal 3 / 14 constants are the
// pre-commit lines and the ${K} / ${N} constants are the post-commit lines.
func @print_perf(%iters: index, %total_time: f64) {
%cF = constant 3 : index
%cO = constant 14 : index
%cF = constant ${K} : index
%cO = constant ${N} : index
// Work per kernel call is modelled as K * N.
// NOTE(review): this counts one operation per multiply-accumulate; if the mul
// and the add are meant to be counted separately it should be 2 * K * N -
// confirm the intended convention.
%flops_per_iter = muli %cF, %cO : index
%flops = muli %iters, %flops_per_iter : index
// index -> i64 -> f64 so the count can be divided by the f64 time.
%flops_i64 = index_cast %flops : index to i64
%flops_f = sitofp %flops_i64 : i64 to f64
%flops_per_s = divf %flops_f, %total_time : f64
vector.print %total_time : f64
vector.print %flops_per_s : f64
return
}
Expand All @@ -45,25 +46,29 @@ func @main() {
%f2 = constant 0.8740115165710449 : f32
%f3 = constant -0.858701229095459 : f32
%f4 = constant 1.0533758 : f32
%iters = constant 1 : index
%input = memref.alloc() : memref<16xf32>
%filter = memref.alloc() : memref<3xf32>
%output = memref.alloc() : memref<14xf32>
memref.store %f1, %filter[%c0] : memref<3xf32>
memref.store %f2, %filter[%c1] : memref<3xf32>
memref.store %f3, %filter[%c2] : memref<3xf32>
linalg.fill(%f4, %input) : f32, memref<16xf32>
%iters = constant ${ITERS} : index
%input = memref.alloc() : memref<${M}xf32>
%filter = memref.alloc() : memref<${K}xf32>
%output = memref.alloc() : memref<${N}xf32>
memref.store %f1, %filter[%c0] : memref<${K}xf32>
memref.store %f2, %filter[%c1] : memref<${K}xf32>
memref.store %f3, %filter[%c2] : memref<${K}xf32>
linalg.fill(%f4, %input) : f32, memref<${M}xf32>
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_multi_reduction(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
call @conv1d_multi_reduction(%input, %filter, %output) : (memref<${M}xf32>, memref<${K}xf32>, memref<${N}xf32>) -> ()
}
%t_start = call @rtclock() : () -> f64
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_multi_reduction(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
call @conv1d_multi_reduction(%input, %filter, %output) : (memref<${M}xf32>, memref<${K}xf32>, memref<${N}xf32>) -> ()
}
%t_end = call @rtclock() : () -> f64
%t_conv = subf %t_end, %t_start: f64
call @print_perf(%iters, %t_conv) : (index, f64) -> ()

%p = memref.cast %output : memref<${N}xf32> to memref<*xf32>
call @print_memref_f32(%p) : (memref<*xf32>) -> ()
return
}

// External (body-less) declarations resolved at run time.
// NOTE(review): presumably provided by the shared libraries that run.py passes
// to mlir-cpu-runner via -shared-libs - confirm.
// @rtclock: returns an f64 timestamp; @main samples it before and after the
// timed loop.
func private @rtclock() -> f64
// @print_memref_f32: takes an unranked f32 memref; @main passes the output
// buffer to it after casting to memref<*xf32>.
func private @print_memref_f32(%ptr : memref<*xf32>)
36 changes: 29 additions & 7 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,30 @@

# Module-level tool configuration.
# NOTE: this listing is a rendered diff with indentation stripped; where a line
# appears twice in slightly different form, the first is the pre-commit text
# and the second is the post-commit text.

# Name of the disassembler binary.
objdump_binary = 'objdump'
# Names of the available conv1d strategies.
# NOTE(review): presumably the accepted values of the strategy argument parsed
# at the bottom of the file - the add_argument call is not visible here.
option_choices = ['scalar', 'scalar_unrolled', 'multi_reduction',
'unrolled_contraction', 'shuffled_contraction']
'unrolled_contraction', 'shuffled_contraction_parallel_reduction', 'shuffled_contraction_reduction_parallel']
# Pass pipeline given to mlir-opt by compile_and_run, in this order.
mlir_opt_flags = [
'-test-vector-multi-reduction-lowering-patterns',
'-test-vector-contraction-conversion=vector-outerproduct=1',
'-canonicalize',
'-convert-linalg-to-loops',
'-convert-vector-to-llvm',
'-convert-scf-to-std',
'-convert-std-to-llvm',
'-canonicalize',
]
# Flags for the mlir-cpu-runner invocation that writes an object file to
# `object_filename`; `build_dir` is the directory containing lib/*.so.
mlir_cpu_runner_flags = lambda build_dir, object_filename : [
mlir_cpu_runner_dump_object_flags = lambda build_dir, object_filename : [
'-O3',
'-entry-point-result=void',
f'-shared-libs={build_dir}/lib/libmlir_c_runner_utils.so',
f'-shared-libs={build_dir}/lib/libmlir_c_runner_utils.so,{build_dir}/lib/libmlir_runner_utils.so',
'-dump-object-file',
f'-object-filename={object_filename}',
]
# Flags for the mlir-cpu-runner invocation that executes `main`.
# `object_filename` is accepted for signature parity with the dump variant but
# is not used in the returned list.
mlir_cpu_runner_run_flags = lambda build_dir, object_filename : [
'-O3',
'-e=main',
'-entry-point-result=void',
f'-shared-libs={build_dir}/lib/libmlir_c_runner_utils.so,{build_dir}/lib/libmlir_runner_utils.so',
]
# Flags passed to objdump.
objdump_flags = ['-D']
# Extra flags for llvm-mca (none by default).
llvm_mca_flags = []

Expand All @@ -34,10 +42,11 @@ def profile(args, obj_name):
# Extract asm of relevant section
with open(dumpfile, 'r') as f:
data = f.readlines()

captured = []
capturing = False
for line in data:
if 'conv1d_' + args.o in line:
if '<conv1d_' in line:
capturing = True
if capturing:
captured.append(line)
Expand Down Expand Up @@ -72,15 +81,27 @@ def compile_and_run(args):
mlir_opt = os.path.join(args.m, 'bin/mlir-opt')
mlir_file = args.o + '.mlir'
mlir_outfile = args.o + 'mlir.out'

print(" ".join(['mkdir'] + ['-p'] + [os.path.dirname(mlir_outfile)]))
p = subprocess.Popen(['mkdir'] + ['-p'] + [os.path.dirname(mlir_outfile)])
p.wait()

f = open(mlir_outfile, 'w')
subprocess.run([mlir_opt] + mlir_opt_flags + [mlir_file], stdout=f)
cat = subprocess.Popen(['cat'] + [os.path.basename(mlir_file)] , stdout=subprocess.PIPE)
sed = subprocess.Popen(['sed'] + ['s/${ITERS}/1000000/g'] , stdin=cat.stdout, stdout=subprocess.PIPE)
sed = subprocess.Popen(['sed'] + ['s/${M}/16/g'] , stdin=sed.stdout, stdout=subprocess.PIPE)
sed = subprocess.Popen(['sed'] + ['s/${N}/14/g'] , stdin=sed.stdout, stdout=subprocess.PIPE)
sed = subprocess.Popen(['sed'] + ['s/${K}/3/g'] , stdin=sed.stdout, stdout=subprocess.PIPE)
subprocess.run([mlir_opt] + mlir_opt_flags + ['-'], stdin=sed.stdout, stdout=f)
cat.stdout.close()
f.close()

# Run mlir-cpu-runner
mlir_cpu_runner = os.path.join(args.m, 'bin/mlir-cpu-runner')
obj_name = args.o + '.o'
res = subprocess.run([mlir_cpu_runner] + mlir_cpu_runner_flags(args.m, obj_name) + [mlir_outfile],
capture_output=True)
subprocess.run([mlir_cpu_runner] + mlir_cpu_runner_run_flags(args.m, obj_name) + [mlir_outfile])
print(" ".join([mlir_cpu_runner] + mlir_cpu_runner_dump_object_flags(args.m, obj_name) + [mlir_outfile]))
subprocess.run([mlir_cpu_runner] + mlir_cpu_runner_dump_object_flags(args.m, obj_name) + [mlir_outfile])
return obj_name

def run(args):
Expand All @@ -95,5 +116,6 @@ def run(args):
help='which conv1d vectorization strategy to evaluate')
parser.add_argument('-llvm_mca', default='llvm-mca', help='llvm-mca binary to use for profiling')
args = parser.parse_args()
args.o = 'outputs/' + args.o + '/' + args.o
run(args)

46 changes: 24 additions & 22 deletions scalar.mlir
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
// Scalar reference 1-D convolution:
//   output[i] = sum over j in [0, K) of input[i + j] * filter[j], for i in [0, N).
// NOTE: this listing is a rendered diff; lines with literal sizes (16 / 3 / 14)
// are the pre-commit version and lines using ${M} / ${K} / ${N} are the
// post-commit template.
func @conv1d_scalar(%input : memref<16xf32>, %filter : memref<3xf32>, %output : memref<14xf32>)
attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
func @conv1d_scalar(%input : memref<${M}xf32>, %filter : memref<${K}xf32>, %output : memref<${N}xf32>)
attributes { passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c14 = constant 14 : index
// NOTE(review): after templating, %c3 holds K and %c14 holds N; the SSA names
// are stale and only match the default K = 3, N = 14.
%c3 = constant ${K} : index
%c14 = constant ${N} : index
// Outer loop: one iteration per output element.
scf.for %i = %c0 to %c14 step %c1 {
%y = constant 0.0 : f32
// Inner loop: accumulate the K products in %acc, starting from 0.0.
%x = scf.for %j = %c0 to %c3 step %c1 iter_args(%acc = %y) -> (f32) {
%idx = addi %i, %j : index
%0 = memref.load %input[%idx] : memref<16xf32>
%1 = memref.load %filter[%j] : memref<3xf32>
%0 = memref.load %input[%idx] : memref<${M}xf32>
%1 = memref.load %filter[%j] : memref<${K}xf32>
%3 = mulf %0, %1 : f32
%4 = addf %acc, %3 : f32
scf.yield %4 : f32
}
memref.store %x, %output[%i] : memref<14xf32>
// vector.print %x : f32
memref.store %x, %output[%i] : memref<${N}xf32>
}
return
}

// Prints the measured wall time and a derived throughput figure.
//   %iters      - number of timed kernel invocations.
//   %total_time - elapsed time for those invocations, as computed by the caller
//                 from two @rtclock samples.
// Output (via vector.print): %total_time, then flops / %total_time.
// NOTE: this listing is a rendered diff; the literal 3 / 14 constants are the
// pre-commit lines and the ${K} / ${N} constants are the post-commit lines.
func @print_perf(%iters: index, %total_time: f64) {
%cF = constant 3 : index
%cO = constant 14 : index
%cF = constant ${K} : index
%cO = constant ${N} : index
// Work per kernel call is modelled as K * N.
// NOTE(review): this counts one operation per multiply-accumulate; if the mul
// and the add are meant to be counted separately it should be 2 * K * N -
// confirm the intended convention.
%flops_per_iter = muli %cF, %cO : index
%flops = muli %iters, %flops_per_iter : index
// index -> i64 -> f64 so the count can be divided by the f64 time.
%flops_i64 = index_cast %flops : index to i64
%flops_f = sitofp %flops_i64 : i64 to f64
%flops_per_s = divf %flops_f, %total_time : f64
vector.print %total_time : f64
vector.print %flops_per_s : f64
return
}
Expand All @@ -40,27 +40,29 @@ func @main() {
%f2 = constant 0.8740115165710449 : f32
%f3 = constant -0.858701229095459 : f32
%f4 = constant 1.0533758 : f32
%iters = constant 1 : index
%input = memref.alloc() : memref<16xf32>
%filter = memref.alloc() : memref<3xf32>
%output = memref.alloc() : memref<14xf32>
memref.store %f1, %filter[%c0] : memref<3xf32>
memref.store %f2, %filter[%c1] : memref<3xf32>
memref.store %f3, %filter[%c2] : memref<3xf32>
linalg.fill(%f4, %input) : f32, memref<16xf32>
%iters = constant ${ITERS} : index
%input = memref.alloc() : memref<${M}xf32>
%filter = memref.alloc() : memref<${K}xf32>
%output = memref.alloc() : memref<${N}xf32>
memref.store %f1, %filter[%c0] : memref<${K}xf32>
memref.store %f2, %filter[%c1] : memref<${K}xf32>
memref.store %f3, %filter[%c2] : memref<${K}xf32>
linalg.fill(%f4, %input) : f32, memref<${M}xf32>
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_scalar(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
call @conv1d_scalar(%input, %filter, %output) : (memref<${M}xf32>, memref<${K}xf32>, memref<${N}xf32>) -> ()
}
%t_start = call @rtclock() : () -> f64
scf.for %arg0 = %c0 to %iters step %c1 {
call @conv1d_scalar(%input, %filter, %output) : (memref<16xf32>, memref<3xf32>, memref<14xf32>) -> ()
call @conv1d_scalar(%input, %filter, %output) : (memref<${M}xf32>, memref<${K}xf32>, memref<${N}xf32>) -> ()
}
%t_end = call @rtclock() : () -> f64
%t_conv = subf %t_end, %t_start: f64
call @print_perf(%iters, %t_conv) : (index, f64) -> ()

%p = memref.cast %output : memref<${N}xf32> to memref<*xf32>
call @print_memref_f32(%p) : (memref<*xf32>) -> ()
return
}

// External (body-less) declarations resolved at run time.
// NOTE(review): presumably provided by the shared libraries that run.py passes
// to mlir-cpu-runner via -shared-libs - confirm.
// @rtclock: returns an f64 timestamp; @main samples it before and after the
// timed loop.
func private @rtclock() -> f64


// @print_memref_f32: takes an unranked f32 memref; @main passes the output
// buffer to it after casting to memref<*xf32>.
func private @print_memref_f32(%ptr : memref<*xf32>)
Loading

0 comments on commit 1958c19

Please sign in to comment.