Skip to content

Commit e508c45

Browse files
committed
C++/FastMathFun: Avoid SSE-AVX transition penalties
1 parent 0b98f3c commit e508c45

File tree

4 files changed

+6 -1 lines changed (6 additions, 1 deletion)

4 files changed

+6 -1 lines changed (6 additions, 1 deletion)

C++/FastMathFun/fma_dot.s

+1 -1
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ fma_dot:
45 | 45 | vaddps %ymm0,%ymm8,%ymm0
46 | 46 |
47 | 47 | /* TODO: horizontally sum ymm0 */
48 |    | -
   | 48 | + vzeroupper
49 | 49 | retq
50 | 50 |
51 | 51 | .data

C++/FastMathFun/fma_gemm48.s

+1
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ fma_gemm48:
68 | 68 | dec %rcx
69 | 69 | jnz loop_row
70 | 70 |
   | 71 | + vzeroupper
71 | 72 | pop %rbx
72 | 73 | retq
73 | 74 |

C++/FastMathFun/fma_gemm96.s

+1
Original file line number | Diff line number | Diff line change
@@ -186,6 +186,7 @@ fma_gemm96:
186 | 186 | dec %rcx
187 | 187 | jnz loop_row
188 | 188 |
    | 189 | + vzeroupper
189 | 190 | pop %rbx
190 | 191 | retq
191 | 192 |

C++/FastMathFun/gen.py

+3
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ def gen_gemm48():
28 | 28 | print(' dec %rcx')
29 | 29 | print(' jnz loop_row')
30 | 30 |
   | 31 | + print(' vzeroupper')
31 | 32 | print(' retq')
32 | 33 |
33 | 34 | print('.data')
@@ -69,6 +70,7 @@ def gen_gemm96():
69 | 70 | print(' dec %rcx')
70 | 71 | print(' jnz loop_row')
71 | 72 |
   | 73 | + print(' vzeroupper')
72 | 74 | print(' retq')
73 | 75 |
74 | 76 | print('.data')
@@ -129,6 +131,7 @@ def gen_transform_cols(working_set):
129 | 131 |
130 | 132 | if working_set != 'L1':
131 | 133 | print(' sfence')
    | 134 | + print(' vzeroupper')
132 | 135 | print(' pop %rbx')
133 | 136 | print(' retq')
134 | 137 |

0 commit comments

Comments
 (0)