Skip to content

Latest commit

 

History

History
1956 lines (1869 loc) · 62.7 KB

apple_m1_firestorm.md

File metadata and controls

1956 lines (1869 loc) · 62.7 KB

AArch64 latency / throughput benchmark report

Generated by https://github.com/ocxtal/insn_bench_aarch64 (commit: unknown).

CPU frequency estimation

Measuring CPU frequency, assuming the latency of a 64-bit addition is 1 cycle:

  • 3199.79 MHz
  • 3199.83 MHz
  • 3199.73 MHz

Scalar load

instruction latency throughput
ldr (imm; ofs = 0) 3.00 3.00
ldr (imm; ofs = 16) 3.00 3.00
ldr (imm; pre, ofs = 0) 3.00 3.00
ldr (imm; pre, ofs = 16) 3.00 -
ldr (imm; post, ofs = 0) 3.00 3.00
ldr (imm; post, ofs = 16) 3.00 -
ldr (ptr fwd.; imm; pre, ofs = 16) 1.00 -
ldr (ptr fwd.; imm; post, ofs = 16) 1.00 -
ldr (imm; ofs = 0; unaligned) 4.00 3.00
ldr (imm; ofs = 16; unaligned) 4.00 3.00
ldr (imm; ofs = 0; cross-cache) 4.01 3.00
ldr (imm; ofs = 0; cross-page) 31.00 0.03
ldr (reg) 4.00 3.00
ldr (reg; <<3) 4.00 3.00
ldr (literal) - 3.00
ldr (reg; unaligned) 4.00 3.00
ldr (reg; <<3; unaligned) 4.00 3.00
ldrb (imm; ofs = 0) 4.00 2.94
ldrb (imm; ofs = 16) 4.00 2.95
ldrb (reg) 3.00 3.00
ldrb (imm; ofs = 0; unaligned) 4.00 2.94
ldrb (imm; ofs = 16; unaligned) 4.01 2.95
ldrb (reg; unaligned) 4.00 3.00
ldrsb (imm; ofs = 0) 4.00 2.95
ldrsb (imm; ofs = 16) 4.00 2.93
ldrsb (reg) 4.00 3.00
ldrsb (imm; ofs = 0; unaligned) 4.00 2.94
ldrsb (imm; ofs = 16; unaligned) 4.00 2.93
ldrsb (reg; unaligned) 4.01 3.00
ldrh (imm; ofs = 0) 4.00 2.94
ldrh (imm; ofs = 16) 4.00 2.95
ldrh (reg) 3.00 3.00
ldrh (imm; ofs = 0; unaligned) 4.00 2.94
ldrh (imm; ofs = 16; unaligned) 4.00 2.93
ldrh (reg; unaligned) 4.00 3.00
ldrsh (imm; ofs = 0) 4.01 2.95
ldrsh (imm; ofs = 16) 4.00 2.94
ldrsh (reg) 4.00 3.00
ldrsh (imm; ofs = 0; unaligned) 4.00 2.94
ldrsh (imm; ofs = 16; unaligned) 4.00 2.94
ldrsh (reg; unaligned) 4.00 3.00
ldrsw (imm; ofs = 0) 4.00 2.95
ldrsw (imm; ofs = 16) 4.01 2.93
ldrsw (reg) 4.00 3.00
ldrsw (literal) - 3.00
ldrsw (imm; ofs = 0; unaligned) 4.00 2.93
ldrsw (imm; ofs = 16; unaligned) 4.00 2.94
ldrsw (reg; unaligned) 4.00 3.00
ldur (ofs = 0) 3.00 3.00
ldur (ofs = 16) 3.00 3.00
ldur (ofs = 0; unaligned) 4.00 3.00
ldur (ofs = 16; unaligned) 4.00 3.00
ldurb (ofs = 0) 4.00 2.94
ldurb (ofs = 16) 4.00 2.95
ldurb (ofs = 0; unaligned) 4.00 2.95
ldurb (ofs = 16; unaligned) 4.00 2.96
ldurh (ofs = 0) 4.00 2.94
ldurh (ofs = 16) 4.00 2.95
ldurh (ofs = 0; unaligned) 4.00 2.96
ldurh (ofs = 16; unaligned) 4.00 2.96
ldursb (ofs = 0) 4.00 2.94
ldursb (ofs = 16) 4.00 2.94
ldursb (ofs = 0; unaligned) 4.00 2.95
ldursb (ofs = 16; unaligned) 4.00 2.94
ldursh (ofs = 0) 4.00 2.96
ldursh (ofs = 16) 4.00 2.94
ldursh (ofs = 0; unaligned) 4.00 2.95
ldursh (ofs = 16; unaligned) 4.00 2.96
ldursw (ofs = 0) 4.00 2.96
ldursw (ofs = 16) 4.00 2.94
ldursw (ofs = 0; unaligned) 4.00 2.95
ldursw (ofs = 16; unaligned) 4.00 2.95
ldp (x; ofs = 0; 1st elem) 3.00 2.98
ldp (x; ofs = 0; 2nd elem) 4.00 2.98
ldp (w; ofs = 0; 1st elem) 4.00 2.43
ldp (w; ofs = 0; 2nd elem) 4.00 2.43
ldp (x; ofs = 0; 1st elem; unaligned) 4.00 2.93
ldp (x; ofs = 0; 2nd elem; unaligned) 4.00 2.88
ldp (w; ofs = 0; 1st elem; unaligned) 4.00 2.43
ldp (w; ofs = 0; 2nd elem; unaligned) 4.00 2.43
ldpsw (ofs = 0; 1st elem) 4.00 2.43
ldpsw (ofs = 0; 2nd elem) 4.00 2.43
ldpsw (ofs = 0; 1st elem; unaligned) 4.00 2.43
ldpsw (ofs = 0; 2nd elem; unaligned) 4.00 2.43
ldnp - 0.89

Scalar store (throughput)

instruction latency throughput
str (imm; ofs = 0) - 2.00
str (imm; ofs = 16) - 2.00
str (imm; pre, ofs = 0) - 2.00
str (imm; pre, ofs = 16) - 2.00
str (imm; post, ofs = 0) - 2.00
str (imm; post, ofs = 16) - 2.00
str (imm; ofs = 0; unaligned) - 2.00
str (imm; ofs = 16; unaligned) - 2.00
str (imm; pre, ofs = 0; unaligned) - 2.00
str (imm; pre, ofs = 16; unaligned) - 2.00
str (imm; post, ofs = 0; unaligned) - 2.00
str (imm; post, ofs = 16; unaligned) - 2.00
str (imm; ofs = 0; cross-cache) - 2.00
str (imm; pre, ofs = 0; cross-cache) - 2.00
str (imm; post, ofs = 0; cross-cache) - 2.00
str (imm; ofs = 0; cross-page) - 0.03
str (imm; pre, ofs = 0; cross-page) - 0.03
str (imm; post, ofs = 0; cross-page) - 0.03
str (reg) - 2.00
str (reg; <<3) - 2.00
strb (reg) - 2.00
strh (reg) - 2.00
stur (imm; ofs = 0) - 2.00
sturb (imm; ofs = 0) - 2.00
sturh (imm; ofs = 0) - 2.00
stp (ofs = 0) - 2.00

Scalar store-to-load

instruction latency throughput
str -> ldr (ofs = 0 -> ofs = 0) 5.82 1.72
str -> ldur (ofs = 0 -> ofs = 1) 5.86 1.81
stur -> ldr (ofs = 1 -> ofs = 1) 5.85 1.81
stur -> ldur (ofs = 1 -> ofs = 1) 5.84 1.73
stur -> ldur (cross-cache -> aligned) 5.84 1.79
stur -> ldur (aligned -> cross-cache) 7.22 1.75
stur -> ldur (cross-cache -> cross-cache) 7.23 1.76
stp -> ldp (ofs = 0 -> ofs = 0) 6.93 1.17
stp -> ldp (ofs = 0 -> ofs = 0; swap) 6.97 1.17
stp -> ldp (ofs = 0 -> ofs = 8) 6.99 1.07
stp -> ldp (ofs = 8 -> ofs = 0) 7.00 1.07

Branch

instruction latency throughput
b (pc+4) 1.01 0.99
b (pc+8) 1.01 0.99
b (pc+4) // add (chain) 1.01 0.99
b (pc+4) // add x 2 (chain) 2.00 0.50
adr -> br (pc+4) 1.01 0.99
adr -> br (pc+4) // add (chain) 1.01 0.99
adr -> br (pc+4) // add x 2 (chain) 2.00 0.50
bl-ret 2.01 0.50
bl-ret // add (chain) 2.01 0.50
bl-ret // add x 2 (chain) 2.01 0.50
bl-ret // add x 3 (chain) 3.00 0.33
blr-ret 2.01 0.50
blr-ret // add (chain) 2.01 0.50
blr-ret // add x 2 (chain) 2.01 0.50
blr-ret // add x 3 (chain) 3.00 0.33
cbz (pc+4; taken) 1.01 0.99
cbz (pc+4; taken) // add (chain) 1.01 0.99
cbz (pc+4; taken) // add x 2 (chain) 2.00 0.50
cbz (pc+4; taken) // b (pc+4) - 0.50
cbz (pc+4; not taken) 0.50 1.99
cbz (pc+4; not taken) // add (chain) 1.00 1.00
cbz (pc+4; not taken) // add x 2 (chain) 2.00 0.50
cbz (pc+4; not taken) // b (pc+4) - 1.00
cbnz (pc+4; taken) 1.01 0.99
cbnz (pc+4; not taken) 0.50 1.99
tbz (pc+4; taken) 1.01 0.99
tbz (pc+4; not taken) 0.50 1.99
tbnz (pc+4; taken) 1.01 0.99
tbnz (pc+4; not taken) 0.50 1.99
adds -> b.eq (pc+4; taken) 1.01 0.99
adds -> b.eq (pc+4; not taken) 1.11 0.90
fcmp -> b.eq (pc+4; taken) 1.01 0.99
fcmp -> b.ne (pc+4; not taken) 1.00 1.00
and -> cbz (pc+4; full random) 6.98 0.14
and -> cbnz (pc+4; full random) 6.97 0.14
and -> cbz (pc+4; full random) // add (chain) 7.25 0.14
and -> cbnz (pc+4; full random) // add (chain) 7.24 0.14
and -> cbz (pc+4; full random) // add x 2 (chain) 7.48 0.13
and -> cbnz (pc+4; full random) // add x 2 (chain) 7.47 0.13
tbz (pc+4; full random) 6.88 0.15
tbnz (pc+4; full random) 6.87 0.15
tbz (pc+4; full random) // add (chain) 6.99 0.14
tbnz (pc+4; full random) // add (chain) 6.97 0.14
tbz (pc+4; full random) // add x 2 (chain) 7.27 0.14
tbnz (pc+4; full random) // add x 2 (chain) 7.25 0.14
tbz (pc+4; full random) 13.00 0.08
tbnz (pc+4; full random) 13.00 0.08
tbz (pc+4; full random) // add (chain) 12.75 0.08
tbnz (pc+4; full random) // add (chain) 12.76 0.08
tbz (pc+4; full random) // add x 2 (chain) 12.66 0.08
tbnz (pc+4; full random) // add x 2 (chain) 12.63 0.08

Scalar nop and move

instruction latency throughput
nop - 8.00
mov (x -> x) 0.13 8.00
mov (x -> x; chain) - 7.90
mov (v.b -> v.b) 1.75 8.00
mov (v.b -> v.b; chain) - 0.57
mov / movz (imm; 0x00) - 8.00
mov / movz (imm; 0x1ffc) - 7.65
mov / movz (imm; 0x1ffc<<16) - 7.65
mov (mask imm; 0x1ffffffffffc) - 7.64
mov / movn (imm; 0x1ffc) - 7.65
mov / movn (imm; 0x1ffc<<16) - 7.66
movk (0x00) - 6.00
movk (0x1ffc) - 6.00
movk (0x1ffc<<16) - 6.00
eor (reg; clearing idiom) 1.00 6.00
sub (reg; clearing idiom) 1.00 6.00
eor.b (clearing idiom) 2.00 4.00
sub.b (clearing idiom) 2.00 4.00

Scalar integer add, sub, and neg

instruction latency throughput
add (reg) 1.00 6.01
add (reg<<2) 2.00 3.00
add (reg<<17) 2.00 3.00
add (reg>>17) 2.00 3.00
add (reg>>17; signed) 2.00 3.00
add (imm) 1.00 6.00
add (imm<<12) 1.00 6.00
adds (reg) 1.00 3.00
adds (reg<<2) 2.00 1.50
adds (reg<<17) 2.00 1.50
adds (reg>>17) 2.00 1.50
adds (imm) 1.00 3.00
adds (imm<<12) 1.00 3.00
adc 1.00 3.00
adcs 1.00 2.63
sub (reg) 1.00 6.00
sub (reg<<2) 2.00 3.00
sub (imm) 1.00 6.01
sub (imm<<12) 1.00 6.00
subs (reg) 1.00 3.00
subs (reg<<2) 2.00 1.50
subs (imm) 1.00 3.00
subs (imm<<12) 1.00 3.00
sbc 1.00 3.00
sbcs 1.00 2.63
sub - 6.00
adr - 2.00
adrp - 2.00
neg (reg) 1.00 6.01
neg (reg<<2) 2.00 3.00
negs (reg) 1.00 3.00
negs (reg<<2) 2.00 1.50
ngc 1.00 3.00
ngcs 1.00 2.63

Scalar integer multiply and multiply-accumulate

instruction latency throughput
mul 3.00 2.00
mneg 3.00 2.00
madd 3.00 1.00
msub 3.00 1.00
smull 3.00 2.00
smnegl 3.00 2.00
smaddl 3.00 1.00
smsubl 3.00 1.00
smulh 3.00 2.00
umull 3.00 2.00
umnegl 3.00 2.00
umaddl 3.00 1.00
umsubl 3.00 1.00
umulh 3.00 2.00

Scalar integer divide

instruction latency throughput
sdiv 7.00 0.50
udiv 7.00 0.50

Scalar integer sign extend

instruction latency throughput
sxtb 1.00 6.00
sxth 1.00 6.01
sxtw 1.00 6.00
uxtb 1.00 6.00
uxth 1.00 6.00

Scalar shift and bit manipulation

instruction latency throughput
lsl (reg) / lslv 1.00 6.00
lsl (imm) 1.00 6.00
lsr (reg) / lsrv 1.00 6.00
lsr (imm) 1.00 6.00
asr (reg) / asrv 1.00 6.00
asr (imm) 1.00 6.00
ror (imm) 1.00 6.00
ror (reg) / rorv 1.00 6.01
extr (imm; >>1) 1.00 6.00
extr (imm; >>17) 1.00 6.00
bfc 1.00 1.00
bfi 1.00 1.00
bfm 1.00 1.00
bfxil 1.00 1.00
sbfm 1.00 6.00
sbfx 1.00 6.00
sbfiz 1.00 6.00
ubfm 1.00 6.00
ubfx 1.00 6.01
ubfiz 1.00 6.00
bic (reg) 1.00 6.00
bic (reg<<2) 2.00 3.00
bics (reg) 1.00 3.00
bics (reg<<2) 2.00 1.50
rbit 1.00 6.00
rev (rev16) 1.00 6.00
rev (rev32) 1.00 6.00
rev (rev64) 1.00 6.00
clz 1.00 6.00
cls 1.00 6.00

Scalar bitwise logic

instruction latency throughput
and (reg) 1.00 6.00
and (reg<<2) 2.00 3.00
and (reg<<17) 2.00 3.00
and (reg>>17) 2.00 3.00
and (reg>>17; signed) 2.00 3.00
and (reg>>17; rotate) 2.00 3.00
and (mask imm) 1.00 6.00
ands (reg) 1.00 3.00
ands (reg<<2) 2.00 1.50
ands (reg<<17) 2.00 1.50
ands (reg>>17) 2.00 1.50
ands (reg>>17; signed) 2.00 1.50
ands (reg>>17; rotate) 2.00 1.50
ands (mask imm) 1.00 3.00
orr (reg) 1.00 5.99
orr (reg<<2) 2.00 3.00
orr (mask imm) 1.00 6.00
orn (reg) 1.00 6.01
orn (reg<<2) 2.00 3.00
eor (reg) 1.00 6.00
eor (reg<<2) 2.00 3.00
eon (reg) 1.00 6.00
eon (reg<<2) 2.00 3.00
mvn (reg) 1.00 6.00
mvn (reg<<2) 2.00 3.00

Scalar integer compare and flag manipulation

instruction latency throughput
ccmn (reg; eq) 0.99 2.66
ccmn (reg; lt) 1.00 2.66
ccmn (imm; eq) 0.99 2.66
ccmn (imm; lt) 0.99 2.66
ccmp (reg; eq) 0.99 2.66
ccmp (reg; lt) 1.00 2.64
ccmp (imm; eq) 1.00 2.65
ccmp (imm; lt) 0.99 2.66
tst (reg) 0.99 2.67
tst (reg<<2) 1.99 1.32
tst (imm) 0.99 2.67
rmif 0.99 2.66
setf8 0.99 2.64
setf16 1.00 2.65
cfinv 1.00 2.65

Scalar conditional arithmetic

instruction latency throughput
csinc (eq) 1.00 3.00
csinc (lt) 1.00 3.00
cinc (eq) 1.00 3.00
cinc (lt) 1.00 3.00
csinv (eq) 1.00 3.00
csinv (lt) 1.00 3.00
cset (eq) - 3.00
cset (lt) - 3.00
csetm (eq) - 3.00
csetm (lt) - 3.00
cinv (eq) 1.00 3.00
cinv (lt) 1.00 3.00
csneg (eq) 1.00 3.00
csneg (lt) 1.00 3.00
cneg (eq) 1.00 3.00
cneg (lt) 1.00 3.00

Hash and crypto

instruction latency throughput
crc32x 3.00 1.00
crc32cx 3.00 1.00
sha1c 4.99 0.25
sha1h 2.00 1.00
sha1m 5.01 0.25
sha1p 5.00 0.25
sha1su0 2.00 1.00
sha1su1 2.00 1.00
sha256h 5.00 0.50
sha256h2 5.00 0.50
sha256su0 2.00 1.00
sha256su1 3.00 1.00
sha512h 3.00 0.50
sha512h2 3.00 0.50
sha512su0 2.00 1.00
sha512su1 2.00 1.00
aese 3.00 4.01
aesd 3.00 4.01
aesmc 2.00 4.01
aesimc 2.00 4.01
sm3partw1 n/a n/a
sm3partw2 n/a n/a
sm3ss1 n/a n/a
sm3tt1a ([0]) n/a n/a
sm3tt1a ([3]) n/a n/a
sm3tt1b ([0]) n/a n/a
sm3tt1b ([3]) n/a n/a
sm3tt2a ([0]) n/a n/a
sm3tt2a ([3]) n/a n/a
sm3tt2b ([0]) n/a n/a
sm3tt2b ([3]) n/a n/a
sm4e n/a n/a
sm4ekey n/a n/a

Atomic memory operation

instruction latency throughput
casal 21.06 0.05
caspal 18.02 0.06
casalb 21.05 0.05
casalh 21.06 0.05
ldaddal 21.99 0.05
ldaddalb 22.00 0.05
ldaddalh 22.00 0.05
ldclral 21.99 0.05
ldclralb 22.00 0.05
ldclralh 22.00 0.05
ldsetal 21.99 0.05
ldsetalb 21.99 0.05
ldsetalh 22.00 0.05
ldeoral 21.99 0.05
ldeoralb 21.99 0.05
ldeoralh 22.00 0.05
ldumaxal 20.98 0.05
ldumaxalb 21.00 0.05
ldumaxalh 20.99 0.05
ldsmaxal 20.99 0.05
ldsmaxalb 20.99 0.05
ldsmaxalh 21.00 0.05
lduminal 20.99 0.05
lduminalb 21.00 0.05
lduminalh 20.99 0.05
ldsminal 21.00 0.05
ldsminalb 20.99 0.05
ldsminalh 20.99 0.05

Vector load (might be incorrect due to a wrong d -> x latency estimation)

instruction latency throughput
ldr.q (imm; ofs = 0) 5.00 3.00
ldr.q (imm; ofs = 16) 5.00 3.00
ldr.q (imm; ofs = 0; unaligned) 5.00 3.00
ldr.q (imm; ofs = 16; unaligned) 5.00 3.00
ldur.q (imm; ofs = 0) 5.00 3.00
ldur.q (imm; ofs = 16) 5.00 3.00
ldur.q (imm; ofs = 0; unaligned) 5.00 3.00
ldur.q (imm; ofs = 16; unaligned) 5.00 3.00
ldp.q (ofs = 0; 1st elem) 5.00 1.50
ldp.q (ofs = 0; 2nd elem) 5.08 1.50
ldp.q (ofs = 0; unaligned) 5.08 1.50
ldnp.q - 1.50
ld1.b (multi; 1 reg) 5.00 3.00
ld1.b (multi; 2 reg) 5.08 1.50
ld1.b (multi; 3 reg) 5.56 1.00
ld1.b (multi; 4 reg) 5.99 0.75
ld1.h (multi; 1 reg) 5.00 3.00
ld1.h (multi; 2 reg) 5.08 1.50
ld1.h (multi; 3 reg) 5.56 1.00
ld1.h (multi; 4 reg) 6.00 0.75
ld1.s (multi; 1 reg) 5.00 3.00
ld1.s (multi; 2 reg) 5.08 1.50
ld1.s (multi; 3 reg) 5.57 1.00
ld1.s (multi; 4 reg) 6.00 0.75
ld1.d (multi; 1 reg) 4.99 3.00
ld1.d (multi; 2 reg) 5.09 1.50
ld1.d (multi; 3 reg) 5.56 1.00
ld1.d (multi; 4 reg) 5.99 0.75
ld2.b (multi) 7.00 1.00
ld3.b (multi) 7.46 1.00
ld4.b (multi) 10.19 0.50
ld2.h (multi) 7.00 1.00
ld3.h (multi) 7.47 1.00
ld4.h (multi) 10.19 0.50
ld2.s (multi) 7.00 1.00
ld3.s (multi) 7.46 1.00
ld4.s (multi) 10.19 0.50
ld2.d (multi) 7.01 1.00
ld3.d (multi) 7.47 1.00
ld4.d (multi) 10.19 0.50
ld1.b (single; [15]) 7.00 3.00
ld2.b (single; [15]) 6.99 1.99
ld3.b (single; [15]) 7.47 1.00
ld4.b (single; [15]) 7.30 0.96
ld1.h (single; [7]) 6.99 3.00
ld2.h (single; [7]) 7.00 1.99
ld3.h (single; [7]) 7.48 1.00
ld4.h (single; [7]) 7.32 0.98
ld1.s (single; [3]) 7.00 3.00
ld2.s (single; [3]) 7.00 2.00
ld3.s (single; [3]) 7.47 1.00
ld4.s (single; [3]) 7.30 0.97
ld1.d (single; [1]) 7.00 3.00
ld2.d (single; [1]) 6.99 2.00
ld3.d (single; [1]) 7.47 1.00
ld4.d (single; [1]) 7.80 1.00
ld1r.b 7.00 3.00
ld2r.b 7.00 2.00
ld3r.b 7.47 1.00
ld4r.b 7.30 1.00
ld1r.h 7.00 3.00
ld2r.h 6.99 2.00
ld3r.h 7.47 1.00
ld4r.h 7.31 1.00
ld1r.s 7.00 3.00
ld2r.s 7.00 2.00
ld3r.s 7.47 1.00
ld4r.s 7.30 1.00
ld1r.d 6.99 3.00
ld2r.d 6.99 2.00
ld3r.d 7.47 1.00
ld4r.d 7.79 1.00

Vector store (throughput)

instruction latency throughput
str.q (imm; ofs = 0) - 2.00
str.q (imm; ofs = 16) - 2.00
str.q (imm; pre, ofs = 0) - 2.00
str.q (imm; pre, ofs = 16) - 2.00
str.q (imm; post, ofs = 0) - 2.00
str.q (imm; post, ofs = 16) - 2.00
str.q (imm; ofs = 0; unaligned) - 2.00
str.q (imm; ofs = 16; unaligned) - 2.00
str.q (imm; pre, ofs = 0; unaligned) - 2.00
str.q (imm; pre, ofs = 16; unaligned) - 2.00
str.q (imm; post, ofs = 0; unaligned) - 2.00
str.q (imm; post, ofs = 16; unaligned) - 2.00
stur.q (imm; ofs = 0) - 2.00
stp.q (ofs = 0) - 1.00
st1.b (multi) - 2.00
st2.b (multi) - 1.00
st3.b (multi) - 0.67
st4.b (multi) - 0.47
st1.h (multi) - 2.00
st2.h (multi) - 1.00
st3.h (multi) - 0.67
st4.h (multi) - 0.47
st1.s (multi) - 2.00
st2.s (multi) - 1.00
st3.s (multi) - 0.67
st4.s (multi) - 0.47
st1.d (multi) - 2.00
st2.d (multi) - 1.00
st3.d (multi) - 0.67
st4.d (multi) - 0.47
st1.b (single; [15]) - 2.00
st2.b (single; [15]) - 2.00
st3.b (single; [15]) - 2.00
st4.b (single; [15]) - 1.87
st1.h (single; [7]) - 2.00
st2.h (single; [7]) - 2.00
st3.h (single; [7]) - 2.00
st4.h (single; [7]) - 1.86
st1.s (single; [3]) - 2.00
st2.s (single; [3]) - 2.00
st3.s (single; [3]) - 2.00
st4.s (single; [3]) - 1.86
st1.d (single; [1]) - 2.00
st2.d (single; [1]) - 2.00
st3.d (single; [1]) - 1.00
st4.d (single; [1]) - 1.00

Vector store-to-load

instruction latency throughput
str.q -> ldr.q (ofs = 0 -> ofs = 0) 6.25 1.57
str.q -> ldr.q (ofs = 0 -> ofs = 1) 6.27 1.58
str.q -> ldr.q (ofs = 1 -> ofs = 0) 6.27 1.61
str.q -> ldr.q (ofs = 1 -> ofs = 1) 6.26 1.59
stp.s -> ldr.q (ofs = 0) 13.32 0.98
stp.d -> ldr.q (ofs = 0) 13.48 0.99
stp.q -> ldr.q (ofs = 0 -> 1st reg) 6.50 0.74
stp.q -> ldr.q (ofs = 16 -> 2nd reg) 6.49 0.64
stp.q -> ldr.q (false dep. ofs = 0 -> 2nd reg) 1.34 0.43
stp.q -> ldr.q (false dep. ofs = 16 -> 1st reg) 1.34 0.43
st1.b (single) -> ldr.q (ofs = 0) 13.31 0.99
st2.b (single) -> ldr.q (ofs = 0) 13.29 0.76
st3.b (single) -> ldr.q (ofs = 0) 13.31 0.54
st4.b (single) -> ldr.q (ofs = 0) 14.09 0.40
st1.h (single) -> ldr.q (ofs = 0) 13.31 0.99
st2.h (single) -> ldr.q (ofs = 0) 13.36 0.76
st3.h (single) -> ldr.q (ofs = 0) 13.26 0.54
st4.h (single) -> ldr.q (ofs = 0) 14.12 0.40
st1.s (single) -> ldr.q (ofs = 0) 13.28 0.99
st2.s (single) -> ldr.q (ofs = 0) 13.32 0.76
st3.s (single) -> ldr.q (ofs = 0) 13.26 0.54
st4.s (single) -> ldr.q (ofs = 0) 14.09 0.45
st1.d (single) -> ldr.q (ofs = 0) 13.37 0.99
st2.d (single) -> ldr.q (ofs = 0) 13.48 0.80
st3.d (single) -> ldr.q (ofs = 0) 12.61 0.51
st4.d (single) -> ldr.q (ofs = 0) 11.86 0.50
st1.b (multi; 1 reg) -> ldr.q (1st reg) 6.25 1.60
st1.b (multi; 2 regs) -> ldr.q (2nd reg) 6.49 0.73
st1.b (multi; 3 regs) -> ldr.q (3rd reg) 5.90 0.45
st1.b (multi; 4 regs) -> ldr.q (4th reg) 5.66 0.38
st1.b (multi; 2 regs) -> ldr.q (false dep. ofs = 16 -> 1st reg) 1.40 0.73
st1.b (multi; 3 regs) -> ldr.q (false dep. ofs = 32 -> 1st reg) 2.85 0.33
st1.b (multi; 4 regs) -> ldr.q (false dep. ofs = 48 -> 1st reg) 2.34 0.33
st1.b (multi; 2 regs) -> ldr.q (false dep. ofs = 0 -> 2nd reg) 1.39 0.73
st1.b (multi; 3 regs) -> ldr.q (false dep. ofs = 0 -> 3rd reg) 2.97 0.33
st1.b (multi; 4 regs) -> ldr.q (false dep. ofs = 0 -> 4th reg) 2.34 0.33
st1.b (multi; 2 regs) -> ldr.q (false dep. ofs = 0 -> 2nd reg; unaligned) 1.34 0.75
st1.b (multi; 3 regs) -> ldr.q (false dep. ofs = 0 -> 3rd reg; unaligned) 2.69 0.30
st1.b (multi; 4 regs) -> ldr.q (false dep. ofs = 0 -> 4th reg; unaligned) 2.49 0.37
st1.b (multi; 2 regs) -> ldr.q (false dep. ofs = 0 -> 2nd reg; unaligned; cross-cache) 5.16 0.15
st1.b (multi; 3 regs) -> ldr.q (false dep. ofs = 0 -> 3rd reg; unaligned; cross-cache) 3.34 0.32
st1.b (multi; 4 regs) -> ldr.q (false dep. ofs = 0 -> 4th reg; unaligned; cross-cache) 8.66 0.09
st2.b (multi) -> ldr.q (ofs = 16) 12.72 0.62
st3.b (multi) -> ldr.q (ofs = 32) 13.11 0.49
st4.b (multi) -> ldr.q (ofs = 48) 14.63 0.34
st2.h (multi) -> ldr.q (ofs = 16) 12.72 0.62
st3.h (multi) -> ldr.q (ofs = 32) 13.23 0.49
st4.h (multi) -> ldr.q (ofs = 48) 14.59 0.34
st2.s (multi) -> ldr.q (ofs = 16) 12.75 0.62
st3.s (multi) -> ldr.q (ofs = 32) 13.26 0.49
st4.s (multi) -> ldr.q (ofs = 48) 14.54 0.34
st2.d (multi) -> ldr.q (ofs = 16) 12.81 0.62
st3.d (multi) -> ldr.q (ofs = 32) 13.08 0.49
st4.d (multi) -> ldr.q (ofs = 48) 14.57 0.34

Vector element move

instruction latency throughput
movi.b (0x00) - 8.01
movi.h (0x00) - 8.00
movi.h (0x00<<8) - 8.00
movi.s (0x00) - 8.00
movi.s (0x00<<8) - 8.00
movi.b (0xff) - 4.00
movi.h (0xff) - 4.00
movi.h (0xff<<8) - 4.01
movi.s (0xff) - 4.01
movi.s (0xff<<8) - 4.00
mvni.h (0x00) - 4.00
mvni.s (0x00) - 4.00
mvni.h (0x11) - 4.00
mvni.s (0x11) - 4.00
mvni.h (0x11<<8) - 4.00
mvni.s (0x11<<8) - 4.01
mov.s (v.s[0] <-> w) 12.00 -
mov.d (v.d[0] <-> x) 12.00 -
mov.s (v.s[3] <-> w) 12.00 -
mov.d (v.d[1] <-> x) 11.99 -
smov.b (v.b[0] <-> x) 12.00 -
smov.h (v.h[0] <-> x) 12.00 -
smov.s (v.s[0] <-> x) 12.00 -
umov.b (v.b[0] <-> w) 11.99 -
umov.h (v.h[0] <-> w) 12.00 -
umov.s (v.s[0] <-> w) 12.00 -
umov.d (v.d[0] <-> x) 12.00 -
fmov.d (v.d[0] <-> x) 12.00 -
dup.b (vec; lane = 0) 2.00 4.01
dup.b (vec; lane = 15) 2.00 4.01
dup.h (vec; lane = 7) 2.00 4.00
dup.s (vec; lane = 3) 2.00 4.00
dup.d (vec; lane = 1) 2.00 4.00
dup.b (elem) 6.00 3.00
dup.h (elem) 6.00 3.00
dup.s (elem) 6.00 3.00
dup.d (elem) 6.00 3.00
xtn.h 2.00 4.01
xtn.s 2.00 4.00
xtn.d 2.00 4.00
sqxtn.h (scl) 3.00 4.00
sqxtn.s (scl) 3.00 4.00
sqxtn.d (scl) 3.00 4.00
sqxtn.h (vec) 3.00 4.00
sqxtn.s (vec) 3.00 4.00
sqxtn.d (vec) 3.00 4.00
sqxtn2.h (vec) 3.00 4.00
sqxtn2.s (vec) 3.00 4.00
sqxtn2.d (vec) 3.00 4.01
uqxtn.h (scl) 3.00 4.00
uqxtn.s (scl) 3.00 4.00
uqxtn.d (scl) 3.00 4.01
uqxtn.h (vec) 3.00 4.00
uqxtn.s (vec) 3.00 4.00
uqxtn.d (vec) 3.00 4.00
uqxtn2.h (vec) 3.00 4.00
uqxtn2.s (vec) 3.00 4.00
uqxtn2.d (vec) 3.00 4.00
sqxtun.h (scl) 3.00 4.00
sqxtun.s (scl) 3.00 4.01
sqxtun.d (scl) 3.00 4.00
sqxtun.h (vec) 3.00 4.00
sqxtun.s (vec) 3.00 4.00
sqxtun.d (vec) 3.00 4.00
sqxtun2.h (vec) 3.00 4.01
sqxtun2.s (vec) 3.00 4.00
sqxtun2.d (vec) 3.00 4.00
sxtl.b (vec) 2.00 4.00
sxtl.h (vec) 2.00 4.00
sxtl.s (vec) 2.00 4.00
sxtl2.b (vec) 2.00 4.00
sxtl2.h (vec) 2.00 4.00
sxtl2.s (vec) 2.00 4.00
uxtl.b (vec) 2.00 4.00
uxtl.h (vec) 2.00 4.00
uxtl.s (vec) 2.00 4.00
uxtl2.b (vec) 2.00 4.00
uxtl2.h (vec) 2.00 4.00
uxtl2.s (vec) 2.00 4.00

Vector integer add, sub, abs, and neg

instruction latency throughput
add.b 2.00 4.00
add.h 2.00 4.00
add.s 2.00 4.00
add.d 2.00 4.00
sqadd.b 3.00 4.00
sqadd.h 3.00 4.00
sqadd.s 3.00 4.00
sqadd.d 3.00 4.00
uqadd.b 3.00 4.00
uqadd.h 3.00 4.01
uqadd.s 3.00 4.00
uqadd.d 3.00 4.00
suqadd.b (scl) 3.00 4.00
suqadd.h (scl) 3.00 4.00
suqadd.s (scl) 3.00 4.00
suqadd.d (scl) 3.00 4.01
suqadd.b (vec) 3.00 4.00
suqadd.h (vec) 3.00 4.01
suqadd.s (vec) 3.00 4.01
suqadd.d (vec) 3.00 4.00
usqadd.b (scl) 3.00 4.01
usqadd.h (scl) 3.00 4.00
usqadd.s (scl) 3.00 4.00
usqadd.d (scl) 3.00 4.00
usqadd.b (vec) 3.00 4.00
usqadd.h (vec) 3.00 4.00
usqadd.s (vec) 3.00 4.01
usqadd.d (vec) 3.00 4.00
sub.b 2.00 4.01
sub.h 2.00 4.00
sub.s 2.00 4.00
sub.d 2.00 4.01
sqsub.b 3.00 4.00
sqsub.h 3.00 4.01
sqsub.s 3.00 4.00
sqsub.d 3.00 4.00
uqsub.b 3.00 4.00
uqsub.h 3.00 4.00
uqsub.s 3.00 4.01
uqsub.d 3.00 4.01
abs.b 3.00 4.00
abs.h 3.00 4.00
abs.s 3.00 4.00
abs.d 3.00 4.00
sqabs.b 3.00 4.00
sqabs.h 3.00 4.00
sqabs.s 3.00 4.01
sqabs.d 3.00 4.00
neg.b 2.00 4.01
neg.h 2.00 4.00
neg.s 2.00 4.00
neg.d 2.00 4.01
sqneg.b 3.00 4.00
sqneg.h 3.00 4.00
sqneg.s 3.00 4.00
sqneg.d 3.00 4.00

Vector integer add and sub (widening, narrowing, and horizontal)

instruction latency throughput
saddl.b 2.00 4.00
saddl.h 2.00 4.00
saddl.s 2.00 4.00
saddl2.b 2.00 4.00
saddl2.h 2.00 4.00
saddl2.s 2.00 4.00
ssubl.b 2.00 4.00
ssubl.h 2.00 4.00
ssubl.s 2.00 4.00
ssubl2.b 2.00 4.00
ssubl2.h 2.00 4.00
ssubl2.s 2.00 4.00
usubl.b 2.00 4.00
usubl.h 2.00 4.00
usubl.s 2.00 4.00
usubl2.b 2.00 4.00
usubl2.h 2.00 4.00
usubl2.s 2.00 4.00
saddlp.b 2.00 4.00
saddlp.h 2.00 4.00
saddlp.s 2.00 4.00
saddlv.b 3.00 4.00
saddlv.h 3.00 4.01
saddlv.s 3.00 4.00
saddw.h 2.00 4.00
saddw.s 2.00 4.01
saddw.d 2.00 4.00
saddw2.h 2.00 4.01
saddw2.s 2.00 4.00
saddw2.d 2.00 4.00
uaddw.h 2.00 4.00
uaddw.s 2.00 4.00
uaddw.d 2.00 4.00
uaddw2.h 2.00 4.00
uaddw2.s 2.00 4.00
uaddw2.d 2.00 4.00
ssubw.h 2.00 4.00
ssubw.s 2.00 4.01
ssubw.d 2.00 4.00
ssubw2.h 2.00 4.00
ssubw2.s 2.00 4.00
ssubw2.d 2.00 4.00
usubw.h 2.00 4.00
usubw.s 2.00 4.00
usubw.d 2.00 4.00
usubw2.h 2.00 4.00
usubw2.s 2.00 4.01
usubw2.d 2.00 4.00
addhn.h 3.00 4.01
addhn.s 3.00 4.01
addhn.d 3.00 4.00
addhn2.h 3.00 4.00
addhn2.s 3.00 4.00
addhn2.d 3.00 4.01
subhn.h 3.00 4.00
subhn.s 3.00 4.00
subhn.d 3.00 4.00
subhn2.h 3.00 4.00
subhn2.s 3.00 4.00
subhn2.d 3.00 4.00
raddhn.h 3.00 4.00
raddhn.s 3.00 4.00
raddhn.d 3.00 4.00
raddhn2.h 3.00 4.00
raddhn2.s 3.00 4.00
raddhn2.d 3.00 4.00
rsubhn.h 3.00 4.00
rsubhn.s 3.00 4.01
rsubhn.d 3.00 4.00
rsubhn2.h 3.00 4.00
rsubhn2.s 3.00 4.00
rsubhn2.d 3.00 4.00
shadd.b 2.00 4.00
shadd.h 2.00 4.00
shadd.s 2.00 4.00
shsub.b 2.00 4.00
shsub.h 2.00 4.00
shsub.s 2.00 4.00
uhadd.b 2.00 4.01
uhadd.h 2.00 4.00
uhadd.s 2.00 4.00
uhsub.b 2.00 4.00
uhsub.h 2.00 4.00
uhsub.s 2.00 4.00
srhadd.b 2.00 4.00
srhadd.h 2.00 4.00
srhadd.s 2.00 4.00
urhadd.b 2.00 4.00
urhadd.h 2.00 4.00
urhadd.s 2.00 4.00
addp.b 2.00 4.00
addp.h 2.00 4.00
addp.s 2.00 4.00
addp.d 2.00 4.00
addv.b 3.00 4.00
addv.h 3.00 4.00
addv.s 3.00 4.00

Vector integer max and min

instruction latency throughput
smax.b 2.00 4.00
smax.h 2.00 4.00
smax.s 2.00 4.00
smin.b 2.00 4.00
smin.h 2.00 4.00
smin.s 2.00 4.01
smaxp.b 2.00 4.00
smaxp.h 2.00 4.00
smaxp.s 2.00 4.00
sminp.b 2.00 4.00
sminp.h 2.00 4.00
sminp.s 2.00 4.00
smaxv.b 3.00 4.00
smaxv.h 3.00 4.01
smaxv.s 3.00 4.01
sminv.b 3.00 4.00
sminv.h 3.00 4.00
sminv.s 3.00 4.00
umax.b 2.00 4.00
umax.h 2.00 4.00
umax.s 2.00 4.00
umin.b 2.00 4.00
umin.h 2.00 4.00
umin.s 2.00 4.00
umaxp.b 2.00 4.00
umaxp.h 2.00 4.00
umaxp.s 2.00 4.00
uminp.b 2.00 4.01
uminp.h 2.00 4.00
uminp.s 2.00 4.00
umaxv.b 3.00 4.00
umaxv.h 3.00 4.00
umaxv.s 3.00 4.00
uminv.b 3.00 4.00
uminv.h 3.00 4.00
uminv.s 3.00 4.00

Vector integer absolute difference

instruction latency throughput
sabd.b 3.00 4.00
sabd.h 3.00 4.00
sabd.s 3.00 4.00
uabd.b 3.00 4.00
uabd.h 3.00 4.00
uabd.s 3.00 4.00
sabdl.b 3.00 4.00
sabdl.h 3.00 4.00
sabdl.s 3.00 4.00
sabdl2.b 3.00 4.01
sabdl2.h 3.00 4.00
sabdl2.s 3.00 4.00
uabdl.b 3.00 4.01
uabdl.h 3.00 4.00
uabdl.s 3.00 4.00
uabdl2.b 3.00 4.00
uabdl2.h 3.00 4.00
uabdl2.s 3.00 4.01

Vector integer multiply

instruction latency throughput
pmul.b 3.00 4.00
pmull.b 3.00 4.00
pmull.d 3.00 4.00
pmull2.d 6.00 2.00
pmull2.d 5.00 2.00
mul.b (vec) 3.00 4.01
mul.h (vec) 3.00 4.00
mul.s (vec) 3.00 4.00
mul.h (elem; [0]) 3.00 4.00
mul.h (elem; [7]) 3.00 4.00
mul.s (elem; [0]) 3.00 4.00
mul.s (elem; [3]) 3.00 4.01
smull.b (vec) 3.00 4.00
smull.h (vec) 3.00 4.01
smull.s (vec) 3.00 4.00
smull2.b (vec) 3.00 4.00
smull2.h (vec) 3.00 4.00
smull2.s (vec) 3.00 4.00
smull.h (elem; [0]) 3.00 4.01
smull.h (elem; [7]) 3.00 4.00
smull.s (elem; [0]) 3.00 4.00
smull.s (elem; [3]) 3.00 4.00
smull2.h (elem; [0]) 3.00 4.01
smull2.h (elem; [7]) 3.00 4.00
smull2.s (elem; [0]) 3.00 4.00
smull2.s (elem; [3]) 3.00 4.00
umull.b (vec) 3.00 4.00
umull.h (vec) 3.00 4.00
umull.s (vec) 3.00 4.00
umull2.b (vec) 3.00 4.00
umull2.h (vec) 3.00 4.00
umull2.s (vec) 3.00 4.00
umull.h (elem; [0]) 3.00 4.00
umull.h (elem; [7]) 3.00 4.00
umull.s (elem; [0]) 3.00 4.00
umull.s (elem; [3]) 3.00 4.00
umull2.h (elem; [0]) 3.00 4.00
umull2.h (elem; [7]) 3.00 4.01
umull2.s (elem; [0]) 3.00 4.00
umull2.s (elem; [3]) 3.00 4.00
sqdmull.h (vec) 3.00 4.01
sqdmull.s (vec) 3.00 4.00
sqdmull2.h (vec) 3.00 4.00
sqdmull2.s (vec) 3.00 4.00
sqdmull.h (elem; v.h[0]) 3.00 4.00
sqdmull.h (elem; v.h[7]) 3.01 4.00
sqdmull.s (elem; v.s[0]) 3.00 4.00
sqdmull.s (elem; v.s[3]) 3.00 4.00
sqdmull2.h (elem; v.h[0]) 3.00 4.00
sqdmull2.h (elem; v.h[7]) 3.00 4.00
sqdmull2.s (elem; v.s[0]) 3.00 4.00
sqdmull2.s (elem; v.s[3]) 3.00 4.00
sqdmulh.h (vec) 3.00 4.00
sqdmulh.s (vec) 3.00 4.00
sqdmulh.h (elem; v.h[0]) 3.00 4.00
sqdmulh.h (elem; v.h[7]) 3.00 4.00
sqdmulh.s (elem; v.s[0]) 3.00 4.00
sqdmulh.s (elem; v.s[3]) 3.00 4.00
sqrdmulh.h (vec) 3.00 4.01
sqrdmulh.s (vec) 3.00 4.00
sqrdmulh.h (elem; v.h[0]) 3.00 4.00
sqrdmulh.h (elem; v.h[7]) 3.00 4.00
sqrdmulh.s (elem; v.s[0]) 3.00 4.00
sqrdmulh.s (elem; v.s[3]) 3.00 4.00
pmul.b 3.00 4.01
pmull.b 3.00 4.00
pmull.d 3.00 4.00
pmull2.b 3.00 4.00
pmull2.d 3.00 4.00

Vector integer multiply-accumulate

instruction latency throughput
mla.b (vec) 3.00 4.00
mla.h (vec) 3.00 4.00
mla.s (vec) 3.00 4.00
mla.b (vec; acc. fwd.) 3.00 -
mla.h (vec; acc. fwd.) 3.00 -
mla.s (vec; acc. fwd.) 3.00 -
mla.h (elem; [0]) 3.00 4.00
mla.h (elem; [7]) 3.00 4.00
mla.s (elem; [0]) 3.00 4.00
mla.s (elem; [3]) 3.00 4.00
smlal.h (vec) 3.00 4.00
smlal.s (vec) 3.00 4.01
smlal2.h (vec) 3.00 4.00
smlal2.s (vec) 3.00 4.00
smlal.h (vec; acc. fwd.) 3.00 -
smlal.s (vec; acc. fwd.) 3.00 -
smlal2.h (vec; acc. fwd.) 3.00 -
smlal2.s (vec; acc. fwd.) 3.00 -
smlal.h (elem; v.h[0]) 3.00 4.00
smlal.h (elem; v.h[7]) 3.00 4.00
smlal.s (elem; v.s[0]) 3.00 4.00
smlal.s (elem; v.s[3]) 3.00 4.01
smlal2.h (elem; v.h[0]) 3.00 4.00
smlal2.h (elem; v.h[7]) 3.00 4.00
smlal2.s (elem; v.s[0]) 3.00 4.00
smlal2.s (elem; v.s[3]) 3.00 4.00
umlal.h (vec) 3.00 4.00
umlal.s (vec) 3.00 4.00
umlal2.h (vec) 3.00 4.01
umlal2.s (vec) 3.00 4.00
umlal.h (vec; acc. fwd.) 3.00 -
umlal.s (vec; acc. fwd.) 3.00 -
umlal2.h (vec; acc. fwd.) 3.00 -
umlal2.s (vec; acc. fwd.) 3.00 -
umlal.h (elem; v.h[0]) 3.00 4.00
umlal.h (elem; v.h[7]) 3.00 4.00
umlal.s (elem; v.s[0]) 3.00 4.00
umlal.s (elem; v.s[3]) 3.00 4.01
umlal2.h (elem; v.h[0]) 3.00 4.00
umlal2.h (elem; v.h[7]) 3.00 4.00
umlal2.s (elem; v.s[0]) 3.00 4.00
umlal2.s (elem; v.s[3]) 3.00 4.00
sqdmlal.h (vec) 3.00 4.00
sqdmlal.s (vec) 3.00 4.00
sqdmlal2.h (vec) 3.00 4.00
sqdmlal2.s (vec) 3.00 4.00
sqdmlal.h (vec; acc. fwd.) 3.00 -
sqdmlal.s (vec; acc. fwd.) 3.00 -
sqdmlal2.h (vec; acc. fwd.) 3.00 -
sqdmlal2.s (vec; acc. fwd.) 3.00 -
sqdmlal.h (elem; v.h[0]) 3.00 4.00
sqdmlal.h (elem; v.h[7]) 3.00 4.00
sqdmlal.s (elem; v.s[0]) 3.00 4.00
sqdmlal.s (elem; v.s[3]) 3.00 4.00
sqdmlal2.h (elem; v.h[0]) 3.00 4.00
sqdmlal2.h (elem; v.h[7]) 3.00 4.00
sqdmlal2.s (elem; v.s[0]) 3.00 4.00
sqdmlal2.s (elem; v.s[3]) 3.00 4.00
sqrdmlah.h (vec) 3.00 4.00
sqrdmlah.s (vec) 3.00 4.00
sqrdmlah.h (vec; acc. fwd.) 3.00 -
sqrdmlah.s (vec; acc. fwd.) 3.00 -
sqrdmlah.h (elem; v.h[0]) 3.00 4.00
sqrdmlah.h (elem; v.h[7]) 3.00 4.00
sqrdmlah.s (elem; v.s[0]) 3.00 4.00
sqrdmlah.s (elem; v.s[3]) 3.00 4.00
mls.b (vec) 3.00 4.00
mls.h (vec) 3.00 4.00
mls.s (vec) 3.00 4.01
mls.b (vec; acc. fwd.) 3.00 -
mls.h (vec; acc. fwd.) 3.00 -
mls.s (vec; acc. fwd.) 3.00 -
mls.h (elem; [0]) 3.00 4.00
mls.h (elem; [7]) 3.00 4.00
mls.s (elem; [0]) 3.00 4.00
mls.s (elem; [3]) 3.00 4.01
smlsl.h (vec) 3.00 4.01
smlsl.s (vec) 3.00 4.00
smlsl2.h (vec) 3.00 4.00
smlsl2.s (vec) 3.00 4.00
smlsl.h (vec; acc. fwd.) 3.00 -
smlsl.s (vec; acc. fwd.) 3.00 -
smlsl2.h (vec; acc. fwd.) 3.00 -
smlsl2.s (vec; acc. fwd.) 3.00 -
smlsl.h (elem; v.h[0]) 3.00 4.00
smlsl.h (elem; v.h[7]) 3.00 4.00
smlsl.s (elem; v.s[0]) 3.00 4.00
smlsl.s (elem; v.s[3]) 3.00 4.00
smlsl2.h (elem; v.h[0]) 3.00 4.01
smlsl2.h (elem; v.h[7]) 3.00 4.00
smlsl2.s (elem; v.s[0]) 3.00 4.00
smlsl2.s (elem; v.s[3]) 3.00 4.00
umlsl.h (vec) 3.00 4.00
umlsl.s (vec) 3.00 4.00
umlsl2.h (vec) 3.00 4.01
umlsl2.s (vec) 3.00 4.00
umlsl.h (vec; acc. fwd.) 3.00 -
umlsl.s (vec; acc. fwd.) 3.00 -
umlsl2.h (vec; acc. fwd.) 3.00 -
umlsl2.s (vec; acc. fwd.) 3.00 -
umlsl.h (elem; v.h[0]) 3.00 4.00
umlsl.h (elem; v.h[7]) 3.00 4.01
umlsl.s (elem; v.s[0]) 3.00 4.00
umlsl.s (elem; v.s[3]) 3.00 4.00
umlsl2.h (elem; v.h[0]) 3.00 4.00
umlsl2.h (elem; v.h[7]) 3.00 4.00
umlsl2.s (elem; v.s[0]) 3.00 4.00
umlsl2.s (elem; v.s[3]) 3.00 4.00
sqdmlsl.h (vec) 3.00 4.01
sqdmlsl.s (vec) 3.00 4.00
sqdmlsl2.h (vec) 3.00 4.00
sqdmlsl2.s (vec) 3.00 4.00
sqdmlsl.h (vec; acc. fwd.) 3.00 -
sqdmlsl.s (vec; acc. fwd.) 3.00 -
sqdmlsl2.h (vec; acc. fwd.) 3.00 -
sqdmlsl2.s (vec; acc. fwd.) 3.00 -
sqdmlsl.h (elem; v.h[0]) 3.00 4.00
sqdmlsl.h (elem; v.h[7]) 3.00 4.00
sqdmlsl.s (elem; v.s[0]) 3.00 4.00
sqdmlsl.s (elem; v.s[3]) 3.00 4.00
sqdmlsl2.h (elem; v.h[0]) 3.00 4.00
sqdmlsl2.h (elem; v.h[7]) 3.00 4.01
sqdmlsl2.s (elem; v.s[0]) 3.00 4.01
sqdmlsl2.s (elem; v.s[3]) 3.00 4.00
sqrdmlsh.h (vec) 3.00 4.01
sqrdmlsh.s (vec) 3.00 4.01
sqrdmlsh.h (vec; acc. fwd.) 3.00 -
sqrdmlsh.s (vec; acc. fwd.) 3.00 -
sqrdmlsh.h (elem; v.h[0]) 3.00 4.00
sqrdmlsh.h (elem; v.h[7]) 3.00 4.00
sqrdmlsh.s (elem; v.s[0]) 3.00 4.00
sqrdmlsh.s (elem; v.s[3]) 3.00 4.00
sdot.b (vec) 3.00 4.00
sdot.b (elem; v.b[0]) 3.00 4.00
sdot.b (elem; v.b[3]) 3.00 4.00

Vector integer absolute difference accumulate

instruction latency throughput
saba.b 3.00 4.00
saba.h 3.00 4.00
saba.s 3.00 4.01
saba.b (acc. fwd.) 3.00 -
saba.h (acc. fwd.) 3.00 -
saba.s (acc. fwd.) 3.00 -
uaba.b 3.00 4.00
uaba.h 3.00 4.00
uaba.s 3.00 4.00
uaba.b (acc. fwd.) 3.00 -
uaba.h (acc. fwd.) 3.00 -
uaba.s (acc. fwd.) 3.00 -
sabal.b 3.00 4.00
sabal.h 3.00 4.00
sabal.s 3.00 4.00
sabal2.b 3.00 4.00
sabal2.h 3.00 4.00
sabal2.s 3.00 4.00
uabal.b 3.00 4.01
uabal.h 3.00 4.01
uabal.s 3.00 4.00
uabal2.b 3.00 4.01
uabal2.h 3.00 4.00
uabal2.s 3.00 4.01
sadalp.b 3.00 4.00
sadalp.h 3.00 4.00
sadalp.s 3.00 4.00
uadalp.b 3.00 4.00
uadalp.h 3.00 4.00
uadalp.s 3.00 4.00

Vector integer divide

instruction latency throughput
urecpe.s 3.00 1.00
ursqrte.s 3.00 1.00

Vector bitwise logic

instruction latency throughput
and.b 2.00 4.01
orr.b 2.00 4.00
orr.h 2.00 4.00
orr.s 2.00 4.00
orr.h 2.00 4.00
orr.s 2.00 4.00
orn.b 2.00 4.00
eor.b 2.00 4.00
eor3.b 2.00 4.00
not.b 2.00 4.00
mvn.b 2.00 4.01

Vector bit manipulation

instruction latency throughput
bic.b (reg) 2.00 4.00
bic.h (imm) 2.00 4.00
bic.h (imm; <<8) 2.00 4.00
bic.s (imm) 2.00 4.00
bic.s (imm; <<8) 2.00 4.00
bif.b 2.00 4.00
bit.b 2.00 4.00
bsl.b 2.00 4.01
bcax.b 2.00 4.00
rax1.d 2.00 4.01
xar.d 2.00 4.00
rbit.b 2.00 4.01
rev16.b 2.00 4.01
rev32.b 2.00 4.00
rev32.h 2.00 4.01
rev64.b 2.00 4.00
rev64.h 2.00 4.00
rev64.s 2.00 4.00
cls.b 2.00 4.00
cls.h 2.00 4.00
cls.s 2.00 4.01
clz.b 2.00 4.00
clz.h 2.00 4.00
clz.s 2.00 4.00
cnt.b 2.00 4.00

Vector shift

instruction latency throughput
shl.b (imm; <<2) 2.00 4.01
shl.h (imm; <<2) 2.00 4.00
shl.s (imm; <<2) 2.00 4.00
shl.d (imm; <<2) 2.00 4.00
sshl.b (reg) 2.00 4.00
sshl.h (reg) 2.00 4.00
sshl.s (reg) 2.00 4.00
sshl.d (reg) 2.00 4.00
ushl.b (reg) 2.00 4.00
ushl.h (reg) 2.00 4.00
ushl.s (reg) 2.00 4.00
ushl.d (reg) 2.00 4.00
sqshl.b (imm; <<2) 2.00 4.00
sqshl.h (imm; <<2) 2.00 4.00
sqshl.s (imm; <<2) 2.00 4.00
sqshl.d (imm; <<2) 2.00 4.00
sqshl.b (reg; <<2) 2.00 4.00
sqshl.h (reg; <<2) 2.00 4.00
sqshl.s (reg; <<2) 2.00 4.00
sqshl.d (reg; <<2) 2.00 4.00
uqshl.b (imm; <<2) 2.00 4.00
uqshl.h (imm; <<2) 2.00 4.00
uqshl.s (imm; <<2) 2.00 4.00
uqshl.d (imm; <<2) 2.00 4.00
uqshl.b (reg; <<2) 2.00 4.00
uqshl.h (reg; <<2) 2.00 4.00
uqshl.s (reg; <<2) 2.00 4.01
uqshl.d (reg; <<2) 2.00 4.00
sqshlu.b (<<2) 2.00 4.00
sqshlu.h (<<2) 2.00 4.00
sqshlu.s (<<2) 2.00 4.00
sqshlu.d (<<2) 2.00 4.00
srshl.b (reg) 3.00 4.00
srshl.h (reg) 3.00 4.00
srshl.s (reg) 3.00 4.00
srshl.d (reg) 3.00 4.00
urshl.b (reg) 3.00 4.00
urshl.h (reg) 3.00 4.00
urshl.s (reg) 3.00 4.00
urshl.d (reg) 3.00 4.01
uqrshl.b (reg) 3.00 4.00
uqrshl.h (reg) 3.00 4.00
uqrshl.s (reg) 3.00 4.01
uqrshl.d (reg) 3.00 4.00
sqshlu.b (imm; <<2) 2.00 4.00
sqshlu.h (imm; <<2) 2.00 4.01
sqshlu.s (imm; <<2) 2.00 4.00
sqshlu.d (imm; <<2) 2.00 4.01
sqrshl.b 3.00 4.00
sqrshl.h 3.00 4.00
sqrshl.s 3.00 4.00
sqrshl.d 3.00 4.01
shll.b (<<8) 2.00 4.00
shll.h (<<16) 2.00 4.01
shll.s (<<32) 2.00 4.01
shll2.b (<<8) 2.00 4.00
shll2.h (<<16) 2.00 4.00
shll2.s (<<32) 2.00 4.01
sshll.b (<<2) 2.00 4.00
sshll.h (<<2) 2.00 4.00
sshll.s (<<2) 2.00 4.00
sshll2.b (<<2) 2.00 4.00
sshll2.h (<<2) 2.00 4.00
sshll2.s (<<2) 2.00 4.00
ushll.b (<<2) 2.00 4.00
ushll.h (<<2) 2.00 4.01
ushll.s (<<2) 2.00 4.01
ushll2.b (<<2) 2.00 4.00
ushll2.h (<<2) 2.00 4.00
ushll2.s (<<2) 2.00 4.00
sli.b (vec; <<2) 2.00 4.00
sli.h (vec; <<2) 2.00 4.01
sli.s (vec; <<2) 2.00 4.00
sli.d (vec; <<2) 2.00 4.00
sshr.b (imm; >>2) 2.00 4.00
sshr.h (imm; >>2) 2.00 4.00
sshr.s (imm; >>2) 2.00 4.00
sshr.d (imm; >>2) 2.00 4.00
ushr.b (imm; >>2) 2.00 4.00
ushr.h (imm; >>2) 2.00 4.00
ushr.s (imm; >>2) 2.00 4.00
ushr.d (imm; >>2) 2.00 4.01
srshr.b (imm; >>2) 3.00 4.00
srshr.h (imm; >>2) 3.00 4.00
srshr.s (imm; >>2) 3.00 4.00
srshr.d (imm; >>2) 3.00 4.00
urshr.b (imm; >>2) 3.00 4.00
urshr.h (imm; >>2) 3.00 4.00
urshr.s (imm; >>2) 3.00 4.00
urshr.d (imm; >>2) 3.00 4.00
ssra.b (imm; >>2) 3.00 4.00
ssra.h (imm; >>2) 3.00 4.00
ssra.s (imm; >>2) 3.00 4.01
ssra.d (imm; >>2) 3.00 4.00
usra.b (imm; >>2) 3.00 4.01
usra.h (imm; >>2) 3.00 4.00
usra.s (imm; >>2) 3.00 4.01
usra.d (imm; >>2) 3.00 4.00
srsra.b (imm; >>2) 3.00 4.00
srsra.h (imm; >>2) 3.00 4.00
srsra.s (imm; >>2) 3.00 4.00
srsra.d (imm; >>2) 3.00 4.01
ursra.b (imm; >>2) 3.00 4.00
ursra.h (imm; >>2) 3.00 4.00
ursra.s (imm; >>2) 3.00 4.00
ursra.d (imm; >>2) 3.00 4.00
shrn.h (>>2) 3.00 4.01
shrn.s (>>2) 3.00 4.00
shrn.d (>>2) 3.00 4.00
shrn2.h (>>2) 3.00 4.00
shrn2.s (>>2) 3.00 4.00
shrn2.d (>>2) 3.00 4.00
sqshrn.h (>>2) 3.00 4.00
sqshrn.s (>>2) 3.00 4.00
sqshrn.d (>>2) 3.00 4.00
sqshrn2.h (>>2) 3.00 4.00
sqshrn2.s (>>2) 3.00 4.00
sqshrn2.d (>>2) 3.00 4.00
uqshrn.h (>>2) 3.00 4.00
uqshrn.s (>>2) 3.00 4.00
uqshrn.d (>>2) 3.00 4.01
uqshrn2.h (>>2) 3.00 4.00
uqshrn2.s (>>2) 3.00 4.00
uqshrn2.d (>>2) 3.00 4.00
sqshrun.h (>>2) 3.00 4.00
sqshrun.s (>>2) 3.00 4.00
sqshrun.d (>>2) 3.00 4.00
sqshrun2.h (>>2) 3.00 4.00
sqshrun2.s (>>2) 3.00 4.01
sqshrun2.d (>>2) 3.00 4.00
rshrn.h (>>2) 3.00 4.00
rshrn.s (>>2) 3.00 4.00
rshrn.d (>>2) 3.00 4.00
rshrn2.h (>>2) 3.00 4.00
rshrn2.s (>>2) 3.00 4.00
rshrn2.d (>>2) 3.00 4.01
sqrshrn.h (>>2) 3.00 4.00
sqrshrn.s (>>2) 3.00 4.00
sqrshrn.d (>>2) 3.00 4.00
sqrshrn2.h (>>2) 3.00 4.00
sqrshrn2.s (>>2) 3.00 4.00
sqrshrn2.d (>>2) 3.00 4.00
uqrshrn.h (>>2) 3.00 4.00
uqrshrn.s (>>2) 3.00 4.00
uqrshrn.d (>>2) 3.00 4.00
uqrshrn2.h (>>2) 3.00 4.00
uqrshrn2.s (>>2) 3.00 4.00
uqrshrn2.d (>>2) 3.00 4.00
sqrshrun.h (>>2) 3.00 4.00
sqrshrun.s (>>2) 3.00 4.00
sqrshrun.d (>>2) 3.00 4.00
sqrshrun2.h (>>2) 3.00 4.00
sqrshrun2.s (>>2) 3.00 4.00
sqrshrun2.d (>>2) 3.00 4.00
sri.b (vec; >>2) 2.00 4.00
sri.h (vec; >>2) 2.00 4.00
sri.s (vec; >>2) 2.00 4.00
sri.d (vec; >>2) 2.00 4.00

Vector integer compare

instruction latency throughput
cmeq.b (reg) 2.00 4.00
cmeq.h (reg) 2.00 4.00
cmeq.s (reg) 2.00 4.00
cmeq.d (reg) 2.00 4.00
cmeq.b (zero) 2.00 4.00
cmeq.h (zero) 2.00 4.00
cmeq.s (zero) 2.00 4.00
cmeq.d (zero) 2.00 4.00
cmge.b (reg) 2.00 4.01
cmge.h (reg) 2.00 4.01
cmge.s (reg) 2.00 4.01
cmge.d (reg) 2.00 4.00
cmge.b (zero) 2.00 4.00
cmge.h (zero) 2.00 4.00
cmge.s (zero) 2.00 4.00
cmge.d (zero) 2.00 4.00
cmgt.b (reg) 2.00 4.00
cmgt.h (reg) 2.00 4.00
cmgt.s (reg) 2.00 4.00
cmgt.d (reg) 2.00 4.00
cmgt.b (zero) 2.00 4.00
cmgt.h (zero) 2.00 4.00
cmgt.s (zero) 2.00 4.00
cmgt.d (zero) 2.00 4.00
cmle.b (zero) 2.00 4.00
cmle.h (zero) 2.00 4.01
cmle.s (zero) 2.00 4.00
cmle.d (zero) 2.00 4.01
cmlt.b (zero) 2.00 4.00
cmlt.h (zero) 2.00 4.00
cmlt.s (zero) 2.00 4.00
cmlt.d (zero) 2.00 4.00
cmhi.b (reg) 2.00 4.01
cmhi.h (reg) 2.00 4.00
cmhi.s (reg) 2.00 4.00
cmhi.d (reg) 2.00 4.00
cmhs.b (reg) 2.00 4.00
cmhs.h (reg) 2.00 4.00
cmhs.s (reg) 2.00 4.00
cmhs.d (reg) 2.00 4.01
cmtst.b (reg) 2.00 4.00
cmtst.h (reg) 2.00 4.00
cmtst.s (reg) 2.00 4.01
cmtst.d (reg) 2.00 4.00

Vector permute

instruction latency throughput
ext.b (>>1) 2.00 4.00
ext.b (>>15) 2.00 4.00
tbl (len == 1) 2.00 4.00
tbl (len == 2) 2.00 4.00
tbl (len == 3) 4.00 2.00
tbl (len == 4) 4.00 1.33
tbx (len == 1) 2.00 4.00
tbx (len == 2) 4.00 2.00
tbx (len == 3) 6.00 1.33
tbx (len == 4) 8.00 0.98
trn1.b 2.00 4.00
trn2.b 2.00 4.00
trn1.h 2.00 4.00
trn2.h 2.00 4.00
trn1.s 2.00 4.00
trn2.s 2.00 4.01
trn1.d 2.00 4.00
trn2.d 2.00 4.00
zip1.b 2.00 4.00
zip2.b 2.00 4.00
zip1.h 2.00 4.00
zip2.h 2.00 4.01
zip1.s 2.00 4.01
zip2.s 2.00 4.00
zip1.d 2.00 4.01
zip2.d 2.00 4.00

Floating point add, sub, abs, and neg

instruction latency throughput
fadd.h (scl) 3.00 4.00
fadd.s (scl) 3.00 4.00
fadd.d (scl) 3.00 4.00
fadd.h (vec) 3.00 4.00
fadd.s (vec) 3.00 4.00
fadd.d (vec) 3.00 4.00
faddp.h (scl) 3.00 4.00
faddp.s (scl) 3.00 4.00
faddp.d (scl) 3.00 4.00
faddp.h (vec) 3.00 4.00
faddp.s (vec) 3.00 4.00
faddp.d (vec) 3.00 4.01
fsub.h (scl) 3.00 4.00
fsub.s (scl) 3.00 4.00
fsub.d (scl) 3.00 4.00
fsub.h (vec) 3.00 4.00
fsub.s (vec) 3.00 4.00
fsub.d (vec) 3.00 4.00
fcadd.h (deg = 90) 3.00 4.00
fcadd.s (deg = 90) 3.00 4.01
fcadd.d (deg = 90) 3.00 4.00
fabs.h (scl) 2.00 4.00
fabs.s (scl) 2.00 4.01
fabs.d (scl) 2.00 4.00
fabs.h (vec) 2.00 4.00
fabs.s (vec) 2.00 4.00
fabs.d (vec) 2.00 4.00
fabd.h (scl) 3.00 4.00
fabd.s (scl) 3.00 4.00
fabd.d (scl) 3.00 4.00
fabd.h (vec) 3.00 4.00
fabd.s (vec) 3.00 4.00
fabd.d (vec) 3.00 4.00
fneg.h (scl) 2.00 4.00
fneg.s (scl) 2.00 4.00
fneg.d (scl) 2.00 4.00
fneg.h (vec) 2.00 4.00
fneg.s (vec) 2.00 4.00
fneg.d (vec) 2.00 4.01

Floating point max / min

instruction latency throughput
fmax.h (scl) 2.00 4.01
fmax.s (scl) 2.00 4.00
fmax.d (scl) 2.00 4.00
fmax.h (vec) 2.00 4.00
fmax.s (vec) 2.00 4.00
fmax.d (vec) 2.00 4.00
fmaxp.h (scl) 2.00 4.00
fmaxp.s (scl) 2.00 4.00
fmaxp.d (scl) 2.00 4.00
fmaxp.h (vec) 2.00 4.00
fmaxp.s (vec) 2.00 4.00
fmaxp.d (vec) 2.00 4.00
fmaxv.h 3.00 4.01
fmaxv.s 3.00 4.01
fmaxnm.h (scl) 2.00 4.00
fmaxnm.s (scl) 2.00 4.00
fmaxnm.d (scl) 2.00 4.01
fmaxnm.h (vec) 2.00 4.00
fmaxnm.s (vec) 2.00 4.00
fmaxnm.d (vec) 2.00 4.00
fmaxnmp.h (scl) 2.00 4.00
fmaxnmp.s (scl) 2.00 4.01
fmaxnmp.d (scl) 2.00 4.00
fmaxnmp.h (vec) 2.00 4.00
fmaxnmp.s (vec) 2.00 4.00
fmaxnmp.d (vec) 2.00 4.00
fmaxnmv.h 3.00 4.00
fmaxnmv.s 3.00 4.00
fmin.h (scl) 2.00 4.00
fmin.s (scl) 2.00 4.00
fmin.d (scl) 2.00 4.00
fmin.h (vec) 2.00 4.00
fmin.s (vec) 2.00 4.00
fmin.d (vec) 2.00 4.00
fminp.h (scl) 2.00 4.00
fminp.s (scl) 2.00 4.00
fminp.d (scl) 2.00 4.00
fminp.h (vec) 2.00 4.00
fminp.s (vec) 2.00 4.00
fminp.d (vec) 2.00 4.00
fminv.h 3.00 4.00
fminv.s 3.00 4.00
fminnm.h (scl) 2.00 4.01
fminnm.s (scl) 2.00 4.00
fminnm.d (scl) 2.00 4.01
fminnm.h (vec) 2.00 4.00
fminnm.s (vec) 2.00 4.01
fminnm.d (vec) 2.00 4.00
fminnmp.h (scl) 2.00 4.00
fminnmp.s (scl) 2.00 4.00
fminnmp.d (scl) 2.00 4.00
fminnmp.h (vec) 2.00 4.00
fminnmp.s (vec) 2.00 4.00
fminnmp.d (vec) 2.00 4.00
fminnmv.h 3.00 4.01
fminnmv.s 3.00 4.00

Floating point multiply

instruction latency throughput
fmul.h (scl) 4.00 4.00
fmul.s (scl) 4.00 4.00
fmul.d (scl) 4.00 4.00
fmul.h (vec) 4.00 4.01
fmul.s (vec) 4.00 4.00
fmul.d (vec) 4.00 4.00
fmulx.h (scl) 4.00 4.00
fmulx.s (scl) 4.00 4.01
fmulx.d (scl) 4.00 4.00
fmulx.h (vec) 4.00 4.00
fmulx.s (vec) 4.00 4.00
fmulx.d (vec) 4.00 4.01
fnmul.h (scl) 4.00 4.00
fnmul.s (scl) 4.00 4.00
fnmul.d (scl) 4.00 4.01

Floating point multiply-accumulate and fused-multiply add

instruction latency throughput
fmla.h (vec) 4.00 4.00
fmla.s (vec) 4.00 4.00
fmla.d (vec) 4.00 4.00
fmla.h (vec; acc. fwd.) 4.00 -
fmla.s (vec; acc. fwd.) 4.00 -
fmla.d (vec; acc. fwd.) 4.00 -
fmla.h (elem; [7]) 4.00 3.43
fmla.s (elem; [3]) 4.00 3.43
fmla.d (elem; [1]) 4.00 3.43
fmlal.h (vec) 4.00 4.00
fmlal2.h (vec) 4.00 4.00
fmlal.h (vec; acc. fwd.) 4.00 -
fmlal2.h (vec; acc. fwd.) 4.00 -
fmls.h (vec) 4.00 4.00
fmls.s (vec) 4.00 4.00
fmls.d (vec) 4.00 4.00
fmls.h (vec; acc. fwd.) 4.00 -
fmls.s (vec; acc. fwd.) 4.00 -
fmls.d (vec; acc. fwd.) 4.00 -
fmls.h (elem; [7]) 4.00 3.42
fmls.s (elem; [3]) 4.00 3.43
fmls.d (elem; [1]) 4.00 3.43
fmlsl.h (vec) 4.00 4.00
fmlsl2.h (vec) 4.00 4.00
fmlsl.h (vec; acc. fwd.) 4.00 -
fmlsl2.h (vec; acc. fwd.) 4.00 -
fmadd.h 4.00 4.00
fmadd.s 4.00 4.01
fmadd.d 4.00 4.01
fmadd.h (acc. fwd.) 4.00 -
fmadd.s (acc. fwd.) 4.00 -
fmadd.d (acc. fwd.) 4.00 -
fmsub.h 4.00 4.00
fmsub.s 4.00 4.00
fmsub.d 4.00 4.00
fmsub.h (acc. fwd.) 4.00 -
fmsub.s (acc. fwd.) 4.00 -
fmsub.d (acc. fwd.) 4.00 -
fnmadd.h 4.00 4.00
fnmadd.s 4.00 4.00
fnmadd.d 4.00 4.00
fnmadd.h (acc. fwd.) 4.00 -
fnmadd.s (acc. fwd.) 4.00 -
fnmadd.d (acc. fwd.) 4.00 -
fnmsub.h 4.00 4.00
fnmsub.s 4.00 4.00
fnmsub.d 4.00 4.00
fnmsub.h (acc. fwd.) 4.00 -
fnmsub.s (acc. fwd.) 4.00 -
fnmsub.d (acc. fwd.) 4.00 -
fcmla.h (vec; deg = 0) 4.00 4.00
fcmla.s (vec; deg = 0) 4.00 4.01
fcmla.d (vec; deg = 0) 4.00 4.00
fcmla.h (vec; deg = 90) 4.00 4.00
fcmla.s (vec; deg = 90) 4.00 4.00
fcmla.d (vec; deg = 90) 4.00 4.00
fcmla.h (vec; deg = 0; acc. fwd.) 4.00 -
fcmla.s (vec; deg = 0; acc. fwd.) 4.00 -
fcmla.d (vec; deg = 0; acc. fwd.) 4.00 -
fcmla.h (vec; deg = 90; acc. fwd.) 4.00 -
fcmla.s (vec; deg = 90; acc. fwd.) 4.00 -
fcmla.d (vec; deg = 90; acc. fwd.) 4.00 -
fcmla.h (elem; deg = 0; v.h[7]) 4.00 3.43
fcmla.s (elem; deg = 0; v.s[3]) 4.00 3.43
fcmla.h (elem; deg = 90; v.h[7]) 4.00 3.43
fcmla.s (elem; deg = 90; v.s[3]) 4.00 3.43

Floating point divide and reciprocal

instruction latency throughput
frecpe.h (scl) 3.00 1.00
frecpe.s (scl) 3.00 1.00
frecpe.d (scl) 3.00 1.00
frecpe.h (vec) 3.00 1.00
frecpe.s (vec) 3.00 1.00
frecpe.d (vec) 3.00 1.00
frecps.h (scl) 4.00 4.00
frecps.s (scl) 4.00 4.00
frecps.d (scl) 4.00 4.00
frecps.h (vec) 4.00 4.00
frecps.s (vec) 4.00 4.00
frecps.d (vec) 4.00 4.00
frecpx.h (scl) 3.00 1.00
frecpx.s (scl) 3.00 1.00
frecpx.d (scl) 3.00 1.00

Floating point math

instruction latency throughput
fsqrt.h (scl) 8.00 0.50
fsqrt.s (scl) 10.00 0.50
fsqrt.d (scl) 13.00 0.50
fsqrt.h (vec) 8.00 0.50
fsqrt.s (vec) 10.00 0.50
fsqrt.d (vec) 13.00 0.50
frsqrte.h (scl) 3.00 1.00
frsqrte.s (scl) 3.00 1.00
frsqrte.d (scl) 3.00 1.00
frsqrte.h (vec) 3.00 1.00
frsqrte.s (vec) 3.00 1.00
frsqrte.d (vec) 3.00 1.00
frsqrts.h (scl) 4.00 4.00
frsqrts.s (scl) 4.00 4.01
frsqrts.d (scl) 4.00 4.00
frsqrts.h (vec) 4.00 4.00
frsqrts.s (vec) 4.00 4.00
frsqrts.d (vec) 4.00 4.00

Floating point compare

instruction latency throughput
facge.h (scl) 2.00 4.00
facge.s (scl) 2.00 4.00
facge.d (scl) 2.00 4.00
facge.h (vec) 2.00 4.00
facge.s (vec) 2.00 4.00
facge.d (vec) 2.00 4.00
facgt.h (scl) 2.00 4.00
facgt.s (scl) 2.00 4.00
facgt.d (scl) 2.00 4.01
facgt.h (vec) 2.00 4.01
facgt.s (vec) 2.00 4.00
facgt.d (vec) 2.00 4.00
fcmp.h (reg) 2.00 1.00
fcmp.h (zero) 2.00 1.00
fcmp.s (reg) 2.00 1.00
fcmp.s (zero) 2.00 1.00
fcmp.d (reg) 2.00 1.00
fcmp.d (zero) 2.00 1.00
fcmpe.h (reg) 2.00 1.00
fcmpe.h (zero) 2.00 1.00
fcmpe.s (reg) 2.00 1.00
fcmpe.s (zero) 2.00 1.00
fcmpe.d (reg) 2.00 1.00
fcmpe.d (zero) 2.00 1.00
fccmp.h (eq) 2.00 1.00
fccmp.h (le) 2.00 1.00
fccmp.s (eq) 2.00 1.00
fccmp.s (le) 2.00 1.00
fccmp.d (eq) 2.00 1.00
fccmp.d (le) 2.00 1.00
fccmpe.h (eq) 2.00 1.00
fccmpe.h (le) 2.00 1.00
fccmpe.s (eq) 2.00 1.00
fccmpe.s (le) 2.00 1.00
fccmpe.d (eq) 2.00 1.00
fccmpe.d (le) 2.00 1.00
fcmeq.h (scl) 2.00 4.01
fcmeq.s (scl) 2.00 4.00
fcmeq.d (scl) 2.00 4.00
fcmeq.h (vec) 2.00 4.00
fcmeq.s (vec) 2.00 4.00
fcmeq.d (vec) 2.00 4.00
fcmeq.h (scl; zero) 2.00 4.00
fcmeq.s (scl; zero) 2.00 4.00
fcmeq.d (scl; zero) 2.00 4.00
fcmeq.h (vec; zero) 2.00 4.00
fcmeq.s (vec; zero) 2.00 4.00
fcmeq.d (vec; zero) 2.00 4.00
fcmge.h (scl) 2.00 4.00
fcmge.s (scl) 2.00 4.00
fcmge.d (scl) 2.00 4.01
fcmge.h (vec) 2.00 4.01
fcmge.s (vec) 2.00 4.00
fcmge.d (vec) 2.00 4.00
fcmge.h (scl; zero) 2.00 4.00
fcmge.s (scl; zero) 2.00 4.00
fcmge.d (scl; zero) 2.00 4.01
fcmge.h (vec; zero) 2.00 4.00
fcmge.s (vec; zero) 2.00 4.00
fcmge.d (vec; zero) 2.00 4.00
fcmgt.h (scl) 2.00 4.00
fcmgt.s (scl) 2.00 4.01
fcmgt.d (scl) 2.00 4.00
fcmgt.h (vec) 2.00 4.00
fcmgt.s (vec) 2.00 4.01
fcmgt.d (vec) 2.00 4.00
fcmgt.h (scl; zero) 2.00 4.00
fcmgt.s (scl; zero) 2.00 4.00
fcmgt.d (scl; zero) 2.00 4.00
fcmgt.h (vec; zero) 2.00 4.00
fcmgt.s (vec; zero) 2.00 4.00
fcmgt.d (vec; zero) 2.00 4.00
fcmle.h (scl; zero) 2.00 4.00
fcmle.s (scl; zero) 2.00 4.00
fcmle.d (scl; zero) 2.00 4.00
fcmle.h (vec; zero) 2.00 4.01
fcmle.s (vec; zero) 2.00 4.00
fcmle.d (vec; zero) 2.00 4.01
fcmlt.h (scl; zero) 2.00 4.00
fcmlt.s (scl; zero) 2.00 4.00
fcmlt.d (scl; zero) 2.00 4.00
fcmlt.h (vec; zero) 2.00 4.00
fcmlt.s (vec; zero) 2.00 4.00
fcmlt.d (vec; zero) 2.00 4.00

Floating point conditional select

instruction latency throughput
fcsel.h (eq) 2.00 2.00
fcsel.h (le) 2.00 2.00
fcsel.s (eq) 2.00 2.00
fcsel.s (le) 2.00 2.00
fcsel.d (eq) 2.00 2.00
fcsel.d (le) 2.00 2.00

Floating point convert

instruction latency throughput
scvtf.h (scl; >>2) 8.00 3.00
scvtf.s (scl; >>2) 7.99 3.00
scvtf.d (scl; >>2) 8.00 3.00
scvtf.h (scl; int) 8.00 3.00
scvtf.s (scl; int) 8.00 3.00
scvtf.d (scl; int) 7.99 3.00
scvtf.h (vec; >>2) 3.00 4.00
scvtf.s (vec; >>2) 3.00 4.00
scvtf.d (vec; >>2) 3.00 4.00
scvtf.h (vec; int) 3.00 4.00
scvtf.s (vec; int) 3.00 4.00
scvtf.d (vec; int) 3.00 4.00
fcvt (h -> s) 3.00 4.00
fcvt (h -> d) 3.00 4.01
fcvt (s -> h) 3.00 4.01
fcvt (s -> d) 3.00 4.00
fcvt (d -> h) 3.00 4.00
fcvt (d -> s) 3.00 4.00
fcvtl (h -> s) 3.00 4.01
fcvtl (s -> d) 3.00 4.00
fcvtl2 (h -> s) 3.00 4.00
fcvtl2 (s -> d) 3.00 4.00
fcvtn (s -> h) 3.00 4.00
fcvtn (d -> s) 3.00 4.01
fcvtn2 (s -> h) 3.00 4.00
fcvtn2 (d -> s) 3.00 4.00
fcvtxn 3.00 4.00
fcvtxn2 3.00 4.00
fcvtas.h (scl) 3.00 4.00
fcvtas.s (scl) 3.00 4.00
fcvtas.d (scl) 3.00 4.00
fcvtas.h (vec) 3.00 4.00
fcvtas.s (vec) 3.00 4.00
fcvtas.d (vec) 3.00 4.00
fcvtas.h (scl -> reg) 10.00 2.00
fcvtas.s (scl -> reg) 10.00 2.00
fcvtas.d (scl -> reg) 10.00 2.00
fcvtau.h (scl) 3.00 4.00
fcvtau.s (scl) 3.00 4.00
fcvtau.d (scl) 3.00 4.00
fcvtau.h (vec) 3.00 4.00
fcvtau.s (vec) 3.00 4.00
fcvtau.d (vec) 3.00 4.00
fcvtau.h (scl -> reg) 10.00 2.00
fcvtau.s (scl -> reg) 10.00 2.00
fcvtau.d (scl -> reg) 10.00 2.00
fjcvtzs 10.00 1.00
frinta.h (scl) 3.00 4.00
frinta.s (scl) 3.00 4.00
frinta.d (scl) 3.00 4.00
frinta.h (vec) 3.00 4.00
frinta.s (vec) 3.00 4.00
frinta.d (vec) 3.00 4.01
frinti.h (scl) 3.00 4.00
frinti.s (scl) 3.00 4.00
frinti.d (scl) 3.00 4.00
frinti.h (vec) 3.00 4.00
frinti.s (vec) 3.00 4.00
frinti.d (vec) 3.00 4.00
frintm.h (scl) 3.00 4.00
frintm.s (scl) 3.00 4.01
frintm.d (scl) 3.00 4.00
frintm.h (vec) 3.00 4.00
frintm.s (vec) 3.00 4.00
frintm.d (vec) 3.00 4.00
frintn.h (scl) 3.00 4.00
frintn.s (scl) 3.00 4.00
frintn.d (scl) 3.00 4.00
frintn.h (vec) 3.00 4.00
frintn.s (vec) 3.00 4.00
frintn.d (vec) 3.00 4.00
frintp.h (scl) 3.00 4.00
frintp.s (scl) 3.00 4.00
frintp.d (scl) 3.00 4.00
frintp.h (vec) 3.00 4.00
frintp.s (vec) 3.00 4.00
frintp.d (vec) 3.00 4.00
frintx.h (scl) 3.00 4.01
frintx.s (scl) 3.00 4.00
frintx.d (scl) 3.00 4.00
frintx.h (vec) 3.00 4.00
frintx.s (vec) 3.00 4.00
frintx.d (vec) 3.00 4.00
frintz.h (scl) 3.00 4.00
frintz.s (scl) 3.00 4.00
frintz.d (scl) 3.00 4.00
frintz.h (vec) 3.00 4.00
frintz.s (vec) 3.00 4.00
frintz.d (vec) 3.00 4.00