Skip to content

Commit

Permalink
crypto: arm64/gcm - implement native driver using v8 Crypto Extensions
Browse files Browse the repository at this point in the history
Currently, the AES-GCM implementation for arm64 systems that support the
ARMv8 Crypto Extensions is based on the generic GCM module, which combines
the AES-CTR implementation using AES instructions with the PMULL based
GHASH driver. This is suboptimal, given the fact that the input data needs
to be loaded twice, once for the encryption and again for the MAC
calculation.

On Cortex-A57 (r1p2) and other recent cores that implement micro-op fusing
for the AES instructions, AES executes at less than 1 cycle per byte, which
means that any cycles wasted on loading the data twice hurt even more.

So implement a new GCM driver that combines the AES and PMULL instructions
at the block level. This improves performance on Cortex-A57 by ~37% (from
3.5 cpb to 2.6 cpb).

Signed-off-by: Ard Biesheuvel <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
  • Loading branch information
Ard Biesheuvel authored and herbertx committed Aug 4, 2017
1 parent ec808bb commit 537c144
Show file tree
Hide file tree
Showing 3 changed files with 591 additions and 26 deletions.
4 changes: 3 additions & 1 deletion arch/arm64/crypto/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,12 @@ config CRYPTO_SHA2_ARM64_CE
select CRYPTO_SHA256_ARM64

# GHASH and combined AES-GCM driver using the ARMv8 Crypto Extensions.
# The entry carries exactly one prompt: the older GHASH-only prompt string
# was superseded when the native AES-GCM driver was added.
# NOTE(review): CRYPTO_AES and CRYPTO_AES_ARM64 are presumably selected for
# key expansion and/or a non-NEON fallback in the glue code - confirm there.
config CRYPTO_GHASH_ARM64_CE
	tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
	depends on KERNEL_MODE_NEON
	select CRYPTO_HASH
	select CRYPTO_GF128MUL
	select CRYPTO_AES
	select CRYPTO_AES_ARM64

config CRYPTO_CRCT10DIF_ARM64_CE
tristate "CRCT10DIF digest algorithm using PMULL instructions"
Expand Down
175 changes: 175 additions & 0 deletions arch/arm64/crypto/ghash-ce-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,178 @@ CPU_LE( rev64 T1.16b, T1.16b )
st1 {XL.2d}, [x1]
ret
ENDPROC(pmull_ghash_update)

/*
 * Register aliases used by the combined AES-GCM code below.
 *
 *   KS  - keystream block that is XORed with the input block
 *   CTR - counter block that is run through the AES rounds
 *   INP - current input block (and, once XORed with KS, the output block)
 *
 * NOTE(review): v8-v15 are callee-saved (low 64 bits) under AAPCS64 and
 * nothing here saves or restores them; presumably every caller runs with
 * the NEON state already preserved (the Kconfig entry depends on
 * KERNEL_MODE_NEON) - confirm against the glue code.
 */
KS .req v8
CTR .req v9
INP .req v10

/*
 * load_round_keys - load an expanded AES key schedule into v17-v31
 *
 *   rounds : general purpose register holding the AES round count
 *            (below 12 -> 128 bit key, 12 -> 192 bit key, above -> 256 bit)
 *   rk     : general purpose register pointing at the first round key
 *
 * The schedule is right-aligned in the register file so that the last
 * round key always ends up in v31:
 *   128 bit keys use v21-v31, 192 bit keys v19-v31, 256 bit keys v17-v31.
 *
 * Clobbers the condition flags. The pointer register is post-incremented
 * by every load except the last one, so it does not point at the start of
 * the schedule afterwards.
 */
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	b.lo		2222f				/* 128 bits */
	b.eq		1111f				/* 192 bits */
	ld1		{v17.4s, v18.4s}, [\rk], #32	/* 256 bits only */
1111:	ld1		{v19.4s, v20.4s}, [\rk], #32	/* 192 and 256 bits */
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64	/* all key sizes */
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s, v30.4s, v31.4s}, [\rk]
	.endm

/*
 * enc_round - one non-final AES encryption round, performed in place
 *
 *   state : name of the vector register holding the AES state (no suffix)
 *   key   : name of the vector register holding the round key (no suffix)
 *
 * Emits aese on the state with the given round key, immediately followed
 * by aesmc on the result. The final AES round is not covered by this
 * macro: callers emit a bare aese plus an eor with the last round key.
 * NOTE(review): the two instructions are presumably kept back to back so
 * that cores which fuse AES instruction pairs can do so - confirm before
 * scheduling anything in between.
 */
.macro enc_round, state, key
aese \state\().16b, \key\().16b
aesmc \state\().16b, \state\().16b
.endm

/*
 * enc_block - AES-encrypt the block held in a vector register, in place
 *
 *   state  : name of the vector register holding the block (no suffix)
 *   rounds : general purpose register holding the AES round count
 *            (below 12 -> 128 bit key, 12 -> 192 bit key, above -> 256 bit)
 *
 * Expects the round keys in v17-v31 in the layout that load_round_keys
 * produces (last round key in v31). Clobbers the condition flags.
 */
	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f			/* 128 bits */
	b.eq		1111f			/* 192 bits */
	enc_round	\state, v17		/* 256 bits only */
	enc_round	\state, v18
1111:	enc_round	\state, v19		/* 192 and 256 bits */
	enc_round	\state, v20
2222:	enc_round	\state, v21		/* rounds shared by all key sizes */
	enc_round	\state, v22
	enc_round	\state, v23
	enc_round	\state, v24
	enc_round	\state, v25
	enc_round	\state, v26
	enc_round	\state, v27
	enc_round	\state, v28
	enc_round	\state, v29
	aese		\state\().16b, v30.16b	/* final round: no aesmc */
	eor		\state\().16b, \state\().16b, v31.16b	/* last round key */
	.endm

/*
 * pmull_gcm_do_crypt - shared body of pmull_gcm_encrypt (enc == 1) and
 * pmull_gcm_decrypt (enc == 0)
 *
 * Processes w0 16-byte blocks; for each block the AES rounds on the
 * counter block are interleaved with the GHASH update of the digest.
 *
 * Register usage, following the C prototypes of the two entry points:
 *   w0 - number of blocks          x1 - dg[], the GHASH digest (in/out)
 *   x2 - dst                       x3 - src
 *   x4 - GHASH key                 x5 - ctr[], 16 byte counter block
 *   w6 - AES round count           x7 - ks[], keystream block (enc == 1)
 *   x8 - lower 64 bits of the counter, x9 - byte-reversed copy of x8
 *
 * SHASH, SHASH2, XL, XM, XH, T1, T2, IN1 and MASK are register aliases
 * defined outside this section of the file.
 *
 * The AES round keys are read from v17-v31 and are not loaded here.
 * NOTE(review): presumably the caller has loaded them beforehand via
 * pmull_gcm_encrypt_block with a non-NULL key pointer - confirm.
 *
 * NOTE(review): the loop decrements w0 before testing it, so a block
 * count of zero wraps around instead of returning immediately; callers
 * must pass at least one block.
 *
 * NOTE(review): the counter increment is a 64-bit add on the lower half
 * only and the upper half is reloaded unchanged from memory on every
 * iteration; presumably callers guarantee that no wrap-around that would
 * matter can occur - confirm.
 */
.macro pmull_gcm_do_crypt, enc
ld1 {SHASH.2d}, [x4]
ld1 {XL.2d}, [x1]
ldr x8, [x5, #8] // load lower counter

/*
 * Set up the constants of the GHASH multiplication: every 64-bit lane of
 * MASK becomes 0xe1e1e1e1e1e1e1e1 << 57 == 0xc200000000000000, and both
 * halves of SHASH2 become the XOR of the two halves of SHASH (the operand
 * of the middle product below). The rev in between, emitted via CPU_LE,
 * converts the counter to native byte order so it can be incremented.
 */
movi MASK.16b, #0xe1
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE( rev x8, x8 )
shl MASK.2d, MASK.2d, #57
eor SHASH2.16b, SHASH2.16b, SHASH.16b

/*
 * Encryption runs one keystream block ahead: the keystream for the first
 * block is taken from ks[], and the keystream produced in each iteration
 * is consumed by the next one.
 */
.if \enc == 1
ld1 {KS.16b}, [x7]
.endif

/*
 * Main loop, one block per iteration. Build the counter block from the
 * upper half in memory and the byte-reversed running lower half, then
 * advance the running counter and the block count.
 */
0: ld1 {CTR.8b}, [x5] // load upper counter
ld1 {INP.16b}, [x3], #16
rev x9, x8
add x8, x8, #1
sub w0, w0, #1
ins CTR.d[1], x9 // set lower counter

/*
 * When encrypting, the output block is produced and stored before it is
 * hashed, so that the digest is computed over the encrypted data.
 */
.if \enc == 1
eor INP.16b, INP.16b, KS.16b // encrypt input
st1 {INP.16b}, [x2], #16
.endif

/* byte-reverse each 64-bit half of the block that is fed into GHASH */
rev64 T1.16b, INP.16b

/*
 * 192 and 256 bit keys take the extra rounds at 2: below (placed after
 * the ret) and rejoin at 1:, so the 128 bit case falls straight through.
 */
cmp w6, #12
b.ge 2f // AES-192/256?

/*
 * From here on the AES rounds on CTR alternate with the GHASH update of
 * XL. The two instruction streams use disjoint registers.
 */
1: enc_round CTR, v21

/* fold the input block into the digest */
ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8

enc_round CTR, v22

eor T1.16b, T1.16b, T2.16b
eor XL.16b, XL.16b, IN1.16b

enc_round CTR, v23

/* 128 x 128 bit carry-less multiply by SHASH using three pmull products */
pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
eor T1.16b, T1.16b, XL.16b

enc_round CTR, v24

pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)

enc_round CTR, v25

/* combine the three partial products */
ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, XL.16b, XH.16b
eor XM.16b, XM.16b, T1.16b

enc_round CTR, v26

/* first of the two multiplications by the MASK constant */
eor XM.16b, XM.16b, T2.16b
pmull T2.1q, XL.1d, MASK.1d

enc_round CTR, v27

mov XH.d[0], XM.d[1]
mov XM.d[1], XL.d[0]

enc_round CTR, v28

eor XL.16b, XM.16b, T2.16b

enc_round CTR, v29

ext T2.16b, XL.16b, XL.16b, #8

/* final AES round: aese without aesmc, last round key applied below */
aese CTR.16b, v30.16b

/* second multiplication by the MASK constant */
pmull XL.1q, XL.1d, MASK.1d
eor T2.16b, T2.16b, XH.16b

/* KS = encrypted counter block, i.e. the keystream */
eor KS.16b, CTR.16b, v31.16b

/* XL = updated digest */
eor XL.16b, XL.16b, T2.16b

/* when decrypting, the keystream just computed is applied right away */
.if \enc == 0
eor INP.16b, INP.16b, KS.16b
st1 {INP.16b}, [x2], #16
.endif

cbnz w0, 0b

/* write back the digest and the advanced lower counter half */
CPU_LE( rev x8, x8 )
st1 {XL.2d}, [x1]
str x8, [x5, #8] // store lower counter

/* hand the keystream block for the next block back to the caller */
.if \enc == 1
st1 {KS.16b}, [x7]
.endif

ret

/*
 * Extra rounds for the larger key sizes. The flags tested here are still
 * the ones set by the cmp against 12 ahead of the branch to this label.
 */
2: b.eq 3f // AES-192?
enc_round CTR, v17
enc_round CTR, v18
3: enc_round CTR, v19
enc_round CTR, v20
b 1b
.endm

/*
 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
 * struct ghash_key const *k, u8 ctr[],
 * int rounds, u8 ks[])
 *
 * Encrypts 'blocks' 16-byte blocks from src to dst and folds each output
 * block into the GHASH digest dg[].
 *
 * ks[] is both input and output: on entry it must hold the keystream block
 * for the first input block, on return it holds the keystream block
 * produced from the last counter value that was used. The lower 64 bits of
 * ctr[] are advanced by one per block and written back; dg[] is updated in
 * place.
 *
 * The AES round keys are read from v17-v31 and are not loaded by this
 * function.
 * NOTE(review): 'blocks' must be at least 1, since the loop decrements the
 * count before testing it.
 */
ENTRY(pmull_gcm_encrypt)
pmull_gcm_do_crypt 1
ENDPROC(pmull_gcm_encrypt)

/*
 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
 * struct ghash_key const *k, u8 ctr[],
 * int rounds)
 *
 * Decrypts 'blocks' 16-byte blocks from src to dst and folds each input
 * block into the GHASH digest dg[].
 *
 * Unlike the encrypt variant there is no ks[] argument: the keystream
 * block is computed and applied within the same loop iteration. The lower
 * 64 bits of ctr[] are advanced by one per block and written back; dg[] is
 * updated in place.
 *
 * The AES round keys are read from v17-v31 and are not loaded by this
 * function.
 * NOTE(review): 'blocks' must be at least 1, since the loop decrements the
 * count before testing it.
 */
ENTRY(pmull_gcm_decrypt)
pmull_gcm_do_crypt 0
ENDPROC(pmull_gcm_decrypt)

/*
 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
 *
 * AES-encrypt the single 16-byte block at src and store the result at dst.
 *
 *   x0 - dst
 *   x1 - src
 *   x2 - rk, the expanded key schedule, or NULL
 *   w3 - rounds, the AES round count
 *
 * A non-NULL rk is loaded into v17-v31 before the block is encrypted. With
 * a NULL rk the load is skipped and whatever those registers currently
 * hold is used as the key schedule.
 */
ENTRY(pmull_gcm_encrypt_block)
	ld1		{v0.16b}, [x1]		/* fetch the input block */
	cbz		x2, 0f			/* no key pointer: keep v17-v31 */
	load_round_keys	w3, x2
0:	enc_block	v0, w3
	st1		{v0.16b}, [x0]		/* write the encrypted block */
	ret
ENDPROC(pmull_gcm_encrypt_block)
Loading

0 comments on commit 537c144

Please sign in to comment.