crypto: arm64/sm4 - add CE implementation for XTS mode

This patch adds a CE-optimized assembly implementation for XTS mode.

Benchmarks were run on a T-Head Yitian-710 at 2.75 GHz. The data comes from
mode 218 of tcrypt and compares performance before and after this patch (the
driver used before this patch is xts(ecb-sm4-ce)). Each column is a block
length in bytes; the figures are throughput in Mb/s:

Before:

xts(ecb-sm4-ce) |      16       64      128      256     1024     1420     4096
----------------+--------------------------------------------------------------
        XTS enc |  117.17   430.56   732.92  1134.98  2007.03  2136.23  2347.20
        XTS dec |  116.89   429.02   733.40  1132.96  2006.13  2130.50  2347.92

After:

xts-sm4-ce      |      16       64      128      256     1024     1420     4096
----------------+--------------------------------------------------------------
        XTS enc |  224.68   798.91  1248.08  1714.60  2413.73  2467.84  2612.62
        XTS dec |  229.85   791.34  1237.79  1720.00  2413.30  2473.84  2611.95
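
(These figures come from the tcrypt speed-test module; assuming the usual
tcrypt interface, something like "modprobe tcrypt mode=218" runs the test
mode cited above.)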

Signed-off-by: Tianjia Zhang <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>

uudiin authored and herbertx committed Nov 4, 2022
1 parent b1863fd commit 01f6331
Showing 3 changed files with 504 additions and 2 deletions.
arch/arm64/crypto/Kconfig: 3 additions & 1 deletion
@@ -231,7 +231,7 @@ config CRYPTO_SM4_ARM64_CE
- NEON (Advanced SIMD) extensions

config CRYPTO_SM4_ARM64_CE_BLK
tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR (ARMv8 Crypto Extensions)"
tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR/XTS (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_SM4
@@ -242,6 +242,8 @@ config CRYPTO_SM4_ARM64_CE_BLK
- CBC (Cipher Block Chaining) mode (NIST SP800-38A)
- CFB (Cipher Feedback) mode (NIST SP800-38A)
- CTR (Counter) mode (NIST SP800-38A)
- XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
and IEEE 1619)

Architecture: arm64 using:
- ARMv8 Crypto Extensions
arch/arm64/crypto/sm4-ce-core.S: 343 additions & 0 deletions
@@ -35,6 +35,7 @@
#define RTMP3 v19

#define RIV v20
#define RMASK v21


.align 3
@@ -665,6 +666,348 @@ SYM_FUNC_START(sm4_ce_ctr_enc)
SYM_FUNC_END(sm4_ce_ctr_enc)


#define tweak_next(vt, vin, RTMP) \
sshr RTMP.2d, vin.2d, #63; \
and RTMP.16b, RTMP.16b, RMASK.16b; \
add vt.2d, vin.2d, vin.2d; \
ext RTMP.16b, RTMP.16b, RTMP.16b, #8; \
eor vt.16b, vt.16b, RTMP.16b;
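
The macro above multiplies the 128-bit tweak by x in GF(2^128), reducing by
the XTS polynomial x^128 + x^7 + x^2 + x + 1. A minimal C model of the same
step (hypothetical names, not part of this patch; the tweak is held as two
64-bit halves with the low half in lane 0, as in the vector registers):

#include <stdint.h>

struct xts_tweak { uint64_t lo, hi; };

static struct xts_tweak xts_tweak_next(struct xts_tweak t)
{
	/* sshr/and: derive the fold-back constants from the top bits */
	uint64_t carry = t.lo >> 63;              /* bit 63 carries into the high half */
	uint64_t red   = (t.hi >> 63) ? 0x87 : 0; /* bit 127 wraps as the reduction */
	struct xts_tweak r;

	/* add/ext/eor: double each half and fold the constants back in */
	r.lo = (t.lo << 1) ^ red;
	r.hi = (t.hi << 1) ^ carry;
	return r;
}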

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: tweak (big endian, 128 bit)
* w4: nbytes
* x5: round key array for IV
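*     (if NULL, x3 already holds the first tweak; otherwise the value at
*     x3 is encrypted with this key to produce it)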
*/
ld1 {v8.16b}, [x3]

cbz x5, .Lxts_enc_nofirst

SM4_PREPARE(x5)

/* Generate first tweak */
SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
SM4_PREPARE(x0)

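/* Split nbytes: w5/x5 = trailing partial length (nbytes & 15), w4 = number
 * of blocks for the main loops. If there is a partial tail, one full block
 * is held back for ciphertext stealing. */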
ands w5, w4, #15
lsr w4, w4, #4
sub w6, w4, #1
csel w4, w4, w6, eq
uxtw x5, w5

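/* Build RMASK.2d = { 0x1, 0x87 }: the inter-half carry bit and the
 * GF(2^128) reduction constant consumed by tweak_next. */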
movi RMASK.2s, #0x1
movi RTMP0.2s, #0x87
uzp1 RMASK.4s, RMASK.4s, RTMP0.4s

cbz w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lxts_enc_4x

tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
tweak_next(v12, v11, RTMP3)
tweak_next(v13, v12, RTMP0)
tweak_next(v14, v13, RTMP1)
tweak_next(v15, v14, RTMP2)

ld1 {v0.16b-v3.16b}, [x2], #64
ld1 {v4.16b-v7.16b}, [x2], #64
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b

SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
st1 {v0.16b-v3.16b}, [x1], #64
st1 {v4.16b-v7.16b}, [x1], #64

tweak_next(v8, v15, RTMP3)

cbz w4, .Lxts_enc_cts
b .Lxts_enc_loop_8x

.Lxts_enc_4x:
add w4, w4, #8
cmp w4, #4
blt .Lxts_enc_loop_1x

sub w4, w4, #4

tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)

ld1 {v0.16b-v3.16b}, [x2], #64
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b

SM4_CRYPT_BLK4(v0, v1, v2, v3)

eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
st1 {v0.16b-v3.16b}, [x1], #64

tweak_next(v8, v11, RTMP3)

cbz w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
sub w4, w4, #1

ld1 {v0.16b}, [x2], #16
eor v0.16b, v0.16b, v8.16b

SM4_CRYPT_BLK(v0)

eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x1], #16

tweak_next(v8, v8, RTMP0)

cbnz w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
cbz x5, .Lxts_enc_end

/* cipher text stealing */
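/* Encrypt the last full block P(n-1) with the current tweak (v8); the
 * first x5 bytes of the result E(n-1) become the final partial block Cn,
 * and Pn padded with the rest of E(n-1) is encrypted with the next tweak
 * (v9) to produce C(n-1). */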

tweak_next(v9, v8, RTMP0)
ld1 {v0.16b}, [x2]
eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b

/* load permute table */
adr_l x6, .Lcts_permute_table
add x7, x6, #32
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]

/* overlapping loads */
add x2, x2, x5
ld1 {v1.16b}, [x2]

/* create Cn from En-1 */
tbl v2.16b, {v0.16b}, v3.16b
/* padding Pn with En-1 at the end */
tbx v0.16b, {v1.16b}, v4.16b

eor v0.16b, v0.16b, v9.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v9.16b


/* overlapping stores */
add x5, x1, x5
st1 {v2.16b}, [x5]
st1 {v0.16b}, [x1]

b .Lxts_enc_ret

.Lxts_enc_end:
/* store new tweak */
st1 {v8.16b}, [x3]

.Lxts_enc_ret:
ret
SYM_FUNC_END(sm4_ce_xts_enc)
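
For reference, a C sketch of the ciphertext-stealing tail above (hypothetical
names and a hypothetical one-block sm4_crypt_block() helper, not part of this
patch; tail is the trailing partial length, 0 < tail < 16, t0 is the current
tweak v8 and t1 the next tweak v9):

#include <stdint.h>
#include <string.h>

/* hypothetical primitive: encrypt one 16-byte block in place */
void sm4_crypt_block(const void *rkey, uint8_t blk[16]);

static void xts_xor(uint8_t b[16], const uint8_t t[16])
{
	for (int i = 0; i < 16; i++)
		b[i] ^= t[i];
}

/* src: P(n-1) (16 bytes) then Pn (tail bytes); dst gets C(n-1) then Cn */
static void xts_enc_cts(const void *rkey, uint8_t *dst, const uint8_t *src,
			size_t tail, const uint8_t t0[16], const uint8_t t1[16])
{
	uint8_t b[16];

	/* E(n-1) = enc(P(n-1) ^ t0) ^ t0 */
	memcpy(b, src, 16);
	xts_xor(b, t0);
	sm4_crypt_block(rkey, b);
	xts_xor(b, t0);

	/* Cn is the first `tail` bytes of E(n-1) (the tbl above) */
	memcpy(dst + 16, b, tail);

	/* overwrite the head of E(n-1) with Pn (the tbx above), then
	 * encrypt with the next tweak to get C(n-1) */
	memcpy(b, src + 16, tail);
	xts_xor(b, t1);
	sm4_crypt_block(rkey, b);
	xts_xor(b, t1);
	memcpy(dst, b, 16);
}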

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
/* input:
* x0: round key array, CTX
* x1: dst
* x2: src
* x3: tweak (big endian, 128 bit)
* w4: nbytes
* x5: round key array for IV
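*     (if NULL, x3 already holds the first tweak; otherwise the value at
*     x3 is encrypted with this key to produce it)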
*/
ld1 {v8.16b}, [x3]

cbz x5, .Lxts_dec_nofirst

SM4_PREPARE(x5)

/* Generate first tweak */
SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
SM4_PREPARE(x0)

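/* Split nbytes: w5/x5 = trailing partial length (nbytes & 15), w4 = number
 * of blocks for the main loops. If there is a partial tail, one full block
 * is held back for ciphertext stealing. */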
ands w5, w4, #15
lsr w4, w4, #4
sub w6, w4, #1
csel w4, w4, w6, eq
uxtw x5, w5

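/* Build RMASK.2d = { 0x1, 0x87 }: the inter-half carry bit and the
 * GF(2^128) reduction constant consumed by tweak_next. */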
movi RMASK.2s, #0x1
movi RTMP0.2s, #0x87
uzp1 RMASK.4s, RMASK.4s, RTMP0.4s

cbz w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lxts_dec_4x

tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)
tweak_next(v12, v11, RTMP3)
tweak_next(v13, v12, RTMP0)
tweak_next(v14, v13, RTMP1)
tweak_next(v15, v14, RTMP2)

ld1 {v0.16b-v3.16b}, [x2], #64
ld1 {v4.16b-v7.16b}, [x2], #64
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b

SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
st1 {v0.16b-v3.16b}, [x1], #64
st1 {v4.16b-v7.16b}, [x1], #64

tweak_next(v8, v15, RTMP3)

cbz w4, .Lxts_dec_cts
b .Lxts_dec_loop_8x

.Lxts_dec_4x:
add w4, w4, #8
cmp w4, #4
blt .Lxts_dec_loop_1x

sub w4, w4, #4

tweak_next( v9, v8, RTMP0)
tweak_next(v10, v9, RTMP1)
tweak_next(v11, v10, RTMP2)

ld1 {v0.16b-v3.16b}, [x2], #64
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b

SM4_CRYPT_BLK4(v0, v1, v2, v3)

eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
st1 {v0.16b-v3.16b}, [x1], #64

tweak_next(v8, v11, RTMP3)

cbz w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
sub w4, w4, #1

ld1 {v0.16b}, [x2], #16
eor v0.16b, v0.16b, v8.16b

SM4_CRYPT_BLK(v0)

eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x1], #16

tweak_next(v8, v8, RTMP0)

cbnz w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
cbz x5, .Lxts_dec_end

/* cipher text stealing */
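/* The tweak order is reversed relative to encryption: the last full
 * ciphertext block C(n-1) is decrypted with the next tweak (v9), while the
 * block reconstructed from Cn is decrypted with the current tweak (v8). */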

tweak_next(v9, v8, RTMP0)
ld1 {v0.16b}, [x2]
eor v0.16b, v0.16b, v9.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v9.16b

/* load permute table */
adr_l x6, .Lcts_permute_table
add x7, x6, #32
add x6, x6, x5
sub x7, x7, x5
ld1 {v3.16b}, [x6]
ld1 {v4.16b}, [x7]

/* overlapping loads */
add x2, x2, x5
ld1 {v1.16b}, [x2]

/* create Pn from the decrypted block Dn */
tbl v2.16b, {v0.16b}, v3.16b
/* reconstruct E(n-1): pad Cn with the tail of Dn */
tbx v0.16b, {v1.16b}, v4.16b

eor v0.16b, v0.16b, v8.16b
SM4_CRYPT_BLK(v0)
eor v0.16b, v0.16b, v8.16b


/* overlapping stores */
add x5, x1, x5
st1 {v2.16b}, [x5]
st1 {v0.16b}, [x1]

b .Lxts_dec_ret

.Lxts_dec_end:
/* store new tweak */
st1 {v8.16b}, [x3]

.Lxts_dec_ret:
ret
SYM_FUNC_END(sm4_ce_xts_dec)


.section ".rodata", "a"
.align 4
.Lbswap128_mask: