forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
crypto: chacha20 - Add a SSSE3 SIMD variant for x86_64
Implements an x86_64 assembler driver for the ChaCha20 stream cipher. This single block variant works on a single state matrix using SSE instructions. It requires SSSE3 due to the use of pshufb for efficient 8/16-bit rotate operations. For large messages, throughput increases by ~65% compared to chacha20-generic: testing speed of chacha20 (chacha20-generic) encryption test 0 (256 bit key, 16 byte blocks): 45089207 operations in 10 seconds (721427312 bytes) test 1 (256 bit key, 64 byte blocks): 43839521 operations in 10 seconds (2805729344 bytes) test 2 (256 bit key, 256 byte blocks): 12702056 operations in 10 seconds (3251726336 bytes) test 3 (256 bit key, 1024 byte blocks): 3371173 operations in 10 seconds (3452081152 bytes) test 4 (256 bit key, 8192 byte blocks): 422468 operations in 10 seconds (3460857856 bytes) testing speed of chacha20 (chacha20-simd) encryption test 0 (256 bit key, 16 byte blocks): 43141886 operations in 10 seconds (690270176 bytes) test 1 (256 bit key, 64 byte blocks): 46845874 operations in 10 seconds (2998135936 bytes) test 2 (256 bit key, 256 byte blocks): 18458512 operations in 10 seconds (4725379072 bytes) test 3 (256 bit key, 1024 byte blocks): 5360533 operations in 10 seconds (5489185792 bytes) test 4 (256 bit key, 8192 byte blocks): 692846 operations in 10 seconds (5675794432 bytes) Benchmark results from a Core i5-4670T. Signed-off-by: Martin Willi <[email protected]> Signed-off-by: Herbert Xu <[email protected]>
- Loading branch information
1 parent
31d7247
commit c9320b6
Showing
4 changed files
with
282 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
/* | ||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions | ||
* | ||
* Copyright (C) 2015 Martin Willi | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation; either version 2 of the License, or | ||
* (at your option) any later version. | ||
*/ | ||
|
||
#include <linux/linkage.h> | ||
|
||
.data | ||
.align 16 | ||
|
||
# Byte-shuffle control masks for pshufb. Both are fetched with movdqa | ||
# below, hence the 16-byte alignment of this section. | ||
# ROT8:  each result byte n of a 32-bit word takes source byte n-1 | ||
#        (mod 4), i.e. every word is rotated left by 8 bits. | ||
# ROT16: the two 16-bit halves of every word are swapped, i.e. every | ||
#        word is rotated left by 16 bits. | ||
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 | ||
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 | ||
|
||
.text | ||
|
||
ENTRY(chacha20_block_xor_ssse3) | ||
# %rdi: Input state matrix, s | ||
#       (64 bytes, only read; loaded with movdqa, so the address | ||
#       has to be 16-byte aligned) | ||
# %rsi: 1 data block output, o | ||
#       (64 bytes, stored with movdqu, no alignment needed) | ||
# %rdx: 1 data block input, i | ||
#       (64 bytes, loaded with movdqu, no alignment needed) | ||
# Clobbers %xmm0..%xmm11, %ecx and the flags. Each 16-byte chunk of | ||
# the input is loaded before the same chunk of the output is stored, | ||
# so %rsi and %rdx may point to the same buffer. | ||
|
||
# This function encrypts one ChaCha20 block by loading the state matrix | ||
# in four SSE registers. It performs matrix operation on four words in | ||
# parallel, but requires shuffling to rearrange the words after each | ||
# round. 8/16-bit word rotation is done with the slightly better | ||
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses | ||
# traditional shift+OR. | ||
|
||
# x0..3 = s0..3 | ||
movdqa 0x00(%rdi),%xmm0 | ||
movdqa 0x10(%rdi),%xmm1 | ||
movdqa 0x20(%rdi),%xmm2 | ||
movdqa 0x30(%rdi),%xmm3 | ||
# x8..11 keep an untouched copy of s0..3 for the final addition | ||
movdqa %xmm0,%xmm8 | ||
movdqa %xmm1,%xmm9 | ||
movdqa %xmm2,%xmm10 | ||
movdqa %xmm3,%xmm11 | ||
|
||
# x4 = rotate-by-8 mask, x5 = rotate-by-16 mask (used by pshufb) | ||
movdqa ROT8(%rip),%xmm4 | ||
movdqa ROT16(%rip),%xmm5 | ||
|
||
# 10 loop iterations, each one performing two rounds | ||
mov $10,%ecx | ||
|
||
.Ldoubleround: | ||
|
||
# First round of the iteration: every register holds one row of the | ||
# matrix, so the lane-wise operations work on the four columns at once. | ||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16) | ||
paddd %xmm1,%xmm0 | ||
pxor %xmm0,%xmm3 | ||
pshufb %xmm5,%xmm3 | ||
|
||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12) | ||
paddd %xmm3,%xmm2 | ||
pxor %xmm2,%xmm1 | ||
movdqa %xmm1,%xmm6 | ||
pslld $12,%xmm6 | ||
psrld $20,%xmm1 | ||
por %xmm6,%xmm1 | ||
|
||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8) | ||
paddd %xmm1,%xmm0 | ||
pxor %xmm0,%xmm3 | ||
pshufb %xmm4,%xmm3 | ||
|
||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7) | ||
paddd %xmm3,%xmm2 | ||
pxor %xmm2,%xmm1 | ||
movdqa %xmm1,%xmm7 | ||
pslld $7,%xmm7 | ||
psrld $25,%xmm1 | ||
por %xmm7,%xmm1 | ||
|
||
# Rotate rows 1..3 by one, two and three words so that the former | ||
# diagonals line up in the lanes for the second round. | ||
# x1 = shuffle32(x1, MASK(0, 3, 2, 1)) | ||
pshufd $0x39,%xmm1,%xmm1 | ||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2)) | ||
pshufd $0x4e,%xmm2,%xmm2 | ||
# x3 = shuffle32(x3, MASK(2, 1, 0, 3)) | ||
pshufd $0x93,%xmm3,%xmm3 | ||
|
||
# Second round of the iteration: same operations, now on the diagonals. | ||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16) | ||
paddd %xmm1,%xmm0 | ||
pxor %xmm0,%xmm3 | ||
pshufb %xmm5,%xmm3 | ||
|
||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12) | ||
paddd %xmm3,%xmm2 | ||
pxor %xmm2,%xmm1 | ||
movdqa %xmm1,%xmm6 | ||
pslld $12,%xmm6 | ||
psrld $20,%xmm1 | ||
por %xmm6,%xmm1 | ||
|
||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8) | ||
paddd %xmm1,%xmm0 | ||
pxor %xmm0,%xmm3 | ||
pshufb %xmm4,%xmm3 | ||
|
||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7) | ||
paddd %xmm3,%xmm2 | ||
pxor %xmm2,%xmm1 | ||
movdqa %xmm1,%xmm7 | ||
pslld $7,%xmm7 | ||
psrld $25,%xmm1 | ||
por %xmm7,%xmm1 | ||
|
||
# Undo the word rotation so the rows are back in their original layout. | ||
# x1 = shuffle32(x1, MASK(2, 1, 0, 3)) | ||
pshufd $0x93,%xmm1,%xmm1 | ||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2)) | ||
pshufd $0x4e,%xmm2,%xmm2 | ||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1)) | ||
pshufd $0x39,%xmm3,%xmm3 | ||
|
||
dec %ecx | ||
jnz .Ldoubleround | ||
|
||
# The shuffle masks are no longer needed, so x4..7 are reused below as | ||
# temporaries holding the four 16-byte chunks of the input block. | ||
# o0 = i0 ^ (x0 + s0) | ||
movdqu 0x00(%rdx),%xmm4 | ||
paddd %xmm8,%xmm0 | ||
pxor %xmm4,%xmm0 | ||
movdqu %xmm0,0x00(%rsi) | ||
# o1 = i1 ^ (x1 + s1) | ||
movdqu 0x10(%rdx),%xmm5 | ||
paddd %xmm9,%xmm1 | ||
pxor %xmm5,%xmm1 | ||
movdqu %xmm1,0x10(%rsi) | ||
# o2 = i2 ^ (x2 + s2) | ||
movdqu 0x20(%rdx),%xmm6 | ||
paddd %xmm10,%xmm2 | ||
pxor %xmm6,%xmm2 | ||
movdqu %xmm2,0x20(%rsi) | ||
# o3 = i3 ^ (x3 + s3) | ||
movdqu 0x30(%rdx),%xmm7 | ||
paddd %xmm11,%xmm3 | ||
pxor %xmm7,%xmm3 | ||
movdqu %xmm3,0x30(%rsi) | ||
|
||
ret | ||
ENDPROC(chacha20_block_xor_ssse3) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
/* | ||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code | ||
* | ||
* Copyright (C) 2015 Martin Willi | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation; either version 2 of the License, or | ||
* (at your option) any later version. | ||
*/ | ||
|
||
#include <crypto/algapi.h> | ||
#include <crypto/chacha20.h> | ||
#include <linux/crypto.h> | ||
#include <linux/kernel.h> | ||
#include <linux/module.h> | ||
#include <asm/fpu/api.h> | ||
#include <asm/simd.h> | ||
|
||
/*
 * Alignment, in bytes, of the state matrix handed to the assembly routine.
 * The routine loads the state with movdqa, which needs 16-byte alignment.
 */
#define CHACHA20_STATE_ALIGN 16 | ||
|
||
/*
 * Assembly implementation: XORs one 64-byte block at src with the keystream
 * block derived from state and stores the result at dst. The state itself
 * is only read; advancing the block counter is left to the caller.
 */
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); | ||
|
||
/*
 * XOR @bytes bytes from @src with the ChaCha20 keystream and write the
 * result to @dst, one block at a time, using the SSSE3 block routine.
 *
 * @state: 16-word state matrix; state[12] (the block counter in the RFC 7539
 *         layout) is incremented after every full block that is processed.
 * @dst:   output buffer, at least @bytes long.
 * @src:   input buffer, at least @bytes long.
 * @bytes: number of bytes to process; does not have to be a multiple of the
 *         block size.
 *
 * A trailing partial block is bounced through an on-stack block buffer
 * because the block routine always reads and writes a whole block. The
 * counter is not advanced for that trailing block.
 */
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
			    unsigned int bytes)
{
	u8 tail[CHACHA20_BLOCK_SIZE];
	unsigned int remaining;

	/* Full blocks are processed directly between src and dst. */
	for (remaining = bytes; remaining >= CHACHA20_BLOCK_SIZE;
	     remaining -= CHACHA20_BLOCK_SIZE) {
		chacha20_block_xor_ssse3(state, dst, src);
		state[12]++;
		src += CHACHA20_BLOCK_SIZE;
		dst += CHACHA20_BLOCK_SIZE;
	}

	if (!remaining)
		return;

	/* Partial last block: only the first `remaining` bytes are used. */
	memcpy(tail, src, remaining);
	chacha20_block_xor_ssse3(state, tail, tail);
	memcpy(dst, tail, remaining);
}
|
||
static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
struct scatterlist *src, unsigned int nbytes) | ||
{ | ||
u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1]; | ||
struct blkcipher_walk walk; | ||
int err; | ||
|
||
if (!may_use_simd()) | ||
return crypto_chacha20_crypt(desc, dst, src, nbytes); | ||
|
||
state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); | ||
|
||
blkcipher_walk_init(&walk, dst, src, nbytes); | ||
err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); | ||
|
||
crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); | ||
|
||
kernel_fpu_begin(); | ||
|
||
while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { | ||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, | ||
rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); | ||
err = blkcipher_walk_done(desc, &walk, | ||
walk.nbytes % CHACHA20_BLOCK_SIZE); | ||
} | ||
|
||
if (walk.nbytes) { | ||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, | ||
walk.nbytes); | ||
err = blkcipher_walk_done(desc, &walk, 0); | ||
} | ||
|
||
kernel_fpu_end(); | ||
|
||
return err; | ||
} | ||
|
||
static struct crypto_alg alg = { | ||
.cra_name = "chacha20", | ||
.cra_driver_name = "chacha20-simd", | ||
.cra_priority = 300, | ||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
.cra_blocksize = 1, | ||
.cra_type = &crypto_blkcipher_type, | ||
.cra_ctxsize = sizeof(struct chacha20_ctx), | ||
.cra_alignmask = sizeof(u32) - 1, | ||
.cra_module = THIS_MODULE, | ||
.cra_u = { | ||
.blkcipher = { | ||
.min_keysize = CHACHA20_KEY_SIZE, | ||
.max_keysize = CHACHA20_KEY_SIZE, | ||
.ivsize = CHACHA20_IV_SIZE, | ||
.geniv = "seqiv", | ||
.setkey = crypto_chacha20_setkey, | ||
.encrypt = chacha20_simd, | ||
.decrypt = chacha20_simd, | ||
}, | ||
}, | ||
}; | ||
|
||
/*
 * Module init: register the SIMD driver, but only on CPUs that report SSSE3
 * support. Returns -ENODEV without SSSE3, otherwise the result of
 * crypto_register_alg().
 */
static int __init chacha20_simd_mod_init(void)
{
	return cpu_has_ssse3 ? crypto_register_alg(&alg) : -ENODEV;
}
|
||
/* Module exit: remove the algorithm registered by chacha20_simd_mod_init(). */
static void __exit chacha20_simd_mod_fini(void)
{
	crypto_unregister_alg(&alg);
}
|
||
/* Hook the init/exit functions into module load and unload. */
module_init(chacha20_simd_mod_init); | ||
module_exit(chacha20_simd_mod_fini); | ||
|
||
/*
 * Module metadata. The two crypto aliases match the algorithm name and the
 * driver name used in the algorithm descriptor above.
 */
MODULE_LICENSE("GPL"); | ||
MODULE_AUTHOR("Martin Willi <[email protected]>"); | ||
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); | ||
MODULE_ALIAS_CRYPTO("chacha20"); | ||
MODULE_ALIAS_CRYPTO("chacha20-simd"); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters