From 50452b2e60b0e967b03a32462c29750a99de5ffe Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 5 Oct 2011 17:03:44 +0000 Subject: [PATCH] e_padlock: add CTR mode. --- engines/asm/e_padlock-x86.pl | 21 ++++++----- engines/asm/e_padlock-x86_64.pl | 67 +++++++++++++++++++++++++++++++-- engines/e_padlock.c | 48 +++++++++++++++++++++-- 3 files changed, 120 insertions(+), 16 deletions(-) diff --git a/engines/asm/e_padlock-x86.pl b/engines/asm/e_padlock-x86.pl index df8f56b5214d7..61e91d889f82b 100644 --- a/engines/asm/e_padlock-x86.pl +++ b/engines/asm/e_padlock-x86.pl @@ -183,7 +183,7 @@ sub generate_mode { &set_label("${mode}_pic_point"); &lea ($ctx,&DWP(16,$ctx)); # control word &xor ("eax","eax"); - if ($mode eq "ctr16") { + if ($mode eq "ctr32") { &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter } else { &xor ("ebx","ebx"); @@ -216,7 +216,7 @@ sub generate_mode { &mov (&DWP(8,"ebp"),$len); &mov ($len,$chunk); &mov (&DWP(12,"ebp"),$chunk); # chunk - if ($mode eq "ctr16") { + if ($mode eq "ctr32") { &mov ("ecx",&DWP(-4,$ctx)); &xor ($out,$out); &mov ("eax",&DWP(-8,$ctx)); # borrow $len @@ -257,7 +257,7 @@ sub generate_mode { } &mov ($out,&DWP(0,"ebp")); # restore parameters &mov ($chunk,&DWP(12,"ebp")); - if ($mode eq "ctr16") { + if ($mode eq "ctr32") { &mov ($inp,&DWP(4,"ebp")); &xor ($len,$len); &set_label("${mode}_xor"); @@ -284,7 +284,7 @@ sub generate_mode { &sub ($len,$chunk); &mov ($chunk,$PADLOCK_CHUNK); &jnz (&label("${mode}_loop")); - if ($mode ne "ctr16") { + if ($mode ne "ctr32") { &test ($out,0x0f); # out_misaligned &jz (&label("${mode}_done")); } @@ -296,7 +296,7 @@ sub generate_mode { &data_byte(0xf3,0xab); # rep stosl &set_label("${mode}_done"); &lea ("esp",&DWP(24,"ebp")); - if ($mode ne "ctr16") { + if ($mode ne "ctr32") { &jmp (&label("${mode}_exit")); &set_label("${mode}_aligned",16); @@ -311,7 +311,7 @@ sub generate_mode { &set_label("${mode}_exit"); } &mov ("eax",1); &lea ("esp",&DWP(4,"esp")); # popf - &emms () if ($mode eq 
"ctr16"); + &emms () if ($mode eq "ctr32"); &set_label("${mode}_abort"); &function_end("padlock_${mode}_encrypt"); } @@ -320,10 +320,11 @@ sub generate_mode { &generate_mode("cbc",0xd0); &generate_mode("cfb",0xe0); &generate_mode("ofb",0xe8); -&generate_mode("ctr16",0xc8); # yes, it implements own ctr with ecb opcode, - # because hardware ctr was introduced later - # and even has errata on certain CPU stepping. - # own implementation *always* works... +&generate_mode("ctr32",0xc8); # yes, it implements own CTR with ECB opcode, + # because hardware CTR was introduced later + # and even has errata on certain C7 stepping. + # own implementation *always* works, though + # ~15% slower than dedicated hardware... &function_begin_B("padlock_xstore"); &push ("edi"); diff --git a/engines/asm/e_padlock-x86_64.pl b/engines/asm/e_padlock-x86_64.pl index d5cc5fe00e785..13c371be67ed5 100644 --- a/engines/asm/e_padlock-x86_64.pl +++ b/engines/asm/e_padlock-x86_64.pl @@ -9,7 +9,8 @@ # September 2011 # -# Assembler helpers for Padlock engine. +# Assembler helpers for Padlock engine. See also e_padlock-x86.pl for +# details.
$flavour = shift; $output = shift; @@ -26,7 +27,7 @@ $code=".text\n"; -$PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 +$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 $ctx="%rdx"; $out="%rdi"; @@ -234,9 +235,23 @@ sub generate_mode { neg %rax and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK lea (%rax,%rbp),%rsp +___ +$code.=<<___ if ($mode eq "ctr32"); + mov -4($ctx),%eax # pull 32-bit counter + bswap %eax + neg %eax + and \$`$PADLOCK_CHUNK/16-1`,%eax + jz .L${mode}_loop + shl \$4,%eax + cmp %rax,$len + cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK +___ +$code.=<<___; jmp .L${mode}_loop .align 16 .L${mode}_loop: + cmp $len,$chunk # ctr32 artefact + cmova $len,$chunk # ctr32 artefact mov $out,%r8 # save parameters mov $inp,%r9 mov $len,%r10 @@ -261,6 +276,16 @@ sub generate_mode { movdqa (%rax),%xmm0 movdqa %xmm0,-16($ctx) # copy [or refresh] iv ___ +$code.=<<___ if ($mode eq "ctr32"); + mov -4($ctx),%eax # pull 32-bit counter + test \$0xffff0000,%eax + jnz .L${mode}_no_corr + bswap %eax + add \$0x10000,%eax + bswap %eax + mov %eax,-4($ctx) +.L${mode}_no_corr: +___ $code.=<<___; mov %r8,$out # restore paramters mov %r11,$chunk @@ -295,6 +320,29 @@ sub generate_mode { .align 16 .L${mode}_aligned: +___ +$code.=<<___ if ($mode eq "ctr32"); + mov -4($ctx),%eax # pull 32-bit counter + mov \$`16*0x10000`,$chunk + bswap %eax + cmp $len,$chunk + cmova $len,$chunk + neg %eax + and \$0xffff,%eax + jz .L${mode}_aligned_loop + shl \$4,%eax + cmp %rax,$len + cmova %rax,$chunk # don't let counter cross 2^16 + jmp .L${mode}_aligned_loop +.align 16 +.L${mode}_aligned_loop: + cmp $len,$chunk + cmova $len,$chunk + mov $len,%r10 # save parameters + mov $chunk,$len + mov $chunk,%r11 +___ +$code.=<<___; lea -16($ctx),%rax # ivp lea 16($ctx),%rbx # key shr \$4,$len # len/=AES_BLOCK_SIZE @@ -304,6 +352,19 @@ sub generate_mode { movdqa (%rax),%xmm0 movdqa %xmm0,-16($ctx) # copy [or refresh] iv ___ +$code.=<<___ if ($mode eq "ctr32"); + mov 
-4($ctx),%eax # pull 32-bit counter + bswap %eax + add \$0x10000,%eax + bswap %eax + mov %eax,-4($ctx) + + mov %r11,$chunk # restore parameters + mov %r10,$len + sub $chunk,$len + mov \$`16*0x10000`,$chunk + jnz .L${mode}_aligned_loop +___ $code.=<<___; .L${mode}_exit: mov \$1,%eax @@ -320,7 +381,7 @@ sub generate_mode { &generate_mode("cbc",0xd0); &generate_mode("cfb",0xe0); &generate_mode("ofb",0xe8); -&generate_mode("ctr16",0xd8); +&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR... $code.=<<___; .asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>" diff --git a/engines/e_padlock.c b/engines/e_padlock.c index 7f78135165f4e..a19f173536038 100644 --- a/engines/e_padlock.c +++ b/engines/e_padlock.c @@ -76,6 +76,7 @@ #endif #include <openssl/rand.h> #include <openssl/err.h> +#include <openssl/modes.h> #ifndef OPENSSL_NO_HW #ifndef OPENSSL_NO_HW_PADLOCK @@ -337,16 +338,19 @@ static int padlock_cipher_nids[] = { NID_aes_128_cbc, NID_aes_128_cfb, NID_aes_128_ofb, + NID_aes_128_ctr, NID_aes_192_ecb, NID_aes_192_cbc, NID_aes_192_cfb, NID_aes_192_ofb, + NID_aes_192_ctr, NID_aes_256_ecb, NID_aes_256_cbc, NID_aes_256_cfb, NID_aes_256_ofb, + NID_aes_256_ctr }; static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/ sizeof(padlock_cipher_nids[0])); @@ -505,10 +509,35 @@ padlock_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, return 1; } +static void padlock_ctr32_encrypt_glue(const unsigned char *in, + unsigned char *out, size_t blocks, + struct padlock_cipher_data *ctx, + const unsigned char *ivec) +{ + memcpy(ctx->iv,ivec,AES_BLOCK_SIZE); + padlock_ctr32_encrypt(out,in,ctx,AES_BLOCK_SIZE*blocks); +} + +static int +padlock_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, + const unsigned char *in_arg, size_t nbytes) +{ + struct padlock_cipher_data *cdata = ALIGNED_CIPHER_DATA(ctx); + unsigned int num = ctx->num; + + CRYPTO_ctr128_encrypt_ctr32(in_arg,out_arg,nbytes, + cdata,ctx->iv,ctx->buf,&num, + (ctr128_f)padlock_ctr32_encrypt_glue); + + ctx->num = (size_t)num; + return 1; +} +
#define EVP_CIPHER_block_size_ECB AES_BLOCK_SIZE #define EVP_CIPHER_block_size_CBC AES_BLOCK_SIZE #define EVP_CIPHER_block_size_OFB 1 #define EVP_CIPHER_block_size_CFB 1 +#define EVP_CIPHER_block_size_CTR 1 /* Declaring so many ciphers by hand would be a pain. Instead introduce a bit of preprocessor magic :-) */ @@ -533,16 +562,19 @@ DECLARE_AES_EVP(128,ecb,ECB); DECLARE_AES_EVP(128,cbc,CBC); DECLARE_AES_EVP(128,cfb,CFB); DECLARE_AES_EVP(128,ofb,OFB); +DECLARE_AES_EVP(128,ctr,CTR); DECLARE_AES_EVP(192,ecb,ECB); DECLARE_AES_EVP(192,cbc,CBC); DECLARE_AES_EVP(192,cfb,CFB); DECLARE_AES_EVP(192,ofb,OFB); +DECLARE_AES_EVP(192,ctr,CTR); DECLARE_AES_EVP(256,ecb,ECB); DECLARE_AES_EVP(256,cbc,CBC); DECLARE_AES_EVP(256,cfb,CFB); DECLARE_AES_EVP(256,ofb,OFB); +DECLARE_AES_EVP(256,ctr,CTR); static int padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid) @@ -567,6 +599,9 @@ padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid case NID_aes_128_ofb: *cipher = &padlock_aes_128_ofb; break; + case NID_aes_128_ctr: + *cipher = &padlock_aes_128_ctr; + break; case NID_aes_192_ecb: *cipher = &padlock_aes_192_ecb; @@ -580,6 +615,9 @@ padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid case NID_aes_192_ofb: *cipher = &padlock_aes_192_ofb; break; + case NID_aes_192_ctr: + *cipher = &padlock_aes_192_ctr; + break; case NID_aes_256_ecb: *cipher = &padlock_aes_256_ecb; @@ -593,6 +631,9 @@ padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid case NID_aes_256_ofb: *cipher = &padlock_aes_256_ofb; break; + case NID_aes_256_ctr: + *cipher = &padlock_aes_256_ctr; + break; default: /* Sorry, we don't support this NID */ @@ -610,6 +651,7 @@ padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key, { struct padlock_cipher_data *cdata; int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8; + unsigned long mode = EVP_CIPHER_CTX_mode(ctx); if (key==NULL) return 0; /* ERROR */ @@ -617,7 
+659,7 @@ padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key, memset(cdata, 0, sizeof(struct padlock_cipher_data)); /* Prepare Control word. */ - if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE) + if (mode == EVP_CIPH_OFB_MODE || mode == EVP_CIPH_CTR_MODE) cdata->cword.b.encdec = 0; else cdata->cword.b.encdec = (ctx->encrypt == 0); @@ -640,8 +682,8 @@ padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key, and is listed as hardware errata. They most likely will fix it at some point and then a check for stepping would be due here. */ - if ((EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_ECB_MODE || - EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CBC_MODE) + if ((mode == EVP_CIPH_ECB_MODE || + mode == EVP_CIPH_CBC_MODE) && !enc) AES_set_decrypt_key(key, key_len, &cdata->ks); else