forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaesni-intel_asm.S
2842 lines (2615 loc) · 76.6 KB
/
aesni-intel_asm.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Implement AES algorithm in Intel AES-NI instructions.
*
* The white paper of AES-NI instructions can be downloaded from:
* http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
*
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <[email protected]>
* Vinodh Gopal <[email protected]>
* Kahraman Akdemir
*
* Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
* interface for 64-bit kernels.
* Authors: Erdinc Ozturk ([email protected])
* Aidan O'Mahony ([email protected])
* Adrian Hoban <[email protected]>
* James Guilford ([email protected])
* Gabriele Paoloni <[email protected]>
* Tadeusz Struk ([email protected])
* Wajdi Feghali ([email protected])
* Copyright (c) 2010, Intel Corporation.
*
* Ported x86_64 version to x86:
* Author: Mathias Krause <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
/*
* The following macros are used to move an (un)aligned 16 byte value to/from
* an XMM register. This can done for either FP or integer values, for FP use
* movaps (move aligned packed single) or integer use movdqa (move double quad
* aligned). It doesn't make a performance difference which instruction is used
* since Nehalem (original Core i7) was released. However, the movaps is a byte
* shorter, so that is the one we'll use for now. (same for unaligned).
*/
#define MOVADQ movaps
#define MOVUDQ movups
#ifdef __x86_64__
# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
.octa 0x00000000000000010000000000000087
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY: .octa 0xC2000000000000000000000000000001
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
.section .rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1: .octa 0x0000000000000000ffffffffffffffff
.section .rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2: .octa 0xffffffffffffffff0000000000000000
.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE: .octa 0x00000000000000000000000000000001
.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section .rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec: .octa 0x1
.section .rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc: .octa 0x2
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section .rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
.text
#define STACK_OFFSET 8*3
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define HashKey 16*6 // store HashKey <<1 mod poly here
#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
// bits of HashKey <<1 mod poly here
//(for Karatsuba purposes)
#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
// bits of HashKey^2 <<1 mod poly here
// (for Karatsuba purposes)
#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
// bits of HashKey^3 <<1 mod poly here
// (for Karatsuba purposes)
#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
// bits of HashKey^4 <<1 mod poly here
// (for Karatsuba purposes)
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif
#define STATE1 %xmm0
#define STATE2 %xmm4
#define STATE3 %xmm5
#define STATE4 %xmm6
#define STATE STATE1
#define IN1 %xmm1
#define IN2 %xmm7
#define IN3 %xmm8
#define IN4 %xmm9
#define IN IN1
#define KEY %xmm2
#define IV %xmm3
#define BSWAP_MASK %xmm10
#define CTR %xmm11
#define INC %xmm12
#define GF128MUL_MASK %xmm10
#ifdef __x86_64__
#define AREG %rax
#define KEYP %rdi
#define OUTP %rsi
#define UKEYP OUTP
#define INP %rdx
#define LEN %rcx
#define IVP %r8
#define KLEN %r9d
#define T1 %r10
#define TKEYP T1
#define T2 %r11
#define TCTR_LOW T2
#else
#define AREG %eax
#define KEYP %edi
#define OUTP AREG
#define UKEYP OUTP
#define INP %edx
#define LEN %esi
#define IVP %ebp
#define KLEN %ebx
#define T1 %ecx
#define TKEYP T1
#endif
.macro FUNC_SAVE
push %r12
push %r13
push %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm
.macro FUNC_RESTORE
pop %r14
pop %r13
pop %r12
.endm
# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data. Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
mov \SUBKEY, %r12
movdqu (%r12), \TMP3
movdqa SHUF_MASK(%rip), \TMP2
PSHUFB_XMM \TMP2, \TMP3
# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
movdqa \TMP3, \TMP2
psllq $1, \TMP3
psrlq $63, \TMP2
movdqa \TMP2, \TMP1
pslldq $8, \TMP2
psrldq $8, \TMP1
por \TMP2, \TMP3
# reduce HashKey<<1
pshufd $0x24, \TMP1, \TMP2
pcmpeqd TWOONE(%rip), \TMP2
pand POLY(%rip), \TMP2
pxor \TMP2, \TMP3
movdqu \TMP3, HashKey(%arg2)
movdqa \TMP3, \TMP5
pshufd $78, \TMP3, \TMP1
pxor \TMP3, \TMP1
movdqu \TMP1, HashKey_k(%arg2)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
movdqu \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqu \TMP1, HashKey_2_k(%arg2)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
movdqu \TMP5, HashKey_3(%arg2)
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqu \TMP1, HashKey_3_k(%arg2)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
movdqu \TMP5, HashKey_4(%arg2)
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqu \TMP1, HashKey_4_k(%arg2)
.endm
# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
mov \AADLEN, %r11
mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
xor %r11d, %r11d
mov %r11, InLen(%arg2) # ctx_data.in_length = 0
mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
mov \Iv, %rax
movdqu (%rax), %xmm0
movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
movdqa SHUF_MASK(%rip), %xmm2
PSHUFB_XMM %xmm2, %xmm0
movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
movdqu HashKey(%arg2), %xmm13
CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
%xmm4, %xmm5, %xmm6
.endm
# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
movdqu AadHash(%arg2), %xmm8
movdqu HashKey(%arg2), %xmm13
add %arg5, InLen(%arg2)
xor %r11d, %r11d # initialise the data pointer offset as zero
PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
sub %r11, %arg5 # sub partial block data used
mov %arg5, %r13 # save the number of bytes
and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
mov %r13, %r12
# Encrypt/Decrypt first few blocks
and $(3<<4), %r12
jz _initial_num_blocks_is_0_\@
cmp $(2<<4), %r12
jb _initial_num_blocks_is_1_\@
je _initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
sub $48, %r13
jmp _initial_blocks_\@
_initial_num_blocks_is_2_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
sub $32, %r13
jmp _initial_blocks_\@
_initial_num_blocks_is_1_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
sub $16, %r13
jmp _initial_blocks_\@
_initial_num_blocks_is_0_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:
# Main loop - Encrypt/Decrypt remaining blocks
cmp $0, %r13
je _zero_cipher_left_\@
sub $64, %r13
je _four_cipher_left_\@
_crypt_by_4_\@:
GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
%xmm7, %xmm8, enc
add $64, %r11
sub $64, %r13
jne _crypt_by_4_\@
_four_cipher_left_\@:
GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
movdqu %xmm8, AadHash(%arg2)
movdqu %xmm0, CurCount(%arg2)
mov %arg5, %r13
and $15, %r13 # %r13 = arg5 (mod 16)
je _multiple_of_16_bytes_\@
mov %r13, PBlockLen(%arg2)
# Handle the last <16 Byte block separately
paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
movdqu %xmm0, CurCount(%arg2)
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
movdqu %xmm0, PBlockEncKey(%arg2)
cmp $16, %arg5
jge _large_enough_update_\@
lea (%arg4,%r11,1), %r10
mov %r13, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
jmp _data_read_\@
_large_enough_update_\@:
sub $16, %r11
add %r13, %r11
# receive the last <16 Byte block
movdqu (%arg4, %r11, 1), %xmm1
sub %r13, %r11
add $16, %r11
lea SHIFT_MASK+16(%rip), %r12
# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
# (r13 is the number of bytes in plaintext mod 16)
sub %r13, %r12
# get the appropriate shuffle mask
movdqu (%r12), %xmm2
# shift right 16-r13 bytes
PSHUFB_XMM %xmm2, %xmm1
_data_read_\@:
lea ALL_F+16(%rip), %r12
sub %r13, %r12
.ifc \operation, dec
movdqa %xmm1, %xmm2
.endif
pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
movdqu (%r12), %xmm1
# get the appropriate mask to mask out top 16-r13 bytes of xmm0
pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
pand %xmm1, %xmm2
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10 ,%xmm2
pxor %xmm2, %xmm8
.else
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10,%xmm0
pxor %xmm0, %xmm8
.endif
movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
# GHASH computation for the last <16 byte block
movdqa SHUF_MASK(%rip), %xmm10
# shuffle xmm0 back to output as ciphertext
PSHUFB_XMM %xmm10, %xmm0
.endif
# Output %r13 bytes
MOVQ_R64_XMM %xmm0, %rax
cmp $8, %r13
jle _less_than_8_bytes_left_\@
mov %rax, (%arg3 , %r11, 1)
add $8, %r11
psrldq $8, %xmm0
MOVQ_R64_XMM %xmm0, %rax
sub $8, %r13
_less_than_8_bytes_left_\@:
mov %al, (%arg3, %r11, 1)
add $1, %r11
shr $8, %rax
sub $1, %r13
jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
movdqu AadHash(%arg2), %xmm8
movdqu HashKey(%arg2), %xmm13
mov PBlockLen(%arg2), %r12
cmp $0, %r12
je _partial_done\@
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
_partial_done\@:
mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes)
shl $3, %r12 # convert into number of bits
movd %r12d, %xmm15 # len(A) in %xmm15
mov InLen(%arg2), %r12
shl $3, %r12 # len(C) in bits (*128)
MOVQ_R64_XMM %r12, %xmm1
pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
pxor %xmm15, %xmm8
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# final GHASH computation
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm8
movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
pxor %xmm8, %xmm0
_return_T_\@:
mov \AUTHTAG, %r10 # %r10 = authTag
mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
cmp $16, %r11
je _T_16_\@
cmp $8, %r11
jl _T_4_\@
_T_8_\@:
MOVQ_R64_XMM %xmm0, %rax
mov %rax, (%r10)
add $8, %r10
sub $8, %r11
psrldq $8, %xmm0
cmp $0, %r11
je _return_T_done_\@
_T_4_\@:
movd %xmm0, %eax
mov %eax, (%r10)
add $4, %r10
sub $4, %r11
psrldq $4, %xmm0
cmp $0, %r11
je _return_T_done_\@
_T_123_\@:
movd %xmm0, %eax
cmp $2, %r11
jl _T_1_\@
mov %ax, (%r10)
cmp $2, %r11
je _return_T_done_\@
add $2, %r10
sar $16, %eax
_T_1_\@:
mov %al, (%r10)
jmp _return_T_done_\@
_T_16_\@:
movdqu %xmm0, (%r10)
_return_T_done_\@:
.endm
#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
movdqa \GH, \TMP1
pshufd $78, \GH, \TMP2
pshufd $78, \HK, \TMP3
pxor \GH, \TMP2 # TMP2 = a1+a0
pxor \HK, \TMP3 # TMP3 = b1+b0
PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
pxor \GH, \TMP2
pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
movdqa \TMP2, \TMP3
pslldq $8, \TMP3 # left shift TMP3 2 DWs
psrldq $8, \TMP2 # right shift TMP2 2 DWs
pxor \TMP3, \GH
pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
# first phase of the reduction
movdqa \GH, \TMP2
movdqa \GH, \TMP3
movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
# in in order to perform
# independent shifts
pslld $31, \TMP2 # packed right shift <<31
pslld $30, \TMP3 # packed right shift <<30
pslld $25, \TMP4 # packed right shift <<25
pxor \TMP3, \TMP2 # xor the shifted versions
pxor \TMP4, \TMP2
movdqa \TMP2, \TMP5
psrldq $4, \TMP5 # right shift TMP5 1 DW
pslldq $12, \TMP2 # left shift TMP2 3 DWs
pxor \TMP2, \GH
# second phase of the reduction
movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
# in in order to perform
# independent shifts
movdqa \GH,\TMP3
movdqa \GH,\TMP4
psrld $1,\TMP2 # packed left shift >>1
psrld $2,\TMP3 # packed left shift >>2
psrld $7,\TMP4 # packed left shift >>7
pxor \TMP3,\TMP2 # xor the shifted versions
pxor \TMP4,\TMP2
pxor \TMP5, \TMP2
pxor \TMP2, \GH
pxor \TMP1, \GH # result is in TMP1
.endm
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
cmp $8, \DLEN
jl _read_lt8_\@
mov (\DPTR), %rax
MOVQ_R64_XMM %rax, \XMMDst
sub $8, \DLEN
jz _done_read_partial_block_\@
xor %eax, %eax
_read_next_byte_\@:
shl $8, %rax
mov 7(\DPTR, \DLEN, 1), %al
dec \DLEN
jnz _read_next_byte_\@
MOVQ_R64_XMM %rax, \XMM1
pslldq $8, \XMM1
por \XMM1, \XMMDst
jmp _done_read_partial_block_\@
_read_lt8_\@:
xor %eax, %eax
_read_next_byte_lt8_\@:
shl $8, %rax
mov -1(\DPTR, \DLEN, 1), %al
dec \DLEN
jnz _read_next_byte_lt8_\@
MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 TMP7
MOVADQ SHUF_MASK(%rip), %xmm14
mov \AAD, %r10 # %r10 = AAD
mov \AADLEN, %r11 # %r11 = aadLen
pxor \TMP7, \TMP7
pxor \TMP6, \TMP6
cmp $16, %r11
jl _get_AAD_rest\@
_get_AAD_blocks\@:
movdqu (%r10), \TMP7
PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
pxor \TMP7, \TMP6
GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
add $16, %r10
sub $16, %r11
cmp $16, %r11
jge _get_AAD_blocks\@
movdqu \TMP6, \TMP7
/* read the last <16B of AAD */
_get_AAD_rest\@:
cmp $0, %r11
je _get_AAD_done\@
READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
pxor \TMP6, \TMP7
GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
movdqu \TMP7, \TMP6
_get_AAD_done\@:
movdqu \TMP6, AadHash(%arg2)
.endm
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
AAD_HASH operation
mov PBlockLen(%arg2), %r13
cmp $0, %r13
je _partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
jl _fewer_than_16_bytes_\@
movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
jmp _data_read_\@
_fewer_than_16_bytes_\@:
lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
mov \PLAIN_CYPH_LEN, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
mov PBlockLen(%arg2), %r13
_data_read_\@: # Finished reading in data
movdqu PBlockEncKey(%arg2), %xmm9
movdqu HashKey(%arg2), %xmm13
lea SHIFT_MASK(%rip), %r12
# adjust the shuffle mask pointer to be able to shift r13 bytes
# r16-r13 is the number of bytes in plaintext mod 16)
add %r13, %r12
movdqu (%r12), %xmm2 # get the appropriate shuffle mask
PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
.ifc \operation, dec
movdqa %xmm1, %xmm3
pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
mov \PLAIN_CYPH_LEN, %r10
add %r13, %r10
# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
jge _no_extra_mask_1_\@
sub %r10, %r12
_no_extra_mask_1_\@:
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
pand %xmm1, %xmm3
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm3
PSHUFB_XMM %xmm2, %xmm3
pxor %xmm3, \AAD_HASH
cmp $0, %r10
jl _partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax, %eax
mov %rax, PBlockLen(%arg2)
jmp _dec_done_\@
_partial_incomplete_1_\@:
add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
movdqu \AAD_HASH, AadHash(%arg2)
.else
pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
mov \PLAIN_CYPH_LEN, %r10
add %r13, %r10
# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
jge _no_extra_mask_2_\@
sub %r10, %r12
_no_extra_mask_2_\@:
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
pand %xmm1, %xmm9
movdqa SHUF_MASK(%rip), %xmm1
PSHUFB_XMM %xmm1, %xmm9
PSHUFB_XMM %xmm2, %xmm9
pxor %xmm9, \AAD_HASH
cmp $0, %r10
jl _partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax, %eax
mov %rax, PBlockLen(%arg2)
jmp _encode_done_\@
_partial_incomplete_2_\@:
add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
movdqu \AAD_HASH, AadHash(%arg2)
movdqa SHUF_MASK(%rip), %xmm10
# shuffle xmm9 back to output as ciphertext
PSHUFB_XMM %xmm10, %xmm9
PSHUFB_XMM %xmm2, %xmm9
.endif
# output encrypted Bytes
cmp $0, %r10
jl _partial_fill_\@
mov %r13, %r12
mov $16, %r13
# Set r13 to be the number of bytes to write out
sub %r12, %r13
jmp _count_set_\@
_partial_fill_\@:
mov \PLAIN_CYPH_LEN, %r13
_count_set_\@:
movdqa %xmm9, %xmm0
MOVQ_R64_XMM %xmm0, %rax
cmp $8, %r13
jle _less_than_8_bytes_left_\@
mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $8, \DATA_OFFSET
psrldq $8, %xmm0
MOVQ_R64_XMM %xmm0, %rax
sub $8, %r13
_less_than_8_bytes_left_\@:
movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $1, \DATA_OFFSET
shr $8, %rax
sub $1, %r13
jne _less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3 are used as a pointer only, not modified
*/
.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
MOVADQ SHUF_MASK(%rip), %xmm14
movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
# start AES for num_initial_blocks blocks
movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
.if (\i == 5) || (\i == 6) || (\i == 7)
MOVADQ ONE(%RIP),\TMP1
MOVADQ 0(%arg1),\TMP2
.irpc index, \i_seq
paddd \TMP1, \XMM0 # INCR Y0
.ifc \operation, dec
movdqa \XMM0, %xmm\index
.else
MOVADQ \XMM0, %xmm\index
.endif
PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
pxor \TMP2, %xmm\index
.endr
lea 0x10(%arg1),%r10
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
add $5,%eax # 128->9, 192->11, 256->13
aes_loop_initial_\@:
MOVADQ (%r10),\TMP1
.irpc index, \i_seq
AESENC \TMP1, %xmm\index
.endr
add $16,%r10
sub $1,%eax
jnz aes_loop_initial_\@
MOVADQ (%r10), \TMP1
.irpc index, \i_seq
AESENCLAST \TMP1, %xmm\index # Last Round
.endr
.irpc index, \i_seq
movdqu (%arg4 , %r11, 1), \TMP1
pxor \TMP1, %xmm\index
movdqu %xmm\index, (%arg3 , %r11, 1)
# write back plaintext/ciphertext for num_initial_blocks
add $16, %r11
.ifc \operation, dec
movdqa \TMP1, %xmm\index
.endif
PSHUFB_XMM %xmm14, %xmm\index
# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
# apply GHASH on num_initial_blocks blocks
.if \i == 5
pxor %xmm5, %xmm6
GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
pxor %xmm6, %xmm7
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
pxor %xmm7, %xmm8
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
pxor %xmm6, %xmm7
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
pxor %xmm7, %xmm8
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
pxor %xmm7, %xmm8
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
cmp $64, %r13
jl _initial_blocks_done\@
# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
MOVADQ ONE(%RIP),\TMP1
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM1
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM2
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM3
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM4
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
MOVADQ 0(%arg1),\TMP1
pxor \TMP1, \XMM1
pxor \TMP1, \XMM2
pxor \TMP1, \XMM3
pxor \TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
movaps 0x10*\index(%arg1), \TMP1
AESENC \TMP1, \XMM1
AESENC \TMP1, \XMM2
AESENC \TMP1, \XMM3
AESENC \TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
movaps 0x10*\index(%arg1), \TMP1
AESENC \TMP1, \XMM1
AESENC \TMP1, \XMM2
AESENC \TMP1, \XMM3
AESENC \TMP1, \XMM4
.endr
lea 0xa0(%arg1),%r10
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
jz aes_loop_pre_done\@
aes_loop_pre_\@:
MOVADQ (%r10),\TMP2
.irpc index, 1234
AESENC \TMP2, %xmm\index
.endr
add $16,%r10
sub $1,%eax
jnz aes_loop_pre_\@
aes_loop_pre_done\@:
MOVADQ (%r10), \TMP2
AESENCLAST \TMP2, \XMM1
AESENCLAST \TMP2, \XMM2
AESENCLAST \TMP2, \XMM3
AESENCLAST \TMP2, \XMM4
movdqu 16*0(%arg4 , %r11 , 1), \TMP1
pxor \TMP1, \XMM1
.ifc \operation, dec
movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
movdqa \TMP1, \XMM1
.endif
movdqu 16*1(%arg4 , %r11 , 1), \TMP1
pxor \TMP1, \XMM2
.ifc \operation, dec
movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
movdqa \TMP1, \XMM2
.endif
movdqu 16*2(%arg4 , %r11 , 1), \TMP1
pxor \TMP1, \XMM3
.ifc \operation, dec
movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
movdqa \TMP1, \XMM3
.endif
movdqu 16*3(%arg4 , %r11 , 1), \TMP1
pxor \TMP1, \XMM4
.ifc \operation, dec
movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
movdqa \TMP1, \XMM4
.else
movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
.endif
add $64, %r11
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
pxor \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
_initial_blocks_done\@:
.endm
/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM1, \XMM5
movdqa \XMM2, \XMM6
movdqa \XMM3, \XMM7
movdqa \XMM4, \XMM8
movdqa SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using karatsuba
movdqa \XMM5, \TMP4
pshufd $78, \XMM5, \TMP6
pxor \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT
movdqu HashKey_4(%arg2), \TMP5