forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaesni-intel_avx-x86_64.S
2843 lines (2304 loc) · 97.9 KB
/
aesni-intel_avx-x86_64.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
########################################################################
# Copyright (c) 2013, Intel Corporation
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the
# distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
##
## Authors:
## Erdinc Ozturk <[email protected]>
## Vinodh Gopal <[email protected]>
## James Guilford <[email protected]>
## Tim Chen <[email protected]>
##
## References:
## This code was derived and highly optimized from the code described in paper:
## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010
## The details of the implementation is explained in:
## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
##
## Assumptions:
##
##
##
## iv:
## 0 1 2 3
## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
## | Salt (From the SA) |
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
## | Initialization Vector |
## | (This is the sequence number from IPSec header) |
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
## | 0x1 |
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##
##
## AAD:
## AAD padded to 128 bits with 0
## for example, assume AAD is a u32 vector
##
## if AAD is 8 bytes:
## AAD[3] = {A0, A1}#
## padded AAD in xmm register = {A1 A0 0 0}
##
## 0 1 2 3
## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
## | SPI (A1) |
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
## | 32-bit Sequence Number (A0) |
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
## | 0x0 |
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
## AAD Format with 32-bit Sequence Number
##
## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2}#
## padded AAD in xmm register = {A2 A1 A0 0}
##
## 0 1 2 3
## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
## | SPI (A2) |
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
## | 64-bit Extended Sequence Number {A1,A0} |
## | |
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
## | 0x0 |
## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
## AAD Format with 64-bit Extended Sequence Number
##
##
## aadLen:
## from the definition of the spec, aadLen can only be 8 or 12 bytes.
## The code additionally supports aadLen of length 16 bytes.
##
## TLen:
## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
##
## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one tab and two tab indentations are used. one tab is
## for GHASH part, two tabs is for AES part.
##
#include <linux/linkage.h>
#include <asm/inst.h>
# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY: .octa 0xC2000000000000000000000000000001
.section .rodata.cst16.POLY2, "aM", @progbits, 16
.align 16
POLY2: .octa 0xC20000000000000000000001C2000000
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE: .octa 0x00000000000000000000000000000001
.section .rodata.cst16.ONEf, "aM", @progbits, 16
.align 16
ONEf: .octa 0x01000000000000000000000000000000
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
.section .rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
.octa 0xffffffffffffffffffffffffffffffff
.octa 0xffffffffffffffffffffffffffffff0C
.octa 0xffffffffffffffffffffffffffff0D0C
.octa 0xffffffffffffffffffffffffff0E0D0C
.octa 0xffffffffffffffffffffffff0F0E0D0C
.octa 0xffffffffffffffffffffff0C0B0A0908
.octa 0xffffffffffffffffffff0D0C0B0A0908
.octa 0xffffffffffffffffff0E0D0C0B0A0908
.octa 0xffffffffffffffff0F0E0D0C0B0A0908
.octa 0xffffffffffffff0C0B0A090807060504
.octa 0xffffffffffff0D0C0B0A090807060504
.octa 0xffffffffff0E0D0C0B0A090807060504
.octa 0xffffffff0F0E0D0C0B0A090807060504
.octa 0xffffff0C0B0A09080706050403020100
.octa 0xffff0D0C0B0A09080706050403020100
.octa 0xff0E0D0C0B0A09080706050403020100
.octa 0x0F0E0D0C0B0A09080706050403020100
.text
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
HashKey = 16*6 # store HashKey <<1 mod poly here
HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)
#define arg10 STACK_OFFSET+8*4(%r14)
#define keysize 2*15*16(arg1)
i = 0
j = 0
out_order = 0
in_order = 1
DEC = 0
ENC = 1
.macro define_reg r n
reg_\r = %xmm\n
.endm
.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm
# need to push 4 registers into stack to maintain
STACK_OFFSET = 8*4
TMP1 = 16*0 # Temporary storage for AAD
TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 = 16*2 # Temporary storage for AES State 3
TMP4 = 16*3 # Temporary storage for AES State 4
TMP5 = 16*4 # Temporary storage for AES State 5
TMP6 = 16*5 # Temporary storage for AES State 6
TMP7 = 16*6 # Temporary storage for AES State 7
TMP8 = 16*7 # Temporary storage for AES State 8
VARIABLE_OFFSET = 16*8
################################
# Utility Macros
################################
.macro FUNC_SAVE
#the number of pushes must equal STACK_OFFSET
push %r12
push %r13
push %r14
push %r15
mov %rsp, %r14
sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp # align rsp to 64 bytes
.endm
.macro FUNC_RESTORE
mov %r14, %rsp
pop %r15
pop %r14
pop %r13
pop %r12
.endm
# Encryption of a single block
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
vpxor (arg1), \XMM0, \XMM0
i = 1
setreg
.rep \REP
vaesenc 16*i(arg1), \XMM0, \XMM0
i = (i+1)
setreg
.endr
vaesenclast 16*i(arg1), \XMM0, \XMM0
.endm
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
vmovdqu AadHash(arg2), %xmm8
vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
add arg5, InLen(arg2)
# initialize the data pointer offset as zero
xor %r11d, %r11d
PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
sub %r11, arg5
mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
and $-16, %r13 # r13 = r13 - (r13 mod 16)
mov %r13, %r12
shr $4, %r12
and $7, %r12
jz _initial_num_blocks_is_0\@
cmp $7, %r12
je _initial_num_blocks_is_7\@
cmp $6, %r12
je _initial_num_blocks_is_6\@
cmp $5, %r12
je _initial_num_blocks_is_5\@
cmp $4, %r12
je _initial_num_blocks_is_4\@
cmp $3, %r12
je _initial_num_blocks_is_3\@
cmp $2, %r12
je _initial_num_blocks_is_2\@
jmp _initial_num_blocks_is_1\@
_initial_num_blocks_is_7\@:
\INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*7, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_6\@:
\INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*6, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_5\@:
\INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*5, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_4\@:
\INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*4, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_3\@:
\INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*3, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_2\@:
\INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*2, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_1\@:
\INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*1, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_0\@:
\INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
_initial_blocks_encrypted\@:
cmp $0, %r13
je _zero_cipher_left\@
sub $128, %r13
je _eight_cipher_left\@
vmovd %xmm9, %r15d
and $255, %r15d
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
_encrypt_by_8_new\@:
cmp $(255-8), %r15d
jg _encrypt_by_8\@
add $8, %r15b
\GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
add $128, %r11
sub $128, %r13
jne _encrypt_by_8_new\@
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
jmp _eight_cipher_left\@
_encrypt_by_8\@:
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
add $8, %r15b
\GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
add $128, %r11
sub $128, %r13
jne _encrypt_by_8_new\@
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
_eight_cipher_left\@:
\GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
_zero_cipher_left\@:
vmovdqu %xmm14, AadHash(arg2)
vmovdqu %xmm9, CurCount(arg2)
# check for 0 length
mov arg5, %r13
and $15, %r13 # r13 = (arg5 mod 16)
je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
mov %r13, PBlockLen(arg2)
vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
vmovdqu %xmm9, CurCount(arg2)
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
vmovdqu %xmm9, PBlockEncKey(arg2)
cmp $16, arg5
jge _large_enough_update\@
lea (arg4,%r11,1), %r10
mov %r13, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm1
lea SHIFT_MASK+16(%rip), %r12
sub %r13, %r12 # adjust the shuffle mask pointer to be
# able to shift 16-r13 bytes (r13 is the
# number of bytes in plaintext mod 16)
jmp _final_ghash_mul\@
_large_enough_update\@:
sub $16, %r11
add %r13, %r11
# receive the last <16 Byte block
vmovdqu (arg4, %r11, 1), %xmm1
sub %r13, %r11
add $16, %r11
lea SHIFT_MASK+16(%rip), %r12
# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
# (r13 is the number of bytes in plaintext mod 16)
sub %r13, %r12
# get the appropriate shuffle mask
vmovdqu (%r12), %xmm2
# shift right 16-r13 bytes
vpshufb %xmm2, %xmm1, %xmm1
_final_ghash_mul\@:
.if \ENC_DEC == DEC
vmovdqa %xmm1, %xmm2
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
# mask out top 16-r13 bytes of xmm9
vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
vpand %xmm1, %xmm2, %xmm2
vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
vpxor %xmm2, %xmm14, %xmm14
vmovdqu %xmm14, AadHash(arg2)
.else
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
# mask out top 16-r13 bytes of xmm9
vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
vpxor %xmm9, %xmm14, %xmm14
vmovdqu %xmm14, AadHash(arg2)
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
.endif
#############################
# output r13 Bytes
vmovq %xmm9, %rax
cmp $8, %r13
jle _less_than_8_bytes_left\@
mov %rax, (arg3 , %r11)
add $8, %r11
vpsrldq $8, %xmm9, %xmm9
vmovq %xmm9, %rax
sub $8, %r13
_less_than_8_bytes_left\@:
movb %al, (arg3 , %r11)
add $1, %r11
shr $8, %rax
sub $1, %r13
jne _less_than_8_bytes_left\@
#############################
_multiple_of_16_bytes\@:
.endm
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
vmovdqu AadHash(arg2), %xmm14
vmovdqu HashKey(arg2), %xmm13
mov PBlockLen(arg2), %r12
cmp $0, %r12
je _partial_done\@
#GHASH computation for the last <16 Byte block
\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
_partial_done\@:
mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
shl $3, %r12 # convert into number of bits
vmovd %r12d, %xmm15 # len(A) in xmm15
mov InLen(arg2), %r12
shl $3, %r12 # len(C) in bits (*128)
vmovq %r12, %xmm1
vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
vpxor %xmm15, %xmm14, %xmm14
\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
vmovdqu OrigIV(arg2), %xmm9
ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
vpxor %xmm14, %xmm9, %xmm9
_return_T\@:
mov \AUTH_TAG, %r10 # r10 = authTag
mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
cmp $16, %r11
je _T_16\@
cmp $8, %r11
jl _T_4\@
_T_8\@:
vmovq %xmm9, %rax
mov %rax, (%r10)
add $8, %r10
sub $8, %r11
vpsrldq $8, %xmm9, %xmm9
cmp $0, %r11
je _return_T_done\@
_T_4\@:
vmovd %xmm9, %eax
mov %eax, (%r10)
add $4, %r10
sub $4, %r11
vpsrldq $4, %xmm9, %xmm9
cmp $0, %r11
je _return_T_done\@
_T_123\@:
vmovd %xmm9, %eax
cmp $2, %r11
jl _T_1\@
mov %ax, (%r10)
cmp $2, %r11
je _return_T_done\@
add $2, %r10
sar $16, %eax
_T_1\@:
mov %al, (%r10)
jmp _return_T_done\@
_T_16\@:
vmovdqu %xmm9, (%r10)
_return_T_done\@:
.endm
.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
mov \AAD, %r10 # r10 = AAD
mov \AADLEN, %r12 # r12 = aadLen
mov %r12, %r11
vpxor \T8, \T8, \T8
vpxor \T7, \T7, \T7
cmp $16, %r11
jl _get_AAD_rest8\@
_get_AAD_blocks\@:
vmovdqu (%r10), \T7
vpshufb SHUF_MASK(%rip), \T7, \T7
vpxor \T7, \T8, \T8
\GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
add $16, %r10
sub $16, %r12
sub $16, %r11
cmp $16, %r11
jge _get_AAD_blocks\@
vmovdqu \T8, \T7
cmp $0, %r11
je _get_AAD_done\@
vpxor \T7, \T7, \T7
/* read the last <16B of AAD. since we have at least 4B of
data right after the AAD (the ICV, and maybe some CT), we can
read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\@:
cmp $4, %r11
jle _get_AAD_rest4\@
movq (%r10), \T1
add $8, %r10
sub $8, %r11
vpslldq $8, \T1, \T1
vpsrldq $8, \T7, \T7
vpxor \T1, \T7, \T7
jmp _get_AAD_rest8\@
_get_AAD_rest4\@:
cmp $0, %r11
jle _get_AAD_rest0\@
mov (%r10), %eax
movq %rax, \T1
add $4, %r10
sub $4, %r11
vpslldq $12, \T1, \T1
vpsrldq $4, \T7, \T7
vpxor \T1, \T7, \T7
_get_AAD_rest0\@:
/* finalize: shift out the extra bytes we read, and align
left. since pslldq can only shift by an immediate, we use
vpshufb and an array of shuffle masks */
movq %r12, %r11
salq $4, %r11
vmovdqu aad_shift_arr(%r11), \T1
vpshufb \T1, \T7, \T7
_get_AAD_rest_final\@:
vpshufb SHUF_MASK(%rip), \T7, \T7
vpxor \T8, \T7, \T7
\GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
_get_AAD_done\@:
vmovdqu \T7, AadHash(arg2)
.endm
.macro INIT GHASH_MUL PRECOMPUTE
mov arg6, %r11
mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
xor %r11d, %r11d
mov %r11, InLen(arg2) # ctx_data.in_length = 0
mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
mov arg3, %rax
movdqu (%rax), %xmm0
movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
vmovdqu (arg4), %xmm6 # xmm6 = HashKey
vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
vmovdqa %xmm6, %xmm2
vpsllq $1, %xmm6, %xmm6
vpsrlq $63, %xmm2, %xmm2
vmovdqa %xmm2, %xmm1
vpslldq $8, %xmm2, %xmm2
vpsrldq $8, %xmm1, %xmm1
vpor %xmm2, %xmm6, %xmm6
#reduction
vpshufd $0b00100100, %xmm1, %xmm2
vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
vpand POLY(%rip), %xmm2, %xmm2
vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
#######################################################################
vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
\PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN
.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
vpxor \XMMDst, \XMMDst, \XMMDst
cmp $8, \DLEN
jl _read_lt8_\@
mov (\DPTR), %rax
vpinsrq $0, %rax, \XMMDst, \XMMDst
sub $8, \DLEN
jz _done_read_partial_block_\@
xor %eax, %eax
_read_next_byte_\@:
shl $8, %rax
mov 7(\DPTR, \DLEN, 1), %al
dec \DLEN
jnz _read_next_byte_\@
vpinsrq $1, %rax, \XMMDst, \XMMDst
jmp _done_read_partial_block_\@
_read_lt8_\@:
xor %eax, %eax
_read_next_byte_lt8_\@:
shl $8, %rax
mov -1(\DPTR, \DLEN, 1), %al
dec \DLEN
jnz _read_next_byte_lt8_\@
vpinsrq $0, %rax, \XMMDst, \XMMDst
_done_read_partial_block_\@:
.endm
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
AAD_HASH ENC_DEC
mov PBlockLen(arg2), %r13
cmp $0, %r13
je _partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
jl _fewer_than_16_bytes_\@
vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
jmp _data_read_\@
_fewer_than_16_bytes_\@:
lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
mov \PLAIN_CYPH_LEN, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm1
mov PBlockLen(arg2), %r13
_data_read_\@: # Finished reading in data
vmovdqu PBlockEncKey(arg2), %xmm9
vmovdqu HashKey(arg2), %xmm13
lea SHIFT_MASK(%rip), %r12
# adjust the shuffle mask pointer to be able to shift r13 bytes
# r16-r13 is the number of bytes in plaintext mod 16)
add %r13, %r12
vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
.if \ENC_DEC == DEC
vmovdqa %xmm1, %xmm3
pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
mov \PLAIN_CYPH_LEN, %r10
add %r13, %r10
# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
jge _no_extra_mask_1_\@
sub %r10, %r12
_no_extra_mask_1_\@:
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
vpand %xmm1, %xmm3, %xmm3
vmovdqa SHUF_MASK(%rip), %xmm10
vpshufb %xmm10, %xmm3, %xmm3
vpshufb %xmm2, %xmm3, %xmm3
vpxor %xmm3, \AAD_HASH, \AAD_HASH
cmp $0, %r10
jl _partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax,%eax
mov %rax, PBlockLen(arg2)
jmp _dec_done_\@
_partial_incomplete_1_\@:
add \PLAIN_CYPH_LEN, PBlockLen(arg2)
_dec_done_\@:
vmovdqu \AAD_HASH, AadHash(arg2)
.else
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
mov \PLAIN_CYPH_LEN, %r10
add %r13, %r10
# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
jge _no_extra_mask_2_\@
sub %r10, %r12
_no_extra_mask_2_\@:
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
vpand %xmm1, %xmm9, %xmm9
vmovdqa SHUF_MASK(%rip), %xmm1
vpshufb %xmm1, %xmm9, %xmm9
vpshufb %xmm2, %xmm9, %xmm9
vpxor %xmm9, \AAD_HASH, \AAD_HASH
cmp $0, %r10
jl _partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax,%eax
mov %rax, PBlockLen(arg2)
jmp _encode_done_\@
_partial_incomplete_2_\@:
add \PLAIN_CYPH_LEN, PBlockLen(arg2)
_encode_done_\@:
vmovdqu \AAD_HASH, AadHash(arg2)
vmovdqa SHUF_MASK(%rip), %xmm10
# shuffle xmm9 back to output as ciphertext
vpshufb %xmm10, %xmm9, %xmm9
vpshufb %xmm2, %xmm9, %xmm9
.endif
# output encrypted Bytes
cmp $0, %r10
jl _partial_fill_\@
mov %r13, %r12
mov $16, %r13
# Set r13 to be the number of bytes to write out
sub %r12, %r13
jmp _count_set_\@
_partial_fill_\@:
mov \PLAIN_CYPH_LEN, %r13
_count_set_\@:
vmovdqa %xmm9, %xmm0
vmovq %xmm0, %rax
cmp $8, %r13
jle _less_than_8_bytes_left_\@
mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $8, \DATA_OFFSET
psrldq $8, %xmm0
vmovq %xmm0, %rax
sub $8, %r13
_less_than_8_bytes_left_\@:
movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $1, \DATA_OFFSET
shr $8, %rax
sub $1, %r13
jne _less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
vpshufd $0b01001110, \GH, \T2
vpshufd $0b01001110, \HK, \T3
vpxor \GH , \T2, \T2 # T2 = (a1+a0)
vpxor \HK , \T3, \T3 # T3 = (b1+b0)
vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
vpxor \GH, \T2,\T2
vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
vpxor \T3, \GH, \GH
vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
#first phase of the reduction
vpslld $31, \GH, \T2 # packed right shifting << 31
vpslld $30, \GH, \T3 # packed right shifting shift << 30
vpslld $25, \GH, \T4 # packed right shifting shift << 25
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
vpxor \T2, \GH, \GH # first phase of the reduction complete
#second phase of the reduction
vpsrld $1,\GH, \T2 # packed left shifting >> 1
vpsrld $2,\GH, \T3 # packed left shifting >> 2
vpsrld $7,\GH, \T4 # packed left shifting >> 7
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
vpxor \T5, \T2, \T2
vpxor \T2, \GH, \GH
vpxor \T1, \GH, \GH # the result is in GH
.endm
.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
vmovdqa \HK, \T5
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqu \T1, HashKey_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqu \T1, HashKey_2_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
vmovdqu \T5, HashKey_3(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqu \T1, HashKey_3_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
vmovdqu \T5, HashKey_4(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqu \T1, HashKey_4_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
vmovdqu \T5, HashKey_5(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqu \T1, HashKey_5_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
vmovdqu \T5, HashKey_6(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqu \T1, HashKey_6_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
vmovdqu \T5, HashKey_7(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqu \T1, HashKey_7_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
vmovdqu \T5, HashKey_8(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqu \T1, HashKey_8_k(arg2)
.endm
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered