;------------------------------------------------------------------------------
; @file
; Copyright (C) 2020, vit9696. All rights reserved.
; Copyright (C) 2006, Apple Computer, Inc. All rights reserved.
;
; All rights reserved.
;
; This program and the accompanying materials
; are licensed and made available under the terms and conditions of the BSD License
; which accompanies this distribution. The full text of the license may be found at
; http://opensource.org/licenses/bsd-license.php
;
; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
;------------------------------------------------------------------------------
BITS 64
DEFAULT REL
;------------------------------------------------------------------------------
; The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
; 64-byte cache lines but without Supplemental SSE3. This is the 64-bit version.
;
; To generate the binary blob execute the following command:
; nasm LegacyBcopy.nasm -o /dev/stdout | xxd -i > LegacyBcopy.h
;
; The following #defines are tightly coupled to the u-architecture:
;------------------------------------------------------------------------------
%define kShort 80 ; too short to bother with SSE (must be >=80)
%define kVeryLong (500*1024) ; large enough for non-temporal stores (>=8192 and <2GB)
%define kFastUCode ((16*1024)-15) ; cutoff for microcode fastpath for "rep/movsl"
%define COMM_PAGE_LONGCOPY 7FFFFFE01200h
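; Illustration of the dispatch these thresholds produce: a 64-byte memcpy stays
; on the LShort path (64 <= kShort); a 4 KB copy goes through LNotShort to the
; 64-byte SSE chunk loops; anything of kVeryLong (500 KiB) or more is handed to
; the longcopy routine expected at COMM_PAGE_LONGCOPY.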
;------------------------------------------------------------------------------
; void bcopy(const void *src, void *dst, size_t len)
; src, dst, len ~ rdi, rsi, rdx
;------------------------------------------------------------------------------
Lbcopy:
push rbp ; set up a frame for backtraces
mov rbp, rsp
mov rax, rsi ; copy dest ptr
mov rsi, rdi ; exchange source and dest ptrs
mov rdi, rax
sub rax, rsi ; (dest - source)
cmp rax, rdx ; must move in reverse if (dest - source) < length
jb short LReverseIsland
cmp rdx, kShort ; long enough to bother with SSE?
jbe short LShort ; no
jmp short LNotShort
;------------------------------------------------------------------------------
; void *memcpy(void *dst, const void *src, size_t len)
; void *memmove(void *dst, const void *src, size_t len)
;
; NB: These need to be 32 bytes from bcopy().
;------------------------------------------------------------------------------
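; Since Lbcopy is the first byte of the assembled blob, the "align 32" below
; places Lmemcpy/Lmemmove exactly 32 bytes in, presumably for callers that jump
; to fixed entry offsets within the blob (hence the note above).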
align 32
Lmemcpy:
Lmemmove:
push rbp ; set up a frame for backtraces
mov rbp, rsp
mov r11, rdi ; save return value here
mov rax, rdi
sub rax, rsi ; (dest - source)
cmp rax, rdx ; must move in reverse if (dest - source) < length
jb short LReverseIsland
cmp rdx, kShort ; long enough to bother with SSE?
ja short LNotShort ; yes
;------------------------------------------------------------------------------
; Handle short forward copies. As the most common case, this is the fall-through path.
; rdx = length (<= kShort)
; rsi = source ptr
; rdi = dest ptr
;------------------------------------------------------------------------------
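; For example, a 13-byte copy runs the doubleword loop three times (12 bytes)
; and then falls into LLeftovers to move the final byte.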
LShort:
mov ecx, edx ; copy length using 32-bit operation
shr ecx, 2 ; get #doublewords
jz short LLeftovers
.cycle: ; loop copying doublewords
mov eax, [rsi]
add rsi, 4
mov [rdi], eax
add rdi, 4
dec ecx
jnz short .cycle
LLeftovers: ; handle leftover bytes (0..3) in last word
and edx, 3
jz short .skip ; any leftover bytes?
.cycle: ; loop copying bytes
mov al, [rsi]
inc rsi
mov [rdi], al
inc rdi
dec edx
jnz short .cycle
.skip:
mov rax, r11 ; get return value (dst ptr) for memcpy/memmove
pop rbp
retn
LReverseIsland: ; keep the "jb" above a short branch...
jmp LReverse ; ...because reverse moves are uncommon
;------------------------------------------------------------------------------
; Handle forward moves that are long enough to justify use of SSE.
; First, 16-byte align the destination.
; rdx = length (> kShort)
; rsi = source ptr
; rdi = dest ptr
;------------------------------------------------------------------------------
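; For example, if the destination address ends in ...9h, ecx = (-9) & 15 = 7,
; so seven bytes are copied with "rep movsb" before the 16-byte aligned loops
; take over.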
LNotShort:
cmp rdx, kVeryLong ; long enough to justify heavyweight loops?
jnb short LVeryLong ; use very-long-operand path
mov ecx, edi ; copy low half of destination ptr
neg ecx
and ecx, 15 ; get #bytes to align destination
jz short LDestAligned ; already aligned
sub edx, ecx ; decrement length
rep movsb ; align destination
;------------------------------------------------------------------------------
; Destination is now aligned. Dispatch to the loops over 64-byte chunks,
; based on the alignment of the source. All vector stores are aligned; the loads
; are aligned only when the source shares the destination's 16-byte alignment and
; are unaligned otherwise, since without SSSE3 we cannot shift and repack vectors.
; Since kShort>=80 and we've moved at most 15 bytes already,
; there is at least one chunk. When we enter the copy loops, the following registers
; are set up:
; rdx = residual length (0..63)
; rcx = -(length to move), a multiple of 64 less than 2GB
; rsi = ptr to 1st source byte not to move (unaligned)
; rdi = ptr to 1st dest byte not to move (aligned)
;------------------------------------------------------------------------------
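; Worked example: with 200 bytes left after alignment, rdx becomes 8 (200 & 63),
; rcx becomes -192, and rsi/rdi are advanced by 192 so that [rsi+rcx] addresses
; the first byte the chunk loop will copy.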
LDestAligned:
mov rcx, rdx ; copy length
mov eax, esi ; copy low half of source address
and edx, 63 ; get remaining bytes for LShort
and rcx, -64 ; get number of bytes we will copy in inner loop
add rsi, rcx ; point to 1st byte not copied
add rdi, rcx
neg rcx ; now generate offset to 1st byte to be copied
; Choose the loop. Without SSSE3 we only have two choices.
; 16-byte aligned loop (LMod0) and 1-byte unaligned loop (LMod1).
and eax, 15
jz short LMod0
jmp short LMod1
;------------------------------------------------------------------------------
; Very long forward moves. These are at least several pages. They are special cased
; and aggressively optimized, not so much because they are common or useful, but
; because they are subject to benchmark. There isn't enough room for them in the
; area reserved on the commpage for bcopy, so we put them elsewhere. We call
; the longcopy routine using the normal ABI:
; rdi = dest
; rsi = source
; rdx = length (>= kVeryLong bytes)
;------------------------------------------------------------------------------
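; Note that the r11 saved below is popped back into rax after the call, so
; memcpy/memmove still return the original destination pointer on this path.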
LVeryLong:
push r11 ; save return value
mov rax, COMM_PAGE_LONGCOPY
call rax ; call very long operand routine
pop rax ; pop return value
pop rbp
retn
;------------------------------------------------------------------------------
; On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
; aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
; about 256 bytes up to kVeryLong for cold caches. This is because the microcode
; avoids having to read destination cache lines that will be completely overwritten.
; The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
; we do not know if the destination is in cache or not.
;------------------------------------------------------------------------------
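; For example (illustration only): a 32 KB operand with a 16-byte aligned source
; reaches LMod0 with rcx = -32768; since -32768 <= -kFastUCode, the "jle" in
; LMod0 sends it here rather than through the SSE loop.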
Lfastpath:
add rsi, rcx ; restore ptrs to 1st byte of source and dest
add rdi, rcx
neg ecx ; make length positive (known to be < 2GB)
or ecx, edx ; restore total #bytes remaining to move
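; (at this point ecx is a positive multiple of 64 and edx is 0..63, so their
;  bits are disjoint and the OR reconstructs the exact remaining byte count)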
cld ; we'll move forward
shr ecx, 2 ; compute #doublewords to move
rep movsd ; the u-code will optimize this
jmp LLeftovers ; handle 0..3 leftover bytes
;------------------------------------------------------------------------------
; Forward loop for medium length operands in which low four bits of %rsi == 0000
;------------------------------------------------------------------------------
LMod0:
cmp ecx, -kFastUCode ; %rcx == -length, where (length < kVeryLong)
jle short Lfastpath ; long enough for fastpath in microcode
jmp short .loop
align 16 ; 16-byte align inner loops
.loop: ; loop over 64-byte chunks
movdqa xmm0, oword [rsi+rcx]
movdqa xmm1, oword [rsi+rcx+10h]
movdqa xmm2, oword [rsi+rcx+20h]
movdqa xmm3, oword [rsi+rcx+30h]
movdqa oword [rdi+rcx], xmm0
movdqa oword [rdi+rcx+10h], xmm1
movdqa oword [rdi+rcx+20h], xmm2
movdqa oword [rdi+rcx+30h], xmm3
add rcx, 64
jnz short .loop
jmp LShort ; copy remaining 0..63 bytes and done
;------------------------------------------------------------------------------
; Forward loop for medium length operands in which low four bits of %rsi != 0000
;------------------------------------------------------------------------------
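; The source is not 16-byte aligned here, so the loads below use movdqu while
; the stores stay aligned (the destination was aligned in LNotShort).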
align 16
LMod1:
movdqu xmm0, oword [rsi+rcx]
movdqu xmm1, oword [rsi+rcx+10h]
movdqu xmm2, oword [rsi+rcx+20h]
movdqu xmm3, oword [rsi+rcx+30h]
movdqa oword [rdi+rcx], xmm0
movdqa oword [rdi+rcx+10h], xmm1
movdqa oword [rdi+rcx+20h], xmm2
movdqa oword [rdi+rcx+30h], xmm3
add rcx, 64
jnz short LMod1
jmp LShort ; copy remaining 0..63 bytes and done
;------------------------------------------------------------------------------
; Reverse moves. These are not optimized as aggressively as their forward
; counterparts, as they are only used with destructive overlap.
; rdx = length
; rsi = source ptr
; rdi = dest ptr
;------------------------------------------------------------------------------
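; For example, memmove(dst, src, 100) with dst == src + 16 overlaps
; destructively: (dest - source) = 16 < 100, so the "jb" checks above route the
; call here and the copy proceeds from the ends of the buffers downward.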
LReverse:
add rsi, rdx ; point to end of strings
add rdi, rdx
cmp rdx, kShort ; long enough to bother with SSE?
ja short LReverseNotShort ; yes
;------------------------------------------------------------------------------
; Handle reverse short copies.
; edx = length (<= kShort)
; rsi = one byte past end of source
; rdi = one byte past end of dest
;------------------------------------------------------------------------------
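; For example, 13 remaining bytes are moved as one quadword followed by five
; single bytes, with rsi/rdi pre-decremented so they walk back toward the start
; of each buffer.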
LReverseShort:
mov ecx, edx ; copy length
shr ecx, 3 ; #quadwords
jz short .l2
.l1:
sub rsi, 8
mov rax, [rsi]
sub rdi, 8
mov [rdi], rax
dec ecx
jnz short .l1
.l2:
and edx, 7 ; bytes?
jz short .l4
.l3:
dec rsi
mov al, [rsi]
dec rdi
mov [rdi], al
dec edx
jnz short .l3
.l4:
mov rax, r11 ; get return value (dst ptr) for memcpy/memmove
pop rbp
ret
;------------------------------------------------------------------------------
; Handle a reverse move long enough to justify using SSE.
; rdx = length (> kShort)
; rsi = one byte past end of source
; rdi = one byte past end of dest
;------------------------------------------------------------------------------
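; For example, if the end-of-destination pointer ends in ...0Bh, ecx = 11 and
; eleven bytes are copied one at a time, leaving rdi 16-byte aligned before the
; vector loops run.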
LReverseNotShort:
mov ecx, edi ; copy destination
and ecx, 15 ; get #bytes to align destination
jz short LReverseDestAligned ; already aligned
sub rdx, rcx ; adjust length
.cycle: ; loop copying 1..15 bytes
dec rsi
mov al, [rsi]
dec rdi
mov [rdi], al
dec ecx
jnz short .cycle
;------------------------------------------------------------------------------
; Destination is now aligned. Prepare for reverse loops.
;------------------------------------------------------------------------------
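; As in the forward case, rcx is rounded down to a multiple of 64, rdx keeps the
; 0..63 residue for LReverseShort, and rsi/rdi are pulled back so that
; [rsi+rcx-16] addresses the last 16 source bytes the vector loop will copy.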
LReverseDestAligned:
mov rcx, rdx ; copy length
and edx, 63 ; get remaining bytes for LReverseShort
and rcx, -64 ; get number of bytes we will copy in inner loop
sub rsi, rcx ; point to endpoint of copy
sub rdi, rcx
test esi, 15 ; is source aligned too?
jnz short LReverseUnalignedLoop
;------------------------------------------------------------------------------
; Reverse loop over 64-byte aligned chunks.
;------------------------------------------------------------------------------
LReverseAlignedLoop:
movdqa xmm0, oword [rsi+rcx-16]
movdqa xmm1, oword [rsi+rcx-32]
movdqa xmm2, oword [rsi+rcx-48]
movdqa xmm3, oword [rsi+rcx-64]
movdqa oword [rdi+rcx-16], xmm0
movdqa oword [rdi+rcx-32], xmm1
movdqa oword [rdi+rcx-48], xmm2
movdqa oword [rdi+rcx-64], xmm3
sub rcx, 64
jnz short LReverseAlignedLoop
jmp LReverseShort ; copy remaining 0..63 bytes and done
;------------------------------------------------------------------------------
; Reverse, unaligned loop. LDDQU==MOVDQU on these machines.
;------------------------------------------------------------------------------
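; (LDDQU is the SSE3 unaligned-load form; on the processors targeted here it
;  behaves the same as MOVDQU, so plain MOVDQU loads are used below. Stores
;  remain aligned because the destination was aligned above.)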
LReverseUnalignedLoop:
movdqu xmm0, oword [rsi+rcx-16]
movdqu xmm1, oword [rsi+rcx-32]
movdqu xmm2, oword [rsi+rcx-48]
movdqu xmm3, oword [rsi+rcx-64]
movdqa oword [rdi+rcx-16], xmm0
movdqa oword [rdi+rcx-32], xmm1
movdqa oword [rdi+rcx-48], xmm2
movdqa oword [rdi+rcx-64], xmm3
sub rcx, 64
jnz short LReverseUnalignedLoop
jmp LReverseShort ; copy remaining 0..63 bytes and done