-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathldriver.yo
395 lines (395 loc) · 20.5 KB
/
ldriver.yo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
| #######################################################################
| # Test for copying block of size 63;
| #######################################################################
0x000: | .pos 0
0x000: 30f43808000000000000 | main: irmovq Stack, %rsp # Set up stack pointer
|
| # Set up arguments for copy function and then invoke it
0x00a: 30f23f00000000000000 | irmovq $63, %rdx # src and dst have 63 elements
0x014: 30f6b805000000000000 | irmovq dest, %rsi # dst array
0x01e: 30f7a803000000000000 | irmovq src, %rdi # src array
0x028: 803200000000000000 | call ncopy
0x031: 00 | halt # should halt with num nonzeros in %rax
0x032: | StartFun:
| # Author:李一凡 [email protected]
| # 优化思路1:将原来的“先把常数移到寄存器中再做运算”的方法改成使用iaddq实现,减少冗余;(13.70)
| # 优化思路2:将Always Taken的分支预测方法改成BTFNT, 提高预测准确性。不过由于现在程序中分支并不多,所以这条方法没有很大改进;(13.63)
| # 优化点:Y86-64处理器默认寄存器存储的是0,所以一开始的那个异或是没必要的;(13.56)
| # 优化点:在mrmovq和rmmovq之间插入无关痛痒的%rdi+=8,因为后一个mov在前一个访存之后才能接收到数据转发,而这造成了一个阶段的暂停。把第一遍改漏的subq也换成iaddq;(11.56)
| # 优化点:每次对len减一的时候总是andq判断一下再跳转,实际上跳转之前只要没有其他运算就可以了;(10.56)
| # 优化思路3:按len-=2展开循环。一开始先把len-2,判断有没有资格进入设计出来的Loop_double。每次从Loop_double出来都要先-2,判断有没有再次进入Loop_double的资格。
| # 每次发现没有进入的资格就直接跳到Once 里面。由于%rdx比2小但是每次只是减去2,所以这个部分会判断%rdx是0(直接返回到Done)还是1(按照Once中的语句再ncopy一次);(8.73)
| # 优化思路4:按len-=3展开循环。与思路3类似,只是在最后跳出的判断出有差异。我选择先把由于小于3而不能进入下一次Loop_trible的%rdx先加上2,这样原来的0是负的,1是equal,2是正的。然后跳转到各自的处理模块中即可;(8.12)
| # 优化思路5:按len-=4展开循环。与上述过程类似,在第一个mrmov的地方利用bubble多复制一个,之后滚雪球。最后判断余数的时候,先加上3,再看大于或小于。等于只用执行一次。如果执行3次,先把最后一个复制过去,剩下两个与复制两次的用同一片代码,便于之后展开规模更大时不超字节限制;(7.90)
| # 优化思路6:按len-=6展开循环。处理余数的时候没什么技巧,只是加了一些数然后分成-1,0和从1开始的正数。如果不能完全分开就减一些数,仍然把他们分成-1,0,,一些正数......
| # 目前ncopy.yo的字节数637, 最后处理余数的环节出现了不少bubble;(7.76)
| # 优化思路7:按len-=8展开,与之前相似,不再赘述。由于没有找到合适的方法处理最后余数部分的的bubble,对于前面数据规模较小的测试来说反而是负优化;(7.75)
| # 优化思路8:利用书后题4.57提到的加载转发,合理优化mrmov和rmmov连起来的余数处理的部分;(7.63) (同样处理但是AT分支预测CPE=7.71)
|
|
| #/* $begin ncopy-ys */
| ##################################################################
| # ncopy.ys - Copy a src block of len words to dst.
| # Return the number of positive words (>0) contained in src.
| #
| # Include your name and ID here.
| #
| # Describe how and why you modified the baseline code.
| #
| ##################################################################
| # Do not modify this portion
| # Function prologue.
| # %rdi = src, %rsi = dst, %rdx = len
0x032: | ncopy:
|
| ##################################################################
| # You can modify this portion
| # Loop header
| # xorq %rax,%rax # count = 0;
|
| # 设法判断运行时环境:
0x032: 62cc | andq %r12, %r12
0x034: 734403000000000000 | je naive_ncopy
|
0x03d: c0f2f7ffffffffffffff | iaddq $-9, %rdx
0x047: 729701000000000000 | jl Last
|
|
0x050: | Loop_nine:
0x050: 50a70000000000000000 | mrmovq (%rdi), %r10 # read val from src...
0x05a: 50b70800000000000000 | mrmovq 8(%rdi), %r11 # Destory this bubble.
0x064: 40a60000000000000000 | rmmovq %r10, (%rsi) # ...and store it to dst
0x06e: 62aa | andq %r10, %r10 # val <= 0?
| # jle L1 # if so, goto L1 (Across iaddq):
0x070: c0f00100000000000000 | iaddq $1, %rax # irmovq $1, %r10
| # addq %r10, %rax # count++
0x07a: | L1:
0x07a: 50a71000000000000000 | mrmovq 16(%rdi), %r10
0x084: 40b60800000000000000 | rmmovq %r11, 8(%rsi)
0x08e: 62bb | andq %r11, %r11
| # jle L2
0x090: c0f00100000000000000 | iaddq $1, %rax
|
0x09a: | L2:
0x09a: 50b71800000000000000 | mrmovq 24(%rdi), %r11
0x0a4: 40a61000000000000000 | rmmovq %r10, 16(%rsi)
0x0ae: 62aa | andq %r10, %r10
| # jle L3
0x0b0: c0f00100000000000000 | iaddq $1, %rax
0x0ba: | L3:
0x0ba: 50a72000000000000000 | mrmovq 32(%rdi), %r10
0x0c4: 40b61800000000000000 | rmmovq %r11, 24(%rsi)
0x0ce: 62bb | andq %r11, %r11
| # jle L4
0x0d0: c0f00100000000000000 | iaddq $1, %rax
0x0da: | L4:
0x0da: 50b72800000000000000 | mrmovq 40(%rdi), %r11
0x0e4: 40a62000000000000000 | rmmovq %r10, 32(%rsi)
0x0ee: 62aa | andq %r10, %r10
| # jle L5
0x0f0: c0f00100000000000000 | iaddq $1, %rax
0x0fa: | L5:
0x0fa: 50a73000000000000000 | mrmovq 48(%rdi), %r10
0x104: 40b62800000000000000 | rmmovq %r11, 40(%rsi)
0x10e: 62bb | andq %r11, %r11
| # jle L6
0x110: c0f00100000000000000 | iaddq $1, %rax
0x11a: | L6:
0x11a: 50b73800000000000000 | mrmovq 56(%rdi), %r11
0x124: 40a63000000000000000 | rmmovq %r10, 48(%rsi)
0x12e: 62aa | andq %r10, %r10
| # jle L7
0x130: c0f00100000000000000 | iaddq $1, %rax
0x13a: | L7:
0x13a: 50a74000000000000000 | mrmovq 64(%rdi), %r10
0x144: 40b63800000000000000 | rmmovq %r11, 56(%rsi)
0x14e: 62bb | andq %r11, %r11
| # jle L8
0x150: c0f00100000000000000 | iaddq $1, %rax
0x15a: | L8:
0x15a: 40a64000000000000000 | rmmovq %r10, 64(%rsi)
0x164: 62aa | andq %r10, %r10
| # jle L9
0x166: c0f00100000000000000 | iaddq $1, %rax
|
0x170: | L9:
0x170: c0f74800000000000000 | iaddq $72, %rdi # src += 9
0x17a: c0f64800000000000000 | iaddq $72, %rsi # dst += 9
0x184: c0f2f7ffffffffffffff | iaddq $-9, %rdx
0x18e: 755000000000000000 | jge Loop_nine # if %rdx >= 9, goto Loop again:
|
0x197: | Last:
0x197: c0f20800000000000000 | iaddq $8, %rdx
0x1a1: 72a103000000000000 | jl Done
0x1aa: 76dd01000000000000 | jg Judge_remain # 对应着原来的2,3,4,5,6,7,8.
0x1b3: 50a70000000000000000 | mrmovq (%rdi), %r10
0x1bd: 40a60000000000000000 | rmmovq %r10, (%rsi)
0x1c7: 62aa | andq %r10, %r10
0x1c9: 71a103000000000000 | jle Done
0x1d2: c0f00100000000000000 | iaddq $1, %rax
0x1dc: 90 | ret
|
0x1dd: | Judge_remain:
0x1dd: c0f2feffffffffffffff | iaddq $-2, %rdx
0x1e7: 720303000000000000 | jl Loop_twice
0x1f0: 73e302000000000000 | je Loop_three
0x1f9: 760202000000000000 | jg Judge_four_to_eight
|
0x202: | Judge_four_to_eight:
0x202: c0f2feffffffffffffff | iaddq $-2, %rdx
0x20c: 72c302000000000000 | jl Loop_four
0x215: 73a302000000000000 | je Loop_five
0x21e: 762702000000000000 | jg Judge_six_eight
|
0x227: | Judge_six_eight:
0x227: c0f2feffffffffffffff | iaddq $-2, %rdx
0x231: 728302000000000000 | jl Loop_six
0x23a: 736302000000000000 | je Loop_seven
|
0x243: | Loop_eight:
0x243: 50a73800000000000000 | mrmovq 56(%rdi), %r10
0x24d: 40a63800000000000000 | rmmovq %r10, 56(%rsi)
0x257: 62aa | andq %r10, %r10
| # jle Loop_seven
0x259: c0f00100000000000000 | iaddq $1, %rax
|
0x263: | Loop_seven:
0x263: 50a73000000000000000 | mrmovq 48(%rdi), %r10
0x26d: 40a63000000000000000 | rmmovq %r10, 48(%rsi)
0x277: 62aa | andq %r10, %r10
| # jle Loop_six
0x279: c0f00100000000000000 | iaddq $1, %rax
|
0x283: | Loop_six:
0x283: 50a72800000000000000 | mrmovq 40(%rdi), %r10
0x28d: 40a62800000000000000 | rmmovq %r10, 40(%rsi)
0x297: 62aa | andq %r10, %r10
| # jle Loop_five
0x299: c0f00100000000000000 | iaddq $1, %rax
|
0x2a3: | Loop_five:
0x2a3: 50a72000000000000000 | mrmovq 32(%rdi), %r10
0x2ad: 40a62000000000000000 | rmmovq %r10, 32(%rsi)
0x2b7: 62aa | andq %r10, %r10
| # jle Loop_four
0x2b9: c0f00100000000000000 | iaddq $1, %rax
|
0x2c3: | Loop_four:
0x2c3: 50a71800000000000000 | mrmovq 24(%rdi), %r10
0x2cd: 40a61800000000000000 | rmmovq %r10, 24(%rsi)
0x2d7: 62aa | andq %r10, %r10
| # jle Loop_three
0x2d9: c0f00100000000000000 | iaddq $1, %rax
|
0x2e3: | Loop_three:
0x2e3: 50a71000000000000000 | mrmovq 16(%rdi), %r10
0x2ed: 40a61000000000000000 | rmmovq %r10, 16(%rsi)
0x2f7: 62aa | andq %r10, %r10
| # jle Loop_twice
0x2f9: c0f00100000000000000 | iaddq $1, %rax
|
|
0x303: | Loop_twice:
0x303: 50a70000000000000000 | mrmovq (%rdi), %r10
0x30d: 50b70800000000000000 | mrmovq 8(%rdi), %r11 # Destory the bubble.
0x317: 40a60000000000000000 | rmmovq %r10, (%rsi)
0x321: 62aa | andq %r10, %r10
| # jle Second
0x323: c0f00100000000000000 | iaddq $1, %rax
0x32d: | Second:
0x32d: 40b60800000000000000 | rmmovq %r11, 8(%rsi)
0x337: 62bb | andq %r11, %r11
| # jle Done
0x339: c0f00100000000000000 | iaddq $1, %rax
0x343: 90 | ret
|
0x344: | naive_ncopy:
0x344: 6222 | andq %rdx,%rdx # len <= 0?
0x346: 71a103000000000000 | jle Done # if so, goto Done:
|
0x34f: | Loop:
0x34f: 50a70000000000000000 | mrmovq (%rdi), %r10 # read val from src...
0x359: c0f70800000000000000 | iaddq $8, %rdi # src++
0x363: 40a60000000000000000 | rmmovq %r10, (%rsi) # ...and store it to dst
0x36d: 62aa | andq %r10, %r10 # val <= 0?
0x36f: 718203000000000000 | jle Npos # if so, goto Npos:
0x378: c0f00100000000000000 | iaddq $1, %rax # irmovq $1, %r10
| # addq %r10, %rax # count++
0x382: | Npos:
0x382: c0f2ffffffffffffffff | iaddq $-1, %rdx
0x38c: c0f60800000000000000 | iaddq $8, %rsi # dst++
0x396: 6222 | andq %rdx,%rdx # len > 0?
0x398: 764f03000000000000 | jg Loop # if so, goto Loop:
|
|
| ##################################################################
| # Do not modify the following section of code
| # Function epilogue.
0x3a1: | Done:
0x3a1: 90 | ret
| ##################################################################
| # Keep the following label at the end of your function (AND MAKE SURE THERE IS A BLANKE LINE IN THE END!!!)
0x3a2: | End:
| #/* $end ncopy-ys */
0x3a2: | EndFun:
|
| ###############################
| # Source and destination blocks
| ###############################
0x3a8: | .align 8
0x3a8: | src:
0x3a8: 0100000000000000 | .quad 1
0x3b0: feffffffffffffff | .quad -2
0x3b8: fdffffffffffffff | .quad -3
0x3c0: 0400000000000000 | .quad 4
0x3c8: 0500000000000000 | .quad 5
0x3d0: 0600000000000000 | .quad 6
0x3d8: 0700000000000000 | .quad 7
0x3e0: 0800000000000000 | .quad 8
0x3e8: 0900000000000000 | .quad 9
0x3f0: 0a00000000000000 | .quad 10
0x3f8: 0b00000000000000 | .quad 11
0x400: 0c00000000000000 | .quad 12
0x408: 0d00000000000000 | .quad 13
0x410: f2ffffffffffffff | .quad -14
0x418: f1ffffffffffffff | .quad -15
0x420: 1000000000000000 | .quad 16
0x428: efffffffffffffff | .quad -17
0x430: eeffffffffffffff | .quad -18
0x438: 1300000000000000 | .quad 19
0x440: 1400000000000000 | .quad 20
0x448: 1500000000000000 | .quad 21
0x450: 1600000000000000 | .quad 22
0x458: 1700000000000000 | .quad 23
0x460: 1800000000000000 | .quad 24
0x468: e7ffffffffffffff | .quad -25
0x470: 1a00000000000000 | .quad 26
0x478: 1b00000000000000 | .quad 27
0x480: e4ffffffffffffff | .quad -28
0x488: 1d00000000000000 | .quad 29
0x490: 1e00000000000000 | .quad 30
0x498: 1f00000000000000 | .quad 31
0x4a0: e0ffffffffffffff | .quad -32
0x4a8: dfffffffffffffff | .quad -33
0x4b0: 2200000000000000 | .quad 34
0x4b8: 2300000000000000 | .quad 35
0x4c0: dcffffffffffffff | .quad -36
0x4c8: dbffffffffffffff | .quad -37
0x4d0: 2600000000000000 | .quad 38
0x4d8: 2700000000000000 | .quad 39
0x4e0: d8ffffffffffffff | .quad -40
0x4e8: d7ffffffffffffff | .quad -41
0x4f0: d6ffffffffffffff | .quad -42
0x4f8: d5ffffffffffffff | .quad -43
0x500: 2c00000000000000 | .quad 44
0x508: 2d00000000000000 | .quad 45
0x510: d2ffffffffffffff | .quad -46
0x518: 2f00000000000000 | .quad 47
0x520: d0ffffffffffffff | .quad -48
0x528: cfffffffffffffff | .quad -49
0x530: ceffffffffffffff | .quad -50
0x538: 3300000000000000 | .quad 51
0x540: ccffffffffffffff | .quad -52
0x548: cbffffffffffffff | .quad -53
0x550: caffffffffffffff | .quad -54
0x558: c9ffffffffffffff | .quad -55
0x560: c8ffffffffffffff | .quad -56
0x568: c7ffffffffffffff | .quad -57
0x570: c6ffffffffffffff | .quad -58
0x578: c5ffffffffffffff | .quad -59
0x580: c4ffffffffffffff | .quad -60
0x588: c3ffffffffffffff | .quad -61
0x590: c2ffffffffffffff | .quad -62
0x598: c1ffffffffffffff | .quad -63
0x5a0: fadebc0000000000 | .quad 0xbcdefa # This shouldn't get moved
|
0x5b0: | .align 16
0x5b0: | Predest:
0x5b0: fadebc0000000000 | .quad 0xbcdefa
0x5b8: | dest:
0x5b8: abefcd0000000000 | .quad 0xcdefab
0x5c0: abefcd0000000000 | .quad 0xcdefab
0x5c8: abefcd0000000000 | .quad 0xcdefab
0x5d0: abefcd0000000000 | .quad 0xcdefab
0x5d8: abefcd0000000000 | .quad 0xcdefab
0x5e0: abefcd0000000000 | .quad 0xcdefab
0x5e8: abefcd0000000000 | .quad 0xcdefab
0x5f0: abefcd0000000000 | .quad 0xcdefab
0x5f8: abefcd0000000000 | .quad 0xcdefab
0x600: abefcd0000000000 | .quad 0xcdefab
0x608: abefcd0000000000 | .quad 0xcdefab
0x610: abefcd0000000000 | .quad 0xcdefab
0x618: abefcd0000000000 | .quad 0xcdefab
0x620: abefcd0000000000 | .quad 0xcdefab
0x628: abefcd0000000000 | .quad 0xcdefab
0x630: abefcd0000000000 | .quad 0xcdefab
0x638: abefcd0000000000 | .quad 0xcdefab
0x640: abefcd0000000000 | .quad 0xcdefab
0x648: abefcd0000000000 | .quad 0xcdefab
0x650: abefcd0000000000 | .quad 0xcdefab
0x658: abefcd0000000000 | .quad 0xcdefab
0x660: abefcd0000000000 | .quad 0xcdefab
0x668: abefcd0000000000 | .quad 0xcdefab
0x670: abefcd0000000000 | .quad 0xcdefab
0x678: abefcd0000000000 | .quad 0xcdefab
0x680: abefcd0000000000 | .quad 0xcdefab
0x688: abefcd0000000000 | .quad 0xcdefab
0x690: abefcd0000000000 | .quad 0xcdefab
0x698: abefcd0000000000 | .quad 0xcdefab
0x6a0: abefcd0000000000 | .quad 0xcdefab
0x6a8: abefcd0000000000 | .quad 0xcdefab
0x6b0: abefcd0000000000 | .quad 0xcdefab
0x6b8: abefcd0000000000 | .quad 0xcdefab
0x6c0: abefcd0000000000 | .quad 0xcdefab
0x6c8: abefcd0000000000 | .quad 0xcdefab
0x6d0: abefcd0000000000 | .quad 0xcdefab
0x6d8: abefcd0000000000 | .quad 0xcdefab
0x6e0: abefcd0000000000 | .quad 0xcdefab
0x6e8: abefcd0000000000 | .quad 0xcdefab
0x6f0: abefcd0000000000 | .quad 0xcdefab
0x6f8: abefcd0000000000 | .quad 0xcdefab
0x700: abefcd0000000000 | .quad 0xcdefab
0x708: abefcd0000000000 | .quad 0xcdefab
0x710: abefcd0000000000 | .quad 0xcdefab
0x718: abefcd0000000000 | .quad 0xcdefab
0x720: abefcd0000000000 | .quad 0xcdefab
0x728: abefcd0000000000 | .quad 0xcdefab
0x730: abefcd0000000000 | .quad 0xcdefab
0x738: abefcd0000000000 | .quad 0xcdefab
0x740: abefcd0000000000 | .quad 0xcdefab
0x748: abefcd0000000000 | .quad 0xcdefab
0x750: abefcd0000000000 | .quad 0xcdefab
0x758: abefcd0000000000 | .quad 0xcdefab
0x760: abefcd0000000000 | .quad 0xcdefab
0x768: abefcd0000000000 | .quad 0xcdefab
0x770: abefcd0000000000 | .quad 0xcdefab
0x778: abefcd0000000000 | .quad 0xcdefab
0x780: abefcd0000000000 | .quad 0xcdefab
0x788: abefcd0000000000 | .quad 0xcdefab
0x790: abefcd0000000000 | .quad 0xcdefab
0x798: abefcd0000000000 | .quad 0xcdefab
0x7a0: abefcd0000000000 | .quad 0xcdefab
0x7a8: abefcd0000000000 | .quad 0xcdefab
0x7b0: | Postdest:
0x7b0: bcfade0000000000 | .quad 0xdefabc
|
0x7b8: | .align 8
| # Run time stack
0x7b8: 0000000000000000 | .quad 0
0x7c0: 0000000000000000 | .quad 0
0x7c8: 0000000000000000 | .quad 0
0x7d0: 0000000000000000 | .quad 0
0x7d8: 0000000000000000 | .quad 0
0x7e0: 0000000000000000 | .quad 0
0x7e8: 0000000000000000 | .quad 0
0x7f0: 0000000000000000 | .quad 0
0x7f8: 0000000000000000 | .quad 0
0x800: 0000000000000000 | .quad 0
0x808: 0000000000000000 | .quad 0
0x810: 0000000000000000 | .quad 0
0x818: 0000000000000000 | .quad 0
0x820: 0000000000000000 | .quad 0
0x828: 0000000000000000 | .quad 0
0x830: 0000000000000000 | .quad 0
|
0x838: | Stack: