|
58 | 58 | ```
|
59 | 59 | - `寄存器分配(Register Allocation)`:下例将对g的2次读取优化成了1次
|
60 | 60 | ```c++
|
61 |
| - a = g; //-----------> load r1, mem1 |
62 |
| - b += a; // rewrite add r2, r2, r1 |
| 61 | + a = g; //-----------> load %r1, 0($mem1) |
| 62 | + b += a; // rewrite add %r2, %r2, %r1 |
63 | 63 | a = g; //
|
64 |
| - c += a; // add r3, r3, r1 |
| 64 | + c += a; // add %r3, %r3, %r1 |
65 | 65 | ```
|
66 | 66 | - `指令调度(Instruction Scheduling)`:下例重排之后,后一条指令不必等待前一条指令的结果,从而减少了停顿*(这里的静态流水线调度缓解了`RAW Hazard`)*
|
67 | 67 | ```asm
|
68 |
| - load r0, mem0 // load r0, mem0 |
69 |
| - mul r1, r1, r0 //-----------> load r2, mem2 |
70 |
| - store mem1, r1 // rewrite mul r1, r1, r0 |
71 |
| - load r2, mem2 // mul r3, r3, r2 |
72 |
| - mul r3, r3, r2 // store mem1, r1 |
73 |
| - store mem3, r3 // store mem3, r3 |
| 68 | + load %r0, 0($mem0) // load %r0, 0($mem0) |
| 69 | + mul %r1, %r1, %r0 //-----------> load %r2, 0($mem2) |
| 70 | + store 0($mem1), %r1 // rewrite mul %r1, %r1, %r0 |
| 71 | + load %r2, 0($mem2) // mul %r3, %r3, %r2 |
| 72 | + mul %r3, %r3, %r2 // store 0($mem1), %r1 |
| 73 | + store 0($mem3), %r3 // store 0($mem3), %r3 |
74 | 74 | ```
|
75 | 75 |
|
76 | 76 | ## Memory Model
|
|
99 | 99 | { data == 0, flag == 0 }
|
100 | 100 | // thread 0 thread 1
|
101 | 101 | //----------------------------------------
|
102 |
| - store data, 1 loop: |
103 |
| - store flag, 1 load r0, flag |
104 |
| - beq r0, 0, loop |
105 |
| - load r1, data |
| 102 | + store 0($data), $1 loop: |
| 103 | + store 0($flag), $1 load %r0, 0($flag) |
| 104 | + beq %r0, $0, loop |
| 105 | + load %r1, 0($data) |
106 | 106 | ```
|
107 | 107 | 可能r1 == 0。本例在ARM上能重现
|
108 | 108 | - Store-Load乱序
|
109 | 109 | ```asm
|
110 | 110 | { x == 0, y == 0 }
|
111 | 111 | // thread 0 thread 1
|
112 | 112 | //----------------------------------------
|
113 |
| - store x, 1 store y, 1 |
114 |
| - load r0, y load r1, x |
| 113 | + store 0($x), $1 store 0($y), $1 |
| 114 | + load %r0, 0($y) load %r1, 0($x) |
115 | 115 | ```
|
116 | 116 | 可能r0 == 0 && r1 == 0。本例在ARM/x86上能重现
|
117 | 117 | - `Dependent Loads`乱序$^{[5]}$
|
118 |
| - ```c++ |
| 118 | + ```asm |
119 | 119 | { A == 1, B == 2, C == 3, P == &A, Q == &C }
|
120 | 120 | // thread 0 thread 1
|
121 | 121 | //--------------------------------------
|
122 |
| - B = 4; |
123 |
| - BARRIER; |
124 |
| - P = &B; |
125 |
| - Q = P; |
126 |
| - D = *Q; |
| 122 | + store 0($B), $4 |
| 123 | + BARRIER |
| 124 | + store 0($P), $B |
| 125 | + load %r0, 0($P) |
| 126 | + load %r1, 0(%r0) |
127 | 127 | ```
|
128 |
| - 可能Q == &B && D == 2。本例在DEC Alpha上能重现 |
| 128 | + 可能r0 == &B && r1 == 2。本例在DEC Alpha上能重现 |
129 | 129 | - `Non-Causality`/`Non-Transitivity`$^{[3]}$
|
130 | 130 | ```asm
|
131 | 131 | { flag0 == 0, flag1 == 0 }
|
132 |
| - // thread 0 thread 1 thread 2 |
133 |
| - //------------------------------------------------------- |
134 |
| - store flag0, 1 |
135 |
| - loop: |
136 |
| - load r0, flag0 |
137 |
| - beq r0, 0, loop |
138 |
| - BARRIER |
139 |
| - store flag1, 1 |
140 |
| - loop: |
141 |
| - load r1, flag1 |
142 |
| - beq r1, 0, loop |
143 |
| - BARRIER |
144 |
| - load r2, flag0 |
| 132 | + // thread 0 thread 1 thread 2 |
| 133 | + //----------------------------------------------------------------- |
| 134 | + store 0($flag0), $1 |
| 135 | + loop: |
| 136 | + load %r0, 0($flag0) |
| 137 | + beq %r0, $0, loop |
| 138 | + BARRIER |
| 139 | + store 0($flag1), $1 |
| 140 | + loop: |
| 141 | + load %r1, 0($flag1) |
| 142 | + beq %r1, $0, loop |
| 143 | + BARRIER |
| 144 | + load %r2, 0($flag0) |
145 | 145 | ```
|
146 | 146 | 可能r2 == 0。本例在不支持Causality的系统中能重现
|
147 | 147 | - `IRIW(Independent Read Independent Write)`$^{[3]}$
|
148 | 148 | ```asm
|
149 |
| - // thread 0 thread 1 thread 2 thread 3 |
150 |
| - //------------------------------------------------------------------------- |
151 |
| - store data1, 1 store data2, 1 |
152 |
| - load r1, data1 load r3, data2 |
153 |
| - BARRIER BARRIER |
154 |
| - load r2, data2 load r4, data1 |
| 149 | + // thread 0 thread 1 thread 2 thread 3 |
| 150 | + //--------------------------------------------------------------------------------- |
| 151 | + store 0($data1), $1 store 0($data2), $1 |
| 152 | + load %r1, 0($data1) load %r3, 0($data2) |
| 153 | + BARRIER BARRIER |
| 154 | + load %r2, 0($data2) load %r4, 0($data1) |
155 | 155 | ```
|
156 | 156 | 在r1 == 1 && r3 == 1的前提下,可能r2 == 0 && r4 == 0,即thread 2和3看见了不同的写顺序。本例在不支持`Atomic Store`的系统中能重现,比如某些NUMA和带SMT的UMA系统
|
157 | 157 | - #### Memory Model由哪些属性构成
|
|
0 commit comments