#! /usr/bin/env python3
# Written by Ethan Roseman (ethteck)
# MIT License
# Copyright 2021
# Modified by EpochFlame
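"""Merge a vanilla object with a profiled build of the same code.

The profiled object contains an extra bl/nop pair in each function
epilogue. This script strips those pairs, undoes the scheduling
differences they introduce, and writes the merged result to the
target object.
"""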
import argparse
import sys
# Byte sequence that marks code size
CODESIZE_MAGIC = b"\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x34"
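# The 4 bytes that follow this sequence hold the code section size
# (big-endian); the code itself begins at file offset 0x34 (see below)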
BLR_BYTE_SEQ = b"\x4E\x80\x00\x20"  # blr
MTLR_BYTE_SEQ = b"\x7C\x08\x03\xA6"  # mtlr r0
# bl (call; target presumably filled in by a relocation) followed by nop:
# the extra pair the profiler emits in each function epilogue
PROFILE_EXTRA_BYTES = b"\x48\x00\x00\x01\x60\x00\x00\x00"
LWZ_BYTE = b"\x80"  # lwz opcode byte (not referenced below)
# Byte sequence array for branches to link register
BLR_BYTE_SEQ_ARRAY = [BLR_BYTE_SEQ,
    b"\x4D\x80\x00\x20", b"\x4D\x80\x00\x21", b"\x4C\x81\x00\x20", b"\x4C\x81\x00\x21",
    b"\x4D\x82\x00\x20", b"\x4D\x82\x00\x21", b"\x4C\x80\x00\x20", b"\x4C\x80\x00\x21",
    b"\x4D\x81\x00\x20", b"\x4D\x81\x00\x21", b"\x4C\x80\x00\x20", b"\x4C\x80\x00\x21",
    b"\x4C\x82\x00\x20", b"\x4C\x82\x00\x21", b"\x4C\x81\x00\x20", b"\x4C\x81\x00\x21",
    b"\x4D\x83\x00\x20", b"\x4D\x83\x00\x21", b"\x4C\x83\x00\x20", b"\x4C\x83\x00\x21",
    b"\x4D\x83\x00\x20", b"\x4D\x83\x00\x21", b"\x4C\x83\x00\x20", b"\x4C\x83\x00\x21"]
# Example invocation: ./frank.py vanilla.o profile.o output.o
parser = argparse.ArgumentParser()
parser.add_argument("vanilla", help="Path to the vanilla object", type=argparse.FileType('rb'))
parser.add_argument("profile", help="Path to the profile object", type=argparse.FileType('rb'))
parser.add_argument("target", help="Path to the target object (to write)")
args = parser.parse_args()
# Read the vanilla object into bytes and close the file
vanilla_bytes = args.vanilla.read()
args.vanilla.close()
# If the file contains no code, the codesize magic will not be found.
# The vanilla object requires no modification.
code_size_magic_idx = vanilla_bytes.find(CODESIZE_MAGIC)
if code_size_magic_idx == -1:
    with open(args.target, "wb") as f:
        f.write(vanilla_bytes)
    sys.exit(0)
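# Read the profile object and close the file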
profile_bytes = args.profile.read()
args.profile.close()
# Peephole rescheduling
#
# This is the pattern we will detect:
# (A) lwz <--. .--> (A) li
# (B) li <---\-' bl
# \ nop
# '---> (B) lwz
#
# If the profiled schedule swaps the
# instructions around the bl/nop, we
# instead use the vanilla schedule.
#
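# For example (registers and offsets are illustrative only):
#   vanilla epilogue:        profiled epilogue:
#     lwz r0, 0x14(r1)         li  r3, 0
#     li  r3, 0                bl  <profiler stub>
#                              nop
#                              lwz r0, 0x14(r1)
# Here the pass restores the vanilla lwz/li order around the bl/nop.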
idx = 8
shift = 0 # difference between vanilla and profile code, due to bl/nops
while idx < len(profile_bytes) - 16:
    # Find next epilogue
    epi_pos = profile_bytes.find(PROFILE_EXTRA_BYTES, idx)
    if epi_pos == -1:
        break  # break while loop when no targets remain
    if epi_pos % 4 != 0:  # check 4-byte alignment
        idx += 4
        continue
    v_pos = epi_pos - shift
    shift += 8
    vanilla_inst_a = vanilla_bytes[v_pos-4:v_pos]
    vanilla_inst_b = vanilla_bytes[v_pos:v_pos+4]
    vanilla_inst_c = vanilla_bytes[v_pos+4:v_pos+8]
    profile_inst_a = profile_bytes[epi_pos-4:epi_pos]
    profile_inst_b = profile_bytes[epi_pos+8:epi_pos+12]
    profile_inst_c = profile_bytes[epi_pos+12:epi_pos+16]
    # The primary opcode is the top 6 bits of the first instruction byte
    opcode_a = vanilla_inst_a[0] >> 2
    opcode_b = vanilla_inst_b[0] >> 2
    opcode_c = vanilla_inst_c[0] >> 2
    LWZ = 0x80 >> 2
    LFS = 0xC0 >> 2
    ADDI = 0x38 >> 2
    LI = ADDI  # an LI instruction is just an ADDI with RA=0
    LMW = 0xB8 >> 2
    FDIVS = 0xEC >> 2
    if opcode_a == LWZ and \
            opcode_b in [LI, LFS, FDIVS] and \
            vanilla_inst_a == profile_inst_b and \
            vanilla_inst_b == profile_inst_a and \
            vanilla_inst_c == profile_inst_c and \
            opcode_c != ADDI:  # <- don't reorder if at the very end of the epilogue
        # Swap instructions (A) and (B)
        profile_bytes = profile_bytes[:epi_pos-4] \
            + vanilla_inst_a \
            + PROFILE_EXTRA_BYTES \
            + vanilla_inst_b \
            + profile_bytes[epi_pos+12:]
    # Similar reordering for lwz/lmw, except both insns follow the bl/nop
    elif opcode_b == LWZ and \
            opcode_c == LMW and \
            vanilla_inst_b == profile_inst_c and \
            vanilla_inst_c == profile_inst_b:
        profile_bytes = profile_bytes[:epi_pos+8] \
            + vanilla_inst_b \
            + vanilla_inst_c \
            + profile_bytes[epi_pos+16:]
    idx = epi_pos + 8
# Strip the profiler's bl/nop pairs; each removal shrinks the code by
# 8 bytes, matching the `shift` bookkeeping above
stripped_bytes = profile_bytes.replace(PROFILE_EXTRA_BYTES, b"")
# Find end of code sections in vanilla and stripped bytes
code_size_offset = code_size_magic_idx + len(CODESIZE_MAGIC)
code_size_bytes = vanilla_bytes[code_size_offset:code_size_offset+4]
code_size = int.from_bytes(code_size_bytes, byteorder='big')
eoc_offset = 0x34 + code_size
# Bail out if the end of code coincides with the end of the file
assert eoc_offset != len(vanilla_bytes)
# Replace offsets 0x34 through eoc in vanilla with the same range from
# stripped; everything before and after the code section stays vanilla
final_bytes = vanilla_bytes[:0x34] + stripped_bytes[0x34:eoc_offset] + vanilla_bytes[eoc_offset:]
# Fix branches to link register
for seq in BLR_BYTE_SEQ_ARRAY:
    idx = 0
    while idx < len(vanilla_bytes):
        found_pos = vanilla_bytes.find(seq, idx)
        if found_pos == -1:
            break  # break while loop when no targets remain
        if found_pos % 4 != 0:  # check 4-byte alignment
            idx += 4
            continue
        final_bytes = final_bytes[:found_pos] \
            + vanilla_bytes[found_pos:found_pos+4] \
            + final_bytes[found_pos+4:]
        idx = found_pos + len(seq)
# Reunify mtlr/blr instructions, shifting intermediary instructions up
idx = 0
while idx < len(final_bytes):
    # Find mtlr position
    mtlr_found_pos = final_bytes.find(MTLR_BYTE_SEQ, idx)
    if mtlr_found_pos == -1:
        break  # break while loop when no targets remain
    if mtlr_found_pos % 4 != 0:  # check 4-byte alignment
        idx += 4
        continue
    # Find paired blr position
    blr_found_pos = final_bytes.find(BLR_BYTE_SEQ, mtlr_found_pos)
    if blr_found_pos == -1:
        break  # break while loop when no targets remain
    if blr_found_pos % 4 != 0:  # check 4-byte alignment
        idx += 4
        continue
    if mtlr_found_pos + 4 == blr_found_pos:
        idx += 4
        continue  # continue if mtlr is followed directly by blr
    # Shift the intervening instructions up so the mtlr lands right before the blr
    final_bytes = final_bytes[:mtlr_found_pos] \
        + final_bytes[mtlr_found_pos+4:blr_found_pos] \
        + final_bytes[mtlr_found_pos:mtlr_found_pos+4] \
        + final_bytes[blr_found_pos:]
    idx = mtlr_found_pos + len(MTLR_BYTE_SEQ)
# Reorder lmw/lwz/lfd instructions, if needed (@Altafen)
# Specifically, if this sequence shows up in the stripped profiler code: "LMW, LWZ, LFD*"
# And this sequence shows up in the vanilla code: "LWZ, LFD*, LMW"
# (LFD* = any number of LFDs, including zero)
# If all bytes match between the two (except for the reordering), then use the vanilla ordering.
# This could be written to anchor around the "BL, NOP" instructions in unstripped profiler code,
# or to check for the presence of "ADDI, MTLR, BLR" soon after.
# This also could be written to decode the operands of each instruction to make sure the reorder is harmless.
# Neither of these safeguards is necessary at the moment.
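# For example (operands illustrative only):
#   stripped: lmw r27, 0x14(r1); lwz r0, 0x2C(r1); lfd f31, 0x20(r1)
#   vanilla:  lwz r0, 0x2C(r1);  lfd f31, 0x20(r1); lmw r27, 0x14(r1)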
LWZ = 32  # primary opcodes in decimal: same values as the 0x80 >> 2 etc. used above
LMW = 46
LFD = 50
idx = 0
while idx+4 < len(final_bytes):
    if final_bytes[idx] >> 2 == LMW and final_bytes[idx+4] >> 2 == LWZ and vanilla_bytes[idx] >> 2 == LWZ:
        start_idx = idx
        lmw_bytes = final_bytes[idx:idx+4]
        lwz_bytes = final_bytes[idx+4:idx+8]
        if vanilla_bytes[idx:idx+4] != lwz_bytes:
            idx += 4
            continue
        lfd_bytes = b""
        idx += 4
        while vanilla_bytes[idx] >> 2 == LFD:
            lfd_bytes += vanilla_bytes[idx:idx+4]
            idx += 4
        if vanilla_bytes[idx:idx+4] != lmw_bytes:
            continue
        if final_bytes[start_idx+8:start_idx+8+len(lfd_bytes)] != lfd_bytes:
            continue
        idx += 4
        final_bytes = final_bytes[:start_idx] + lwz_bytes + lfd_bytes + lmw_bytes + final_bytes[idx:]
        continue
    idx += 4
# Write the merged object
with open(args.target, "wb") as f:
    f.write(final_bytes)