# decompctx.py
# This script leaves most of the heavy lifting to pcpp, which handles the preprocessing and expansion of files:
# https://github.com/ned14/pcpp
# To use it, make sure you run 'pip install pcpp'
#
# This script also optionally uses pyperclip to conveniently copy the context to the clipboard:
# https://github.com/asweigart/pyperclip
# Install via `pip install pyperclip`
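#
# Example invocations (hypothetical paths; adjust to your project layout):
#   python decompctx.py src/boot/boot_main.c --m2c --relative
#     writes src/boot/boot_main.c.ctx next to the source file, ready to paste into m2c.
#   python decompctx.py
#     with no input file aggregates every header found under the default src/ and include/
#     directories into a single ctx.h in the current working directory.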
import os
import re
import typing
import argparse
from glob import glob
from io import StringIO
from pcpp import Preprocessor
from pcpp import CmdPreprocessor
from contextlib import redirect_stdout
# pyperclip is only needed for --clipboard, so treat it as optional (per the note above)
try:
    import pyperclip
except ImportError:
    pyperclip = None
#region Context Options
class ContextGenerationOptions:
    should_strip_declspec = False
    should_strip_attributes = False
    should_strip_at_address = False
    should_convert_binary_literals = False
    should_replace_enums_in_initializers = False
    should_strip_initializer_trailing_commas = False
#endregion
#region Regex Patterns
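# Matches variable declarations that pin a symbol to a fixed address, e.g. "u8 foo AT_ADDRESS(0x80000000);"
# or the ": 0x..." form; group 1 captures the AT_ADDRESS/':' suffix so it can be stripped.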
at_address_pattern = re.compile(r"(?:.*?)(?:[a-zA-Z_$][\w$]*\s*\*?\s[a-zA-Z_$][\w$\[\]]*)\s*((?:AT_ADDRESS|:)(?:\s*\(?\s*)(0x[0-9a-fA-F]+|[a-zA-Z_$][\w$]*)\)?);")
attribute_pattern = re.compile(r"(__attribute__)")
declspec_pattern = re.compile(r"(__declspec)")
binary_literal_pattern = re.compile(r"\b(0b[01]+)\b")
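# Matches brace-initialized definitions ending in "};" and captures a trailing comma (if any)
# before the closing brace in group 1 so it can be removed.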
trailing_initializer_pattern = re.compile(r"^.*?=\s*\{(?:.|\s)+?(,)?\s*(?:\/\/.*?|\/\*.*?\*\/)*\s*?\}\s*;", re.MULTILINE)
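# The next three patterns work together: find array declarations sized by an enumerator,
# pull apart enum declarations into their members, and split each member into its name and
# (optional) assigned value so the enumerator can be replaced with a plain number.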
enum_array_size_initializer_pattern = re.compile(r"\[\s*([a-zA-Z_$][\w$]*)\s*\]\s*;")
enum_declaration_pattern = re.compile(r"^.*(?:typedef\s+)*enum\s(?:[a-zA-Z_$][\w$]*)*\s*\{\s*((?:.|\s)*?)\}\s*(?:[a-zA-Z_$][\w$]*)*\s*;", re.MULTILINE)
enum_value_pattern = re.compile(r"([a-zA-Z_$][\w$]*)\s*(?:=\s*(.*))*")
word_pattern = re.compile(r"\b([a-zA-Z_][\w]*)\b")
white_space_pattern = re.compile(r"\s+")
cast_patterns = re.compile(r"\(int\)")
#endregion
#region Defaults
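# Macros the target sources typically expect (Metrowerks compiler and N64 SDK/F3DEX checks);
# each one is skipped if the caller already passes its own -D for the same name.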
default_defines: typing.Dict[str, str] = {"__MWERKS__" : "1", "_LANGUAGE_C": "1", "F3DEX_GBI_2": "1"}
src_dir = "src"
include_dir = "include"
cwd_dir = os.getcwd()
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.abspath(os.path.join(script_dir, ".."))
default_include_directories: typing.List[str] = [
    os.path.join(root_dir, src_dir),
    os.path.join(root_dir, include_dir),
    os.path.join(script_dir, src_dir),
    os.path.join(script_dir, include_dir),
    os.path.join(cwd_dir, src_dir),
    os.path.join(cwd_dir, include_dir),
]
default_output_filename = "ctx.h"
#endregion
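# When no SDK path is passed on the command line, the N64_SDK environment variable is used;
# only the SDK root needs to be supplied, e.g. N64_SDK=/opt/n64-sdk (hypothetical path) resolves
# to /opt/n64-sdk/ultra/usr/include.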
#region N64 SDK
def get_n64_sdk(sdk_argument: str) -> str:
    if sdk_argument:
        return sdk_argument
    # No SDK path provided. Try to use the default from the environment
    sdk_argument = os.environ.get('N64_SDK')
    if not sdk_argument:
        return None
    # Since we don't want the user to have to type the full path, all they need
    # to provide is the top-level folder for the SDK
    sdk_argument = os.path.join(sdk_argument, "ultra/usr/include")
    return sdk_argument
#endregion
#region Attribute Stripping
def strip_attributes(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    attribute_matches = reversed(list(re.finditer(attribute_pattern, text_to_strip)))
    for attribute_match in attribute_matches:
        # Find the index just past the closing double parenthesis
        paren_count = 0
        match_span = attribute_match.span(0)
        end_index = match_span[1]
        attribute_opened = False
        while end_index < len(text_to_strip):
            if text_to_strip[end_index] == "(":
                paren_count += 1
                if paren_count == 2:
                    attribute_opened = True
            if text_to_strip[end_index] == ")":
                paren_count -= 1
                if attribute_opened and paren_count == 0:
                    end_index += 1
                    break
            end_index += 1
        # Create the substring without the __attribute__((...)) text
        start_index = match_span[0]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + postfix
    return text_to_strip
#endregion
#region declspec Stripping
def strip_declspec(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    declspec_matches = reversed(list(re.finditer(declspec_pattern, text_to_strip)))
    for declspec_match in declspec_matches:
        # Find the index just past the closing parenthesis
        paren_count = 0
        match_span = declspec_match.span(0)
        end_index = match_span[1]
        declspec_opened = False
        while end_index < len(text_to_strip):
            if text_to_strip[end_index] == "(":
                paren_count += 1
                if paren_count == 1:
                    declspec_opened = True
            if text_to_strip[end_index] == ")":
                paren_count -= 1
                if declspec_opened and paren_count == 0:
                    end_index += 1
                    break
            end_index += 1
        # Create the substring without the __declspec(...) text
        start_index = match_span[0]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + postfix
    return text_to_strip
#endregion
#region At Address Stripping
def strip_at_address(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    at_address_matches = reversed(list(re.finditer(at_address_pattern, text_to_strip)))
    for at_address_match in at_address_matches:
        # Create the substring
        match_span = at_address_match.span(1)
        start_index = match_span[0]
        end_index = match_span[1]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + postfix
    return text_to_strip
#endregion
#region Binary Literal Conversion
def convert_binary_literals(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    binary_literal_matches = reversed(list(re.finditer(binary_literal_pattern, text_to_strip)))
    for binary_literal_match in binary_literal_matches:
        # Create the substring
        match_span = binary_literal_match.span(1)
        start_index = match_span[0]
        end_index = match_span[1]
        # Convert from binary literal format to a regular int
        binary_converted = int(text_to_strip[start_index:end_index], 2)
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + str(binary_converted) + postfix
    return text_to_strip
#endregion
#region Strip Trailing Commas
def strip_initializer_trailing_commas(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    trailing_comma_matches = reversed(list(re.finditer(trailing_initializer_pattern, text_to_strip)))
    for comma_match in trailing_comma_matches:
        # Skip initializers that have no trailing comma to remove
        if not comma_match[1]:
            continue
        # Create the substring
        match_span = comma_match.span(1)
        start_index = match_span[0]
        end_index = match_span[1]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + postfix
    return text_to_strip
#endregion
#region Enums
def replace_enums_with_numeric_values(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    # Check if there are any uses of enums to size arrays
    enum_array_size_initializer_matches = list(re.finditer(enum_array_size_initializer_pattern, text_to_strip))
    if len(enum_array_size_initializer_matches) == 0:
        # None found, so there is no need to evaluate the enums
        return text_to_strip
    # We need to replace enums, but to do so we need to gather all of the enum values from the context so far
    enum_declarations = list(re.finditer(enum_declaration_pattern, text_to_strip))
    if len(enum_declarations) == 0:
        return text_to_strip
    preprocessor = Preprocessor()
    enum_to_numeric_dict: typing.Dict[str, int] = {}
    for enum_declaration in enum_declarations:
        enum_members = enum_declaration[1]
        split_enum_members = enum_members.split(",")
        enum_numeric_value = 0
        for split_member in split_enum_members:
            split_member = re.sub(white_space_pattern, "", split_member)
            if not split_member or split_member.isspace():
                continue
            enum_value_match = re.match(enum_value_pattern, split_member)
            enum_member_name = enum_value_match[1]
            # Does the enum member have an explicit value assigned?
            if enum_value_match[2]:
                assigned_value = enum_value_match[2]
                try:
                    # Replace usages of other enums with their numeric values
                    numeric_expression = enum_value_match[2]
                    # Remove casts
                    numeric_expression = re.sub(cast_patterns, "", numeric_expression)
                    # Replace enum names with numerical values
                    for word_match in reversed(list(re.finditer(word_pattern, numeric_expression))):
                        word = word_match[1]
                        if word not in enum_to_numeric_dict:
                            continue
                        word_span = word_match.span(1)
                        numeric_expression = numeric_expression[0:word_span[0]] + str(enum_to_numeric_dict[word]) + numeric_expression[word_span[1]:len(numeric_expression)]
                    # Try to evaluate the expression
                    tokens = preprocessor.tokenize(numeric_expression)
                    evaluation = preprocessor.evalexpr(tokens)
                    assigned_value = evaluation[0]
                except Exception as e:
                    # Can't parse. Might be another enum
                    print(e)
                # Convert to int
                enum_numeric_value = int(assigned_value)
            # Record the value
            enum_to_numeric_dict[enum_member_name] = enum_numeric_value
            # By default the next enum member increases by 1
            enum_numeric_value += 1
    # With the enum map built we can now replace the usages with the numeric values
    enum_array_size_initializer_matches_reversed = reversed(enum_array_size_initializer_matches)
    for array_size_initializer_match in enum_array_size_initializer_matches_reversed:
        # Does this use a known enum?
        enum_name = array_size_initializer_match[1]
        if enum_name not in enum_to_numeric_dict:
            continue
        enum_numeric_value = enum_to_numeric_dict[enum_name]
        # Create the substring
        match_span = array_size_initializer_match.span(1)
        start_index = match_span[0]
        end_index = match_span[1]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + str(enum_numeric_value) + postfix
    return text_to_strip
#endregion
#region Preprocessing
def generate_context(preprocessor_arguments: typing.List[str], context_options: ContextGenerationOptions) -> str:
    # Create the temp string writer to pass to the preprocessor since we still want to modify
    # the contents for project-specific conditions
    with StringIO() as preprocessor_string_writer:
        with redirect_stdout(preprocessor_string_writer):
            # Parse the target file:
            CmdPreprocessor(preprocessor_arguments)
        # Check if empty
        string_writer_position = preprocessor_string_writer.tell()
        if string_writer_position == 0:
            return None
        # Do we need to sanitize this further?
        if (not context_options.should_strip_declspec
                and not context_options.should_strip_attributes
                and not context_options.should_strip_at_address
                and not context_options.should_strip_initializer_trailing_commas
                and not context_options.should_convert_binary_literals
                and not context_options.should_replace_enums_in_initializers):
            # No sanitization needed, so write the entire file out
            return preprocessor_string_writer.getvalue()
        # Sanitize/change the file depending on the context options
        with StringIO() as context_string_writer:
            # Sanitize line by line for easier parsing
            preprocessor_string_writer.seek(0)
            while True:
                line_to_write = preprocessor_string_writer.readline()
                if not line_to_write:
                    break
                if context_options.should_strip_declspec:
                    line_to_write = strip_declspec(line_to_write)
                if context_options.should_strip_attributes:
                    line_to_write = strip_attributes(line_to_write)
                if context_options.should_strip_at_address:
                    line_to_write = strip_at_address(line_to_write)
                if context_options.should_convert_binary_literals:
                    line_to_write = convert_binary_literals(line_to_write)
                context_string_writer.writelines(line_to_write)
            # Single-line cleanup completed
            generated_context = context_string_writer.getvalue()
            # Apply the multi-line cleanup
            if context_options.should_strip_initializer_trailing_commas or context_options.should_replace_enums_in_initializers:
                if context_options.should_strip_initializer_trailing_commas:
                    generated_context = strip_initializer_trailing_commas(generated_context)
                if context_options.should_replace_enums_in_initializers:
                    generated_context = replace_enums_with_numeric_values(generated_context)
            return generated_context
#endregion
#region Main
def main():
    # Build the argument parser
    parser = argparse.ArgumentParser(prog="Decomp Context", description="Wrapper around pcpp that can create a context file which can be used for decompilation", add_help=False)
    parser.add_argument("c_file", nargs="?", help="File from which to create context")
    parser.add_argument("-h", "-help", "--help", dest="help", action="store_true")
    parser.add_argument("-n64", "--n64-sdk", dest="n64_sdk", help="Path to the N64 SDK top level directory", action="store")
    parser.add_argument('-D', dest='defines', metavar='macro[=val]', nargs=1, action='append', help='Predefine name as a macro [with value]')
    parser.add_argument("--strip-declspec", dest="strip_declspec", help="If __declspec() strings should be stripped", action="store_true", default=False)
    parser.add_argument("--strip-attributes", dest="strip_attributes", help="If __attribute__(()) strings should be stripped", action="store_true", default=False)
    parser.add_argument("--strip-at-address", dest="strip_at_address", help="If AT_ADDRESS or ':' formatted strings should be stripped", action="store_true", default=False)
    parser.add_argument("--strip-initializer_trailing_commas", dest="strip_initializer_trailing_commas", help="If trailing commas in initializers should be stripped", action="store_true", default=False)
    parser.add_argument("--convert-binary-literals", dest="convert_binary_literals", help="If binary literals (0bxxxx) should be converted to decimal", action="store_true", default=False)
    parser.add_argument("--replace-enums-in-initializers", dest="replace_enums_in_initializers", help="If enums should be replaced by their numeric values in initializers", action="store_true", default=False)
    parser.add_argument("--clipboard", dest="copy_to_clipboard", help="If the context should be copied to the clipboard", action="store_true", default=False)
    # For the output path, we either want to be explicit or relative, but not both
    output_target_group = parser.add_mutually_exclusive_group()
    output_target_group.add_argument("-o", dest="output_path", help="Explicit path to output the context file to", action="store")
    output_target_group.add_argument("-r", "--relative", dest="relative", help="Generate context relative to the source file", action="store_true")
    # When targeting a specific platform we only want to do one thing or the other
    platform_target_group = parser.add_mutually_exclusive_group()
    platform_target_group.add_argument("--m2c", dest="m2c", help="Generates an m2c-friendly file", action="store_true")
    platform_target_group.add_argument("--ghidra", dest="ghidra", help="Generates a Ghidra-friendly file", action="store_true")
    # Parse the known arguments
    parsed_args = parser.parse_known_args()
    known_args = parsed_args[0]
    preprocessor_arguments = ['pcpp']
    if known_args.help:
        # Since this script acts as a wrapper for the main pcpp script
        # we want to manually display the help and pass it through to the
        # pcpp preprocessor to show its full list of arguments
        parser.print_help()
        preprocessor_arguments.append("--help")
        CmdPreprocessor(preprocessor_arguments)
        return
    # Append the default include directories
    include_directories: typing.List[str] = []
    include_directories.extend(default_include_directories)
    n64_sdk = get_n64_sdk(known_args.n64_sdk)
    if n64_sdk:
        include_directories.append(n64_sdk)
    for include_directory in include_directories:
        preprocessor_arguments.extend(("-I", include_directory))
    # Check if we have any passed-in defines
    include_defines = []
    known_defines: typing.List[str] = []
    if known_args.defines:
        argument_defines = [x[0] for x in known_args.defines]
        for define in argument_defines:
            include_defines.append(define)
            known_defines.append(define.split("=")[0])
    if not known_args.c_file:
        # If no file is specified it is assumed we want to create a mega context
        # file that is the aggregate of all include files
        include_files: typing.Set[str] = set()
        for include_directory in default_include_directories:
            files = [y for x in os.walk(include_directory) for y in glob(os.path.join(x[0], '*.h'))]
            for include_file in files:
                include_files.add(include_file)
        # Add each file as an input so that pcpp can parse them into a single output file
        # Sort the files for some consistency
        sorted_files = list(include_files)
        sorted_files.sort()
        for include_file in sorted_files:
            preprocessor_arguments.append(include_file)
    else:
        # Add the file we want to read
        c_file = known_args.c_file
        preprocessor_arguments.append(known_args.c_file)
    # Add in the default defines unless explicitly passed in as arguments
    for default_define, default_define_value in default_defines.items():
        if default_define in known_defines:
            continue
        define_str: str = default_define + "=" + default_define_value
        include_defines.append(define_str)
    # Add the defines to the arguments
    for define in include_defines:
        preprocessor_arguments.extend(("-D", define))
    # If not targeting Ghidra or m2c we can include more in the output
    if not known_args.ghidra and not known_args.m2c:
        preprocessor_arguments.append("--passthru-defines")
    else:
        # Don't include the line directives if targeting Ghidra/m2c
        preprocessor_arguments.append("--line-directive")
    # For debugging purposes, include unfound includes in the output to mark errors
    preprocessor_arguments.append("--passthru-unfound-includes")
    # Compress to minimize whitespace
    preprocessor_arguments.append("--compress")
    # Add unknown arguments and pass them through to pcpp
    pass_through_args = parsed_args[1]
    preprocessor_arguments.extend(pass_through_args)
    # Check if we need to do further conversions after the file is preprocessed
    context_options = ContextGenerationOptions()
    context_options.should_strip_declspec = known_args.strip_declspec or known_args.ghidra or known_args.m2c
    context_options.should_strip_at_address = known_args.strip_at_address or known_args.ghidra or known_args.m2c
    context_options.should_strip_attributes = known_args.strip_attributes or known_args.m2c
    context_options.should_convert_binary_literals = known_args.convert_binary_literals or known_args.ghidra
    context_options.should_strip_initializer_trailing_commas = known_args.strip_initializer_trailing_commas or known_args.ghidra
    context_options.should_replace_enums_in_initializers = known_args.replace_enums_in_initializers or known_args.ghidra
    # Generate the context
    generated_context = generate_context(preprocessor_arguments, context_options)
    if generated_context is None:
        print("No context was generated")
        return
    # Determine the file to write to
    target_file_name = None
    if known_args.output_path:
        target_file_name = known_args.output_path
    elif known_args.relative:
        target_file_name = f"{c_file}.ctx"
    else:
        target_file_name = os.path.join(os.getcwd(), default_output_filename)
    # Write the generated context to the file
    with open(target_file_name, "w", encoding="utf-8", newline="\n") as file_writer:
        file_writer.write(generated_context)
    # Check if we also want to copy to the clipboard
    if known_args.copy_to_clipboard:
        if pyperclip:
            pyperclip.copy(generated_context)
        else:
            print("pyperclip is not installed; skipping clipboard copy")
#endregion
if __name__ == "__main__":
    main()