# decompctx.py
# This script leaves most of the heavy lifting to pcpp, which handles the preprocessing and expansion of files:
# https://github.com/ned14/pcpp
# To use it, make sure you run 'pip install pcpp'
#
# This script also optionally uses pyperclip to conveniently copy the context to the clipboard:
# https://github.com/asweigart/pyperclip
# Install via `pip install pyperclip`
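#
# Example invocations (hypothetical paths; adjust to your project layout):
#   python decompctx.py src/boot/boot_main.c --m2c --relative
#     writes src/boot/boot_main.c.ctx next to the source file, ready to paste into m2c.
#   python decompctx.py
#     with no input file aggregates every header found under the default src/ and include/
#     directories into a single ctx.h in the current working directory.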
import os
import re
import typing
import argparse
from glob import glob
from io import StringIO
from pcpp import Preprocessor
from pcpp import CmdPreprocessor
from contextlib import redirect_stdout
# pyperclip is only needed for --clipboard, so treat it as optional (per the note above)
try:
    import pyperclip
except ImportError:
    pyperclip = None
#region Context Options
class ContextGenerationOptions:
    should_strip_declspec = False
    should_strip_attributes = False
    should_strip_at_address = False
    should_convert_binary_literals = False
    should_replace_enums_in_initializers = False
    should_strip_initializer_trailing_commas = False
#endregion
#region Regex Patterns
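# Matches variable declarations that pin a symbol to a fixed address, e.g. "u8 foo AT_ADDRESS(0x80000000);"
# or the ": 0x..." form; group 1 captures the AT_ADDRESS/':' suffix so it can be stripped.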
at_address_pattern = re.compile(r"(?:.*?)(?:[a-zA-Z_$][\w$]*\s*\*?\s[a-zA-Z_$][\w$\[\]]*)\s*((?:AT_ADDRESS|:)(?:\s*\(?\s*)(0x[0-9a-fA-F]+|[a-zA-Z_$][\w$]*)\)?);")
attribute_pattern = re.compile(r"(__attribute__)")
declspec_pattern = re.compile(r"(__declspec)")
binary_literal_pattern = re.compile(r"\b(0b[01]+)\b")
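# Matches brace-initialized definitions ending in "};" and captures a trailing comma (if any)
# before the closing brace in group 1 so it can be removed.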
trailing_initializer_pattern = re.compile(r"^.*?=\s*\{(?:.|\s)+?(,)?\s*(?:\/\/.*?|\/\*.*?\*\/)*\s*?\}\s*;", re.MULTILINE)
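# The next three patterns work together: find array declarations sized by an enumerator,
# pull apart enum declarations into their members, and split each member into its name and
# (optional) assigned value so the enumerator can be replaced with a plain number.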
enum_array_size_initializer_pattern = re.compile(r"\[\s*([a-zA-Z_$][\w$]*)\s*\]\s*;")
enum_declaration_pattern = re.compile(r"^.*(?:typedef\s+)*enum\s(?:[a-zA-Z_$][\w$]*)*\s*\{\s*((?:.|\s)*?)\}\s*(?:[a-zA-Z_$][\w$]*)*\s*;", re.MULTILINE)
enum_value_pattern = re.compile(r"([a-zA-Z_$][\w$]*)\s*(?:=\s*(.*))*")
word_pattern = re.compile(r"\b([a-zA-Z_][\w]*)\b")
white_space_pattern = re.compile(r"\s+")
cast_patterns = re.compile(r"\(int\)")
#endregion
#region Defaults
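# Macros the target sources typically expect (Metrowerks compiler and N64 SDK/F3DEX checks);
# each one is skipped if the caller already passes its own -D for the same name.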
default_defines: typing.Dict[str, str] = {"__MWERKS__" : "1", "_LANGUAGE_C": "1", "F3DEX_GBI_2": "1"}
src_dir = "src"
include_dir = "include"
cwd_dir = os.getcwd()
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.abspath(os.path.join(script_dir, ".."))
default_include_directories: typing.List[str] = [
    os.path.join(root_dir, src_dir),
    os.path.join(root_dir, include_dir),
    os.path.join(script_dir, src_dir),
    os.path.join(script_dir, include_dir),
    os.path.join(cwd_dir, src_dir),
    os.path.join(cwd_dir, include_dir),
]
default_output_filename = "ctx.h"
#endregion
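# When no SDK path is passed on the command line, the N64_SDK environment variable is used;
# only the SDK root needs to be supplied, e.g. N64_SDK=/opt/n64-sdk (hypothetical path) resolves
# to /opt/n64-sdk/ultra/usr/include.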
#region N64 SDK
def get_n64_sdk(sdk_argument: str) -> str:
    if sdk_argument:
        return sdk_argument
    # No SDK path provided. Try to use the default from the environment
    sdk_argument = os.environ.get('N64_SDK')
    if not sdk_argument:
        return None
    # Since we don't want the user to have to type the full path, all they need
    # to provide is the top-level folder for the SDK
    sdk_argument = os.path.join(sdk_argument, "ultra/usr/include")
    return sdk_argument
#endregion
#region Attribute Stripping
def strip_attributes(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    attribute_matches = reversed(list(re.finditer(attribute_pattern, text_to_strip)))
    for attribute_match in attribute_matches:
        # Find the index just past the closing double parenthesis
        paren_count = 0
        match_span = attribute_match.span(0)
        end_index = match_span[1]
        attribute_opened = False
        while end_index < len(text_to_strip):
            if text_to_strip[end_index] == "(":
                paren_count += 1
                if paren_count == 2:
                    attribute_opened = True
            if text_to_strip[end_index] == ")":
                paren_count -= 1
                if attribute_opened and paren_count == 0:
                    end_index += 1
                    break
            end_index += 1
        # Create the substring without the __attribute__((...)) text
        start_index = match_span[0]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + postfix
    return text_to_strip
#endregion
#region declspec Stripping
def strip_declspec(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    declspec_matches = reversed(list(re.finditer(declspec_pattern, text_to_strip)))
    for declspec_match in declspec_matches:
        # Find the index just past the closing parenthesis
        paren_count = 0
        match_span = declspec_match.span(0)
        end_index = match_span[1]
        declspec_opened = False
        while end_index < len(text_to_strip):
            if text_to_strip[end_index] == "(":
                paren_count += 1
                if paren_count == 1:
                    declspec_opened = True
            if text_to_strip[end_index] == ")":
                paren_count -= 1
                if declspec_opened and paren_count == 0:
                    end_index += 1
                    break
            end_index += 1
        # Create the substring without the __declspec(...) text
        start_index = match_span[0]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + postfix
    return text_to_strip
#endregion
#region At Address Stripping
def strip_at_address(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    at_address_matches = reversed(list(re.finditer(at_address_pattern, text_to_strip)))
    for at_address_match in at_address_matches:
        # Create the substring
        match_span = at_address_match.span(1)
        start_index = match_span[0]
        end_index = match_span[1]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + postfix
    return text_to_strip
#endregion
#region Binary Literal Conversion
def convert_binary_literals(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    binary_literal_matches = reversed(list(re.finditer(binary_literal_pattern, text_to_strip)))
    for binary_literal_match in binary_literal_matches:
        # Create the substring
        match_span = binary_literal_match.span(1)
        start_index = match_span[0]
        end_index = match_span[1]
        # Convert from binary literal format to a regular int
        binary_converted = int(text_to_strip[start_index:end_index], 2)
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + str(binary_converted) + postfix
    return text_to_strip
#endregion
#region Strip Trailing Commas
def strip_initializer_trailing_commas(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    trailing_comma_matches = reversed(list(re.finditer(trailing_initializer_pattern, text_to_strip)))
    for comma_match in trailing_comma_matches:
        # Skip initializers that have no trailing comma to remove
        if not comma_match[1]:
            continue
        # Create the substring
        match_span = comma_match.span(1)
        start_index = match_span[0]
        end_index = match_span[1]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + postfix
    return text_to_strip
#endregion
#region Enums
def replace_enums_with_numeric_values(text_to_strip: str) -> str:
    if not text_to_strip:
        return text_to_strip
    # Check if there are any uses of enums to size arrays
    enum_array_size_initializer_matches = list(re.finditer(enum_array_size_initializer_pattern, text_to_strip))
    if len(enum_array_size_initializer_matches) == 0:
        # None found, so there is no need to evaluate the enums
        return text_to_strip
    # We need to replace enums, but to do so we need to gather all of the enum values from the context so far
    enum_declarations = list(re.finditer(enum_declaration_pattern, text_to_strip))
    if len(enum_declarations) == 0:
        return text_to_strip
    preprocessor = Preprocessor()
    enum_to_numeric_dict: typing.Dict[str, int] = {}
    for enum_declaration in enum_declarations:
        enum_members = enum_declaration[1]
        split_enum_members = enum_members.split(",")
        enum_numeric_value = 0
        for split_member in split_enum_members:
            split_member = re.sub(white_space_pattern, "", split_member)
            if not split_member or split_member.isspace():
                continue
            enum_value_match = re.match(enum_value_pattern, split_member)
            enum_member_name = enum_value_match[1]
            # Does the enum member have an explicit value assigned?
            if enum_value_match[2]:
                assigned_value = enum_value_match[2]
                try:
                    # Replace usages of other enums with their numeric values
                    numeric_expression = enum_value_match[2]
                    # Remove casts
                    numeric_expression = re.sub(cast_patterns, "", numeric_expression)
                    # Replace enum names with numerical values
                    for word_match in reversed(list(re.finditer(word_pattern, numeric_expression))):
                        word = word_match[1]
                        if word not in enum_to_numeric_dict:
                            continue
                        word_span = word_match.span(1)
                        numeric_expression = numeric_expression[0:word_span[0]] + str(enum_to_numeric_dict[word]) + numeric_expression[word_span[1]:len(numeric_expression)]
                    # Try to evaluate the expression
                    tokens = preprocessor.tokenize(numeric_expression)
                    evaluation = preprocessor.evalexpr(tokens)
                    assigned_value = evaluation[0]
                except Exception as e:
                    # Can't parse. Might be another enum
                    print(e)
                # Convert to int
                enum_numeric_value = int(assigned_value)
            # Record the value
            enum_to_numeric_dict[enum_member_name] = enum_numeric_value
            # By default the next enum member increases by 1
            enum_numeric_value += 1
    # With the enum map built we can now replace the usages with the numeric values
    enum_array_size_initializer_matches_reversed = reversed(enum_array_size_initializer_matches)
    for array_size_initializer_match in enum_array_size_initializer_matches_reversed:
        # Does this use a known enum?
        enum_name = array_size_initializer_match[1]
        if enum_name not in enum_to_numeric_dict:
            continue
        enum_numeric_value = enum_to_numeric_dict[enum_name]
        # Create the substring
        match_span = array_size_initializer_match.span(1)
        start_index = match_span[0]
        end_index = match_span[1]
        prefix = text_to_strip[0:start_index]
        postfix = text_to_strip[end_index:len(text_to_strip)]
        text_to_strip = prefix + str(enum_numeric_value) + postfix
    return text_to_strip
#endregion
#region Preprocessing
def generate_context(preprocessor_arguments: typing.List[str], context_options: ContextGenerationOptions) -> str:
    # Create the temp string writer to pass to the preprocessor since we still want to modify
    # the contents for project-specific conditions
    with StringIO() as preprocessor_string_writer:
        with redirect_stdout(preprocessor_string_writer):
            # Parse the target file:
            CmdPreprocessor(preprocessor_arguments)
        # Check if empty
        string_writer_position = preprocessor_string_writer.tell()
        if string_writer_position == 0:
            return None
        # Do we need to sanitize this further?
        if (not context_options.should_strip_declspec
                and not context_options.should_strip_attributes
                and not context_options.should_strip_at_address
                and not context_options.should_strip_initializer_trailing_commas
                and not context_options.should_convert_binary_literals
                and not context_options.should_replace_enums_in_initializers):
            # No sanitization needed, so write the entire file out
            return preprocessor_string_writer.getvalue()
        # Sanitize/change the file depending on the context options
        with StringIO() as context_string_writer:
            # Sanitize line by line for easier parsing
            preprocessor_string_writer.seek(0)
            while True:
                line_to_write = preprocessor_string_writer.readline()
                if not line_to_write:
                    break
                if context_options.should_strip_declspec:
                    line_to_write = strip_declspec(line_to_write)
                if context_options.should_strip_attributes:
                    line_to_write = strip_attributes(line_to_write)
                if context_options.should_strip_at_address:
                    line_to_write = strip_at_address(line_to_write)
                if context_options.should_convert_binary_literals:
                    line_to_write = convert_binary_literals(line_to_write)
                context_string_writer.writelines(line_to_write)
            # Single-line cleanup completed
            generated_context = context_string_writer.getvalue()
            # Apply the multi-line cleanup
            if context_options.should_strip_initializer_trailing_commas or context_options.should_replace_enums_in_initializers:
                if context_options.should_strip_initializer_trailing_commas:
                    generated_context = strip_initializer_trailing_commas(generated_context)
                if context_options.should_replace_enums_in_initializers:
                    generated_context = replace_enums_with_numeric_values(generated_context)
            return generated_context
#endregion
#region Main
def main():
    # Build the argument parser
    parser = argparse.ArgumentParser(prog="Decomp Context", description="Wrapper around pcpp that can create a context file which can be used for decompilation", add_help=False)
    parser.add_argument("c_file", nargs="?", help="File from which to create context")
    parser.add_argument("-h", "-help", "--help", dest="help", action="store_true")
    parser.add_argument("-n64", "--n64-sdk", dest="n64_sdk", help="Path to the N64 SDK top level directory", action="store")
    parser.add_argument('-D', dest='defines', metavar='macro[=val]', nargs=1, action='append', help='Predefine name as a macro [with value]')
    parser.add_argument("--strip-declspec", dest="strip_declspec", help="If __declspec() strings should be stripped", action="store_true", default=False)
    parser.add_argument("--strip-attributes", dest="strip_attributes", help="If __attribute__(()) strings should be stripped", action="store_true", default=False)
    parser.add_argument("--strip-at-address", dest="strip_at_address", help="If AT_ADDRESS or ':' formatted strings should be stripped", action="store_true", default=False)
    parser.add_argument("--strip-initializer_trailing_commas", dest="strip_initializer_trailing_commas", help="If trailing commas in initializers should be stripped", action="store_true", default=False)
    parser.add_argument("--convert-binary-literals", dest="convert_binary_literals", help="If binary literals (0bxxxx) should be converted to decimal", action="store_true", default=False)
    parser.add_argument("--replace-enums-in-initializers", dest="replace_enums_in_initializers", help="If enums should be replaced by their numeric values in initializers", action="store_true", default=False)
    parser.add_argument("--clipboard", dest="copy_to_clipboard", help="If the context should be copied to the clipboard", action="store_true", default=False)
    # For the output path, we either want to be explicit or relative, but not both
    output_target_group = parser.add_mutually_exclusive_group()
    output_target_group.add_argument("-o", dest="output_path", help="Explicit path to output the context file to", action="store")
    output_target_group.add_argument("-r", "--relative", dest="relative", help="Generate context relative to the source file", action="store_true")
    # When targeting a specific platform we only want to do one thing or the other
    platform_target_group = parser.add_mutually_exclusive_group()
    platform_target_group.add_argument("--m2c", dest="m2c", help="Generates an m2c-friendly file", action="store_true")
    platform_target_group.add_argument("--ghidra", dest="ghidra", help="Generates a Ghidra-friendly file", action="store_true")
    # Parse the known arguments
    parsed_args = parser.parse_known_args()
    known_args = parsed_args[0]
    preprocessor_arguments = ['pcpp']
    if known_args.help:
        # Since this script acts as a wrapper for the main pcpp script
        # we want to manually display the help and pass it through to the
        # pcpp preprocessor to show its full list of arguments
        parser.print_help()
        preprocessor_arguments.append("--help")
        CmdPreprocessor(preprocessor_arguments)
        return
    # Append the default include directories
    include_directories: typing.List[str] = []
    include_directories.extend(default_include_directories)
    n64_sdk = get_n64_sdk(known_args.n64_sdk)
    if n64_sdk:
        include_directories.append(n64_sdk)
    for include_directory in include_directories:
        preprocessor_arguments.extend(("-I", include_directory))
    # Check if we have any passed-in defines
    include_defines = []
    known_defines: typing.List[str] = []
    if known_args.defines:
        argument_defines = [x[0] for x in known_args.defines]
        for define in argument_defines:
            include_defines.append(define)
            known_defines.append(define.split("=")[0])
    if not known_args.c_file:
        # If no file is specified it is assumed we want to create a mega context
        # file that is the aggregate of all include files
        include_files: typing.Set[str] = set()
        for include_directory in default_include_directories:
            files = [y for x in os.walk(include_directory) for y in glob(os.path.join(x[0], '*.h'))]
            for include_file in files:
                include_files.add(include_file)
        # Add each file as an input so that pcpp can parse them into a single output file
        # Sort the files for some consistency
        sorted_files = list(include_files)
        sorted_files.sort()
        for include_file in sorted_files:
            preprocessor_arguments.append(include_file)
    else:
        # Add the file we want to read
        c_file = known_args.c_file
        preprocessor_arguments.append(known_args.c_file)
    # Add in the default defines unless explicitly passed in as arguments
    for default_define, default_define_value in default_defines.items():
        if default_define in known_defines:
            continue
        define_str: str = default_define + "=" + default_define_value
        include_defines.append(define_str)
    # Add the defines to the arguments
    for define in include_defines:
        preprocessor_arguments.extend(("-D", define))
    # If not targeting Ghidra or m2c we can include more in the output
    if not known_args.ghidra and not known_args.m2c:
        preprocessor_arguments.append("--passthru-defines")
    else:
        # Don't include the line directives if targeting Ghidra/m2c
        preprocessor_arguments.append("--line-directive")
    # For debugging purposes, include unfound includes in the output to mark errors
    preprocessor_arguments.append("--passthru-unfound-includes")
    # Compress to minimize whitespace
    preprocessor_arguments.append("--compress")
    # Add unknown arguments and pass them through to pcpp
    pass_through_args = parsed_args[1]
    preprocessor_arguments.extend(pass_through_args)
    # Check if we need to do further conversions after the file is preprocessed
    context_options = ContextGenerationOptions()
    context_options.should_strip_declspec = known_args.strip_declspec or known_args.ghidra or known_args.m2c
    context_options.should_strip_at_address = known_args.strip_at_address or known_args.ghidra or known_args.m2c
    context_options.should_strip_attributes = known_args.strip_attributes or known_args.m2c
    context_options.should_convert_binary_literals = known_args.convert_binary_literals or known_args.ghidra
    context_options.should_strip_initializer_trailing_commas = known_args.strip_initializer_trailing_commas or known_args.ghidra
    context_options.should_replace_enums_in_initializers = known_args.replace_enums_in_initializers or known_args.ghidra
    # Generate the context
    generated_context = generate_context(preprocessor_arguments, context_options)
    if generated_context is None:
        print("No context was generated")
        return
    # Determine the file to write to
    target_file_name = None
    if known_args.output_path:
        target_file_name = known_args.output_path
    elif known_args.relative:
        target_file_name = f"{c_file}.ctx"
    else:
        target_file_name = os.path.join(os.getcwd(), default_output_filename)
    # Write the generated context to the file
    with open(target_file_name, "w", encoding="utf-8", newline="\n") as file_writer:
        file_writer.write(generated_context)
    # Check if we also want to copy to the clipboard
    if known_args.copy_to_clipboard:
        if pyperclip:
            pyperclip.copy(generated_context)
        else:
            print("pyperclip is not installed; skipping clipboard copy")
#endregion
if __name__ == "__main__":
    main()