diff --git a/CHANGELOG.md b/CHANGELOG.md index a5d3b653..db06a292 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,46 @@ # splat Release Notes +### Unreleased + +* Add support for Win32 PE binaries (x86 PE32 and x86_64 PE32+). + * New `platform: win32` option backed by a self-contained PE parser + (every populated data directory + COFF symtab + x64 SEH unwind + info + the .NET CLR header) and a Capstone-based x86 / x86_64 + disassembler. Optional dependency group `win32` pulls in + `capstone>=5.0.0`. + * New segtypes under `segtypes/win32/`: `header` (structured PE + header byte-by-byte dump + human-readable summary block), + `text` / `asm` (Capstone disasm with GAS-compatible operand + rewrites), `data` / `rodata` (heuristic string + pointer + detection, NUL-run collapse), `bss` (NOLOAD reservation), `bin` + (opaque blob for `.reloc` / `.rsrc` / signature / COFF symtab), + `pdata` (PE32+ RUNTIME_FUNCTION rows with optional decoded + UNWIND_INFO opcode lists). + * New compiler tags: `MSVC2..14`, `MINGW`, `CLANG_LLD`. All share + the same MASM-style asm conventions; distinct names preserve + provenance of generated configs. + * `create_config` auto-detects PE files (MZ + PE magic), generates + a YAML + symbol_addrs.txt with named symbols for the entrypoint, + exports (incl. forwarders as comments), eager + delay imports, + TLS callbacks, SafeSEH handlers, /guard:cf targets, /GS security + cookie, .NET CLR metadata pointers, and unwind RVAs. + * `auto_link_sections` default is `[]` for `platform: win32` (PE + sections are independent subsegments — implicit MIPS-style + sibling generation produces phantom linker entries otherwise). + * New `python -m splat.scripts.win32_reassemble my.exe.yaml` script: + runs `as` + `ld` + `objcopy` against the splat-generated + layout to reconstruct a PE. With `exact_encoding: true` on + text/data/pdata subsegments, the reassembled PE is + byte-identical to the original.
Verified end-to-end on + 5 real-world binaries: Sysinternals PsExec (PE32) + PsExec64 + (PE32+), PuTTY 0.60 (vintage MSVC6), PuTTY 0.70 32-bit (MSVC14 + with `.00cfg` CFG section), PuTTY 0.83 64-bit (MSVC14 PE32+ + with 2410 RUNTIME_FUNCTION entries). + * Tests: 199 unit tests covering the PE parser, label generation + helpers, segtype emission, header rendering, and string + detectors; 10 end-to-end tests covering split + reassemble + + GAS-clean assembly on both PE32 and PE32+ synthetic fixtures. + ### 0.40.1 * Always write the link dependency file. diff --git a/README.md b/README.md index 2ec129f1..d007d97b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A binary splitting tool to assist with decompilation and modding projects -Currently, only N64, PSX, PS2 and PSP binaries are supported. More platforms may come in the future. +Currently N64, PSX, PS2, PSP and Win32 PE (x86 / x86_64) binaries are supported. More platforms may come in the future. Please check out the [wiki](https://github.com/ethteck/splat/wiki) for more information including [examples](https://github.com/ethteck/splat/wiki/Examples) of projects that use splat. @@ -27,8 +27,23 @@ splat64[mips]>=0.40.1,<1.0.0 ### Optional dependencies - `mips`: Required when using the N64, PSX, PS2 or PSP platforms. +- `win32`: Required when using the Win32 PE platform (pulls in Capstone for x86 / x86_64 disassembly). - `dev`: Installs all the available dependencies groups and other packages for development. ### Gamecube / Wii For Gamecube / Wii projects, see [decomp-toolkit](https://github.com/encounter/decomp-toolkit)! + +### Win32 PE support + +The `win32` platform handles PE32 (x86) and PE32+ (x86_64) binaries built by MSVC 4.x-14.x, MinGW (libgcc-linked), and Clang-LLD. 
Decoded directories include exports, imports, delay imports, bound imports, resources, exception/SEH tables (with unwind-info opcode lists), TLS, /GS + /SAFESEH + /guard:cf load-config, base relocations, debug (CodeView PDB GUID/age extraction), the CLR runtime header (.NET assemblies), and the deprecated COFF symbol table. + +Workflow: + +```bash +python -m splat.scripts.create_config my.exe # auto-generate YAML + symbol_addrs.txt +python -m splat split my.exe.yaml # produce GAS-clean .s + linker script +python -m splat.scripts.win32_reassemble my.exe.yaml # link bytes back into a PE +``` + +With `exact_encoding: true` on the text/data/pdata subsegments the reassembled PE is byte-identical to the original. diff --git a/pyproject.toml b/pyproject.toml index e04cdd70..d2789364 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,8 +26,12 @@ mips = [ "n64img>=0.3.3", "crunch64>=0.5.1,<1.0.0", ] +win32 = [ + "capstone>=5.0.0", +] dev = [ "splat64[mips]", + "splat64[win32]", "ruff", "mypy", "types-PyYAML", diff --git a/src/splat/disassembler/capstone_disassembler.py b/src/splat/disassembler/capstone_disassembler.py new file mode 100644 index 00000000..42043bae --- /dev/null +++ b/src/splat/disassembler/capstone_disassembler.py @@ -0,0 +1,109 @@ +"""Capstone-backed disassembler used by the win32 platform. + +The MIPS disassembler stack (spimdisasm/rabbitizer) is incompatible with x86, +so win32 segments do not flow through `CommonSegCodeSubsegment`. This module +exposes a tiny façade: configure a Capstone engine once, hand it out to +segtypes for them to decode byte ranges, and surface known symbol type names. +""" + +from typing import Optional, Set + +from .
import disassembler +from ..util import log + + +class CapstoneDisassembler(disassembler.Disassembler): + CAPSTONE_MIN = (5, 0, 0) + + def __init__(self): + self._md = None + + def configure(self): + # Defer engine creation to `get_engine()` — at this point in startup + # the target hasn't been parsed yet, so we don't yet know whether + # it's PE32 (CS_MODE_32) or PE32+ (CS_MODE_64). + try: + import capstone # noqa: F401 — just verify availability + except ImportError: + log.error( + "The win32 platform requires the optional 'capstone' dependency. " + "Install it with: pip install 'splat64[win32]'" + ) + + def check_version(self, skip_version_check: bool, splat_version: str): + try: + import capstone + except ImportError: + log.error( + "The win32 platform requires the optional 'capstone' dependency. " + "Install it with: pip install 'splat64[win32]'" + ) + + if not skip_version_check: + cs_version = getattr(capstone, "__version__", None) + if cs_version is not None: + parts = [] + for chunk in cs_version.split("."): + digits = "".join(c for c in chunk if c.isdigit()) + parts.append(int(digits) if digits else 0) + while len(parts) < 3: + parts.append(0) + if tuple(parts[:3]) < self.CAPSTONE_MIN: + log.error( + f"splat {splat_version} requires at least capstone " + f"{self.CAPSTONE_MIN}, but {cs_version} is installed" + ) + log.write( + f"splat {splat_version} (powered by capstone {cs_version or '?'})" + ) + + def get_engine(self): + if self._md is not None: + return self._md + + import capstone + from ..platforms import win32 as win32_platform + + arch = capstone.CS_ARCH_X86 + # Honour the parsed PE's bitness when the platform module has been + # initialized; otherwise default to PE32 (32-bit). 
+ if win32_platform.info.is_pe32_plus: + mode = capstone.CS_MODE_64 + else: + mode = capstone.CS_MODE_32 + + md = capstone.Cs(arch, mode) + md.detail = True + md.syntax = capstone.CS_OPT_SYNTAX_INTEL + self._md = md + return md + + def known_types(self) -> Set[str]: + # Mirror the standard primitive type names that the spimdisasm + # backend exposes so symbol_addrs files written for win32 binaries + # can use the same `type:u32` / `type:asciz` vocabulary. + return { + "u8", + "u16", + "u32", + "u64", + "s8", + "s16", + "s32", + "s64", + "f32", + "f64", + "char", + "char*", + "asciz", + } + + +def get_capstone_disassembler() -> Optional["CapstoneDisassembler"]: + """Return the active CapstoneDisassembler if one is wired up, else None.""" + from . import disassembler_instance + + inst = disassembler_instance.get_instance() + if isinstance(inst, CapstoneDisassembler): + return inst + return None diff --git a/src/splat/disassembler/disassembler_instance.py b/src/splat/disassembler/disassembler_instance.py index 1745a442..523f4fc5 100644 --- a/src/splat/disassembler/disassembler_instance.py +++ b/src/splat/disassembler/disassembler_instance.py @@ -1,5 +1,6 @@ from .disassembler import Disassembler from .spimdisasm_disassembler import SpimdisasmDisassembler +from .capstone_disassembler import CapstoneDisassembler from .null_disassembler import NullDisassembler from ..util import options @@ -19,6 +20,14 @@ def create_disassembler_instance(skip_version_check: bool, splat_version: str): __instance.configure() return + if options.opts.platform == "win32": + __instance = CapstoneDisassembler() + __initialized = True + + __instance.check_version(skip_version_check, splat_version) + __instance.configure() + return + raise NotImplementedError("No disassembler for requested platform") diff --git a/src/splat/platforms/__init__.py b/src/splat/platforms/__init__.py index 173970fc..88fe1f66 100644 --- a/src/splat/platforms/__init__.py +++ b/src/splat/platforms/__init__.py @@ -2,3 
+2,4 @@ from . import ps2 as ps2 from . import psx as psx from . import psp as psp +from . import win32 as win32 diff --git a/src/splat/platforms/win32.py b/src/splat/platforms/win32.py new file mode 100644 index 00000000..15d00b63 --- /dev/null +++ b/src/splat/platforms/win32.py @@ -0,0 +1,1642 @@ +"""Win32 PE platform support. + +Parses the PE/COFF header of the target binary at `init()` time and exposes +the result via module-level globals that win32 segtypes can consult. + +The parser walks the DOS stub, COFF file header, optional header, section +table and every populated data directory (see `parse_pe`). Both PE32 (i386, +what MSVC6 emits) and PE32+ (x86_64) images are accepted; the exception / +unwind tables are decoded for PE32+ only. +""" + +from dataclasses import dataclass, field +import struct +from typing import Dict, List, Optional, Set, Tuple + +from ..util import log + + +DOS_MAGIC = b"MZ" +PE_MAGIC = b"PE\x00\x00" + +MACHINE_I386 = 0x014C +MACHINE_AMD64 = 0x8664 +MACHINE_ARM32 = 0x01C4 # ARMv7 Thumb-2 (Windows on ARM 32-bit) +MACHINE_ARM64 = 0xAA64 # AArch64 (Windows on ARM 64-bit) + +# IMAGE_OPTIONAL_HEADER.Magic — identifies which optional-header +# layout follows (PE32 has 32-bit fields for ImageBase etc., PE32+ +# has 64-bit equivalents). +OPT_MAGIC_PE32 = 0x10B +OPT_MAGIC_PE32_PLUS = 0x20B + +# IMAGE_DIRECTORY_ENTRY_* indices into pe.data_directories. +DIR_EXPORT = 0 +DIR_IMPORT = 1 +DIR_RESOURCE = 2 +DIR_EXCEPTION = 3 +DIR_CERTIFICATE = 4 +DIR_BASERELOC = 5 +DIR_DEBUG = 6 +DIR_ARCHITECTURE = 7 +DIR_GLOBALPTR = 8 +DIR_TLS = 9 +DIR_LOAD_CONFIG = 10 +DIR_BOUND_IMPORT = 11 +DIR_IAT = 12 +DIR_DELAY_IMPORT = 13 +DIR_COM_DESCRIPTOR = 14 + +# Section header flags (IMAGE_SCN_*) +SCN_CNT_CODE = 0x00000020 +SCN_CNT_INITIALIZED_DATA = 0x00000040 +SCN_CNT_UNINITIALIZED_DATA = 0x00000080 +SCN_MEM_READ = 0x40000000 +SCN_MEM_WRITE = 0x80000000 +SCN_MEM_EXECUTE = 0x20000000 + + +@dataclass +class PESection: + """One IMAGE_SECTION_HEADER entry from the PE section table.
+ + `virtual_address` is the section's RVA — its load-time location + relative to ImageBase. `raw_pointer` is the on-disk offset. The + section spans [virtual_address, virtual_address + virtual_size) + in memory and [raw_pointer, raw_pointer + raw_size) in the file; + when virtual_size > raw_size the loader zero-fills the tail.""" + + name: str + virtual_size: int + virtual_address: int # RVA + raw_size: int + raw_pointer: int # file offset + characteristics: int + + @property + def is_code(self) -> bool: + return bool(self.characteristics & (SCN_CNT_CODE | SCN_MEM_EXECUTE)) + + @property + def is_bss(self) -> bool: + return bool(self.characteristics & SCN_CNT_UNINITIALIZED_DATA) + + @property + def is_writable(self) -> bool: + return bool(self.characteristics & SCN_MEM_WRITE) + + @property + def is_readonly_data(self) -> bool: + return ( + bool(self.characteristics & SCN_CNT_INITIALIZED_DATA) + and not self.is_writable + and not self.is_code + ) + + +@dataclass +class PEExport: + """One entry in the export table (data dir 0). + + `name` is None for ordinal-only exports (the DLL exposes the + function by number rather than symbolic name). `rva` is the + in-image RVA the export points at — UNLESS `forwarder` is set, + in which case the export forwards to another DLL and rva is + interpreted by the loader as a pointer to the forwarder string.""" + + name: Optional[str] + ordinal: int + rva: int + forwarder: Optional[str] = None + + +@dataclass +class PEImport: + """One imported symbol — either eager (data dir 1, IMAGE_IMPORT_*) + or delay-loaded (data dir 13, IMAGE_DELAYLOAD_*). + + `dll` is the import source's DLL filename. Exactly one of `name` + or `ordinal` is set: `name` for the typical hint/name import, + `ordinal` (`name` is None) when the import is by ordinal index. 
+ `iat_rva` is the in-image RVA of the IAT slot the loader writes + the resolved function pointer into — call sites read through it + via `call qword ptr []` style indirect calls.""" + + dll: str + name: Optional[str] + ordinal: Optional[int] + iat_rva: int + + +@dataclass +class PEBoundImport: + """IMAGE_BOUND_IMPORT_DESCRIPTOR entry. + + `timestamp` is the DLL build timestamp the binary was bound against; + `forwarder_refs` are 0-N additional DLLs the bound-import chain + transitively pre-resolved through.""" + + dll: str + timestamp: int + forwarder_refs: List[str] = field(default_factory=list) + + +@dataclass +class CLRHeader: + """Decoded IMAGE_COR20_HEADER (data dir 14 — `.NET CLR Runtime + Header`). Identifies the binary as a .NET assembly and points at + the CLR metadata + entry-point token. + + Splat doesn't decode the metadata tables themselves (would need + the full ECMA-335 reader); this just surfaces the header fields + so the analyst sees the assembly is .NET-native and can fetch + the metadata blob from its RVA.""" + + cb_size: int # always 72 + runtime_major: int + runtime_minor: int + metadata_rva: int + metadata_size: int + flags: int + entry_point_token_or_rva: int + resources_rva: int + resources_size: int + strong_name_signature_rva: int + strong_name_signature_size: int + + +@dataclass +class UnwindInfo: + """Decoded x64 SEH IMAGE_UNWIND_INFO record (PE32+ only). + + `prolog_size` is the number of bytes the prologue occupies; codes + are a flattened list of `(offset_in_prolog, op_name, info_nibble)` + triples describing the prolog ops the unwinder will replay. 
The + `chained_function_rva` is set when the chain-info flag is on and + a subsequent RUNTIME_FUNCTION's begin/end/unwind triple follows + the codes.""" + + version: int + flags: int + prolog_size: int + frame_register: int # 0 = none; otherwise the x86_64 register index + frame_register_offset: int # nibble × 16 + codes: List[Tuple[int, str, int]] = field(default_factory=list) + chained_function_rva: Optional[int] = None + + +@dataclass +class COFFSymbol: + """One IMAGE_SYMBOL record (18 bytes) from the deprecated COFF + symbol table that vintage MSVC linkers wrote past the last raw + section. Modern toolchains rely on .pdb instead and leave the + optional header's PointerToSymbolTable zero, so this list is + typically empty on Windows 7+ era binaries.""" + + name: str + value: int # VA or RVA depending on storage class + section_number: int # 1-based, 0 = undefined, -1 = absolute, -2 = debug + sym_type: int # combined base + complex type + storage_class: int + aux_records: int # number of trailing IMAGE_AUX_SYMBOL entries + + +@dataclass +class PEResource: + """One leaf in the .rsrc tree. + + `rtype` is the resource-type ID (or a UTF-16 name for custom types). + `rid` is the per-type identifier or name. `language` is the locale id. + `rva` / `size` point at the resource's raw bytes inside the image. + """ + + rtype: object # int (standard) or str (custom-named type) + rid: object # int or str + language: int + rva: int + size: int + + +@dataclass +class PEInfo: + """Result of parsing a Win32 PE32 / PE32+ image. + + Carries every field splat needs from the DOS stub, COFF file + header, optional header, section table, and all 16 data + directories. Populated by `parse_pe(target_bytes)` and exposed + via the module-level `info` global so segtypes and the + disassembler can consult it without threading it through every + call. 
Empty / zero defaults mean 'directory not present' — there + is no separate Optional[List] for directory-derived fields.""" + + machine: int = 0 + num_sections: int = 0 + timestamp: int = 0 + characteristics: int = 0 + + is_pe32_plus: bool = False + image_base: int = 0 + entry_point_rva: int = 0 + section_alignment: int = 0 + file_alignment: int = 0 + size_of_image: int = 0 + size_of_headers: int = 0 + subsystem: int = 0 + dll_characteristics: int = 0 + size_of_stack_reserve: int = 0 + size_of_stack_commit: int = 0 + size_of_heap_reserve: int = 0 + size_of_heap_commit: int = 0 + linker_major: int = 0 + linker_minor: int = 0 + + pe_header_offset: int = 0 + sections: List[PESection] = field(default_factory=list) + + # 16 (rva, size) entries from the optional header. Populated only when + # NumberOfRvaAndSizes is large enough. + data_directories: List[Tuple[int, int]] = field(default_factory=list) + exports: List[PEExport] = field(default_factory=list) + export_dll_name: Optional[str] = None + imports: List[PEImport] = field(default_factory=list) + bound_imports: List[PEBoundImport] = field(default_factory=list) + delay_imports: List[PEImport] = field(default_factory=list) + # PE32+ / ARM: array of (begin_rva, end_rva, unwind_rva) describing + # function bounds for SEH unwinding. Empty for PE32. + runtime_functions: List[Tuple[int, int, int]] = field(default_factory=list) + # /GS security cookie VA (data dir 10 → IMAGE_LOAD_CONFIG_DIRECTORY). + security_cookie_va: int = 0 + # /SAFESEH handler RVAs (PE32 only). + safe_seh_handlers: List[int] = field(default_factory=list) + # /guard:cf — array of valid indirect-call target RVAs. + cfg_function_rvas: List[int] = field(default_factory=list) + cfg_flags: int = 0 + # RVAs the loader's base-relocation logic identifies as absolute + # pointers (HIGHLOW for PE32, DIR64 for PE32+). Useful for emitting + # data-section .long/.quad entries with symbolic targets. 
+ pointer_rvas: Set[int] = field(default_factory=set) + # Deprecated COFF symbol table at the end of the file (PE binaries + # essentially never have these populated — debug info lives in the + # external .pdb instead — but a non-zero value is informative.) + coff_symtab_ptr: int = 0 + coff_num_symbols: int = 0 + # Parsed IMAGE_SYMBOL records when coff_symtab_ptr/coff_num_symbols + # are non-zero. Empty on modern PEs. + coff_symbols: List["COFFSymbol"] = field(default_factory=list) + # PDB filename embedded in the Debug directory's CodeView record, when + # present. + pdb_path: Optional[str] = None + # GUID (RSDS) or 32-bit signature (NB10) identifying the matching PDB. + pdb_guid: Optional[str] = None + # Build age — incremented every time the PDB is updated. + pdb_age: Optional[int] = None + # TLS callback VAs gathered from data directory 9. + tls_callback_vas: List[int] = field(default_factory=list) + # Decoded IMAGE_UNWIND_INFO records, keyed by unwind RVA. + # PE32+ only; remains empty for PE32 binaries. + unwind_info: Dict[int, UnwindInfo] = field(default_factory=dict) + # Decoded IMAGE_COR20_HEADER when data dir 14 is populated (.NET). + clr_header: Optional[CLRHeader] = None + # Resources enumerated from data directory 2 (.rsrc). + resources: List[PEResource] = field(default_factory=list) + # Decoded VS_VERSIONINFO key/value pairs (CompanyName, FileVersion, + # ProductName, OriginalFilename, etc.). 
+ version_info: dict = field(default_factory=dict) + + @property + def entry_point_va(self) -> int: + return self.image_base + self.entry_point_rva + + def section_by_name(self, name: str) -> Optional[PESection]: + for s in self.sections: + if s.name == name: + return s + return None + + def rva_to_file_offset(self, rva: int) -> Optional[int]: + """Translate an RVA to its on-disk file offset, or None when + the RVA isn't backed by file bytes — either because it falls + outside every section's virtual range or because it sits in + the virtual-only tail of a section whose VirtualSize exceeds + SizeOfRawData (loader zero-fills that range; no file bytes + back it). Callers must handle None to avoid reading into a + neighbouring section's data.""" + for s in self.sections: + sec_end = s.virtual_address + max(s.virtual_size, s.raw_size) + if not (s.virtual_address <= rva < sec_end): + continue + # In a section whose VirtualSize > SizeOfRawData (MSVC zero-init + # tail or read-only constants past the file boundary), RVAs + # within the trailing virtual-only range have NO backing + # bytes — the loader zero-fills them at map time. Returning + # raw_pointer + offset for those RVAs would land in the + # next section's raw bytes. Reject instead. + offset_in_section = rva - s.virtual_address + if offset_in_section >= s.raw_size: + return None + return s.raw_pointer + offset_in_section + return None + + def va_to_file_offset(self, va: int) -> Optional[int]: + """Translate an absolute virtual address (`image_base` + RVA) to a + file offset. Convenience wrapper that subtracts `image_base` and + delegates to `rva_to_file_offset`; same None semantics.""" + return self.rva_to_file_offset(va - self.image_base) + + +# Populated by `init`, consulted by segtypes/disassembler. +info: PEInfo = PEInfo() +# Full file bytes — kept on the side so heuristics in segtypes can peek +# at arbitrary section content (e.g.
validate a candidate function start +# byte) without threading rom_bytes through every call. +raw_image: bytes = b"" + + +def resolve_exact_encoding( + yaml: object, parent: "Optional[object]", default: bool = False +) -> bool: + """Shared `exact_encoding` flag resolution used by Win32SegText / + Win32SegData / Win32SegPdata. Priority order matches what users + expect: per-subsegment YAML setting wins; if absent, fall back to + the parent code-group YAML; finally fall back to `default`. The + flag toggles label-substitution off so emitted bytes match the + original file verbatim (needed for byte-identical reassembly).""" + if isinstance(yaml, dict): + v = yaml.get("exact_encoding") + if v is not None: + return bool(v) + if parent is not None: + parent_yaml = getattr(parent, "yaml", None) + if isinstance(parent_yaml, dict): + v = parent_yaml.get("exact_encoding") + if v is not None: + return bool(v) + return default + + +def sanitize_label(s: str) -> str: + """Canonical GAS-label sanitization shared by every site that emits + labels derived from PE strings. Non-alphanumeric chars become '_'; + leading-digit identifiers (GAS-invalid) are prefixed with '_'.""" + out = "".join(c if c.isalnum() or c == "_" else "_" for c in s) + if out and out[0].isdigit(): + out = "_" + out + return out + + +def compute_iat_labels(pe: PEInfo) -> Dict[int, str]: + """Return a {slot_va: label} mapping for every IAT (eager + delay) + slot. 
Labels match what `create_win32_config` writes to + symbol_addrs.txt — including dedup-on-collision behaviour — so + disassembly references resolve to the same identifiers.""" + out: Dict[int, str] = {} + + def populate(items: List[PEImport], prefix: str) -> None: + seen: Set[str] = set() + for imp in items: + slot_va = pe.image_base + imp.iat_rva + nm = imp.name or f"ordinal_{imp.ordinal}" + safe = sanitize_label(nm) + # Empty DLL stem after sanitisation (corrupted descriptor + # with missing name) — substitute a recognisable + # placeholder so the label doesn't collapse to `imp__foo` + # for every unknown-DLL import. + dll_safe = sanitize_label(imp.dll) or "unknown" + full = f"{prefix}_{dll_safe}_{safe}" + if full in seen: + full = f"{full}__rva{imp.iat_rva:X}" + seen.add(full) + out[slot_va] = full + + populate(pe.imports, "imp") + populate(pe.delay_imports, "dimp") + return out + + +def compute_export_labels( + pe: PEInfo, reserved: Optional[Set[str]] = None +) -> Dict[int, str]: + """Return a {export_va: label} mapping for every non-forwarder + export. `reserved` is a pre-seeded set of labels already in use + (e.g. {'entrypoint'}); colliding exports get an ordinal suffix. + Matches create_win32_config's symbol_addrs emission.""" + out: Dict[int, str] = {} + seen: Set[str] = set(reserved or set()) + for exp in pe.exports: + if exp.forwarder is not None: + continue + nm = exp.name or f"export_{exp.ordinal}" + safe = sanitize_label(nm) + if safe in seen: + safe = f"{safe}__ord{exp.ordinal}" + seen.add(safe) + out[pe.image_base + exp.rva] = safe + return out + + +def parse_pe(data: bytes) -> PEInfo: + """Parse `data` as a Win32 PE32 or PE32+ image and return a populated + `PEInfo`. 
Walks the DOS stub, COFF file header, optional header, and + every populated data directory: + + 0 Export Table → `exports`, `export_dll_name` + 1 Import Table → `imports` (+ IAT slot RVAs) + 2 Resource Table → `resources`, `version_info` + 3 Exception Table → `runtime_functions` + `unwind_info` + 5 Base Relocation Table → `pointer_rvas` + 6 Debug → `pdb_path`, `pdb_guid`, `pdb_age` + 9 TLS Table → `tls_callback_vas` + 10 Load Config → `security_cookie_va`, `safe_seh_handlers`, + `cfg_function_rvas`, `cfg_flags` + 11 Bound Import → `bound_imports` + 13 Delay Import → `delay_imports` + 14 CLR Runtime Header → `clr_header` (.NET assemblies) + + Plus the deprecated COFF symbol table when the optional header + points at one (`coff_symbols`). + + Logs a fatal error and exits on structural malformations: missing + MZ/PE magics, mismatched machine/magic combinations, optional + header below the per-format minimum size, or sections that run + past EOF. Every iteration loop has a defensive cap so a fuzzed PE + can't make the parser scan past realistic bounds. Safe to call on + hand-crafted byte buffers.""" + if len(data) < 0x40 or data[:2] != DOS_MAGIC: + log.error("win32 target does not start with an MZ DOS header") + + pe_off = struct.unpack_from(" len(data) or data[pe_off : pe_off + 4] != PE_MAGIC: + log.error(f"win32 target does not contain a PE header at 0x{pe_off:X}") + + # COFF file header (20 bytes) follows the 4-byte PE signature. 
+ coff_off = pe_off + 4 + ( + machine, + num_sections, + timestamp, + coff_symtab_ptr, + coff_num_symbols, + size_of_optional_header, + characteristics, + ) = struct.unpack_from(" len(data): + log.error("win32 optional header runs past end of file") + + magic = struct.unpack_from(" opt_off + size_of_optional_header: + break + rva = struct.unpack_from(" len(data): + log.error(f"win32 section header {i} runs past end of file") + raw_name = data[sh : sh + 8] + name = raw_name.split(b"\x00", 1)[0].decode("ascii", errors="replace") + ( + virt_size, + virt_addr, + raw_size, + raw_ptr, + _ptr_relocs, + _ptr_linenums, + _num_relocs, + _num_linenums, + scn_chars, + ) = struct.unpack_from(" str: + """Read a NUL-terminated ASCII string. Truncates at `_MAX_CSTR_LEN` + to bound parse-time cost on malformed or non-NUL-terminated input.""" + if off < 0 or off >= len(data): + return "" + cap = min(off + _MAX_CSTR_LEN, len(data)) + end = data.find(b"\x00", off, cap) + if end < 0: + end = cap + return data[off:end].decode("ascii", errors="replace") + + +def parse_exports(data: bytes, pe: PEInfo) -> None: + """Populate `pe.exports` and `pe.export_dll_name` from data directory 0.""" + if not pe.data_directories: + return + exp_rva, exp_size = pe.data_directories[DIR_EXPORT] + if exp_rva == 0 or exp_size == 0: + return + exp_off = pe.rva_to_file_offset(exp_rva) + if exp_off is None or exp_off + 40 > len(data): + return + + ( + _flags, + _ts, + _vmaj, + _vmin, + name_rva, + ord_base, + num_funcs, + num_names, + funcs_rva, + names_rva, + ordinals_rva, + ) = struct.unpack_from(" len(data): + break + if ordinals_off + i * 2 + 2 > len(data): + break + name_ptr_rva = struct.unpack_from(" len(data): + break + func_rva = struct.unpack_from(" Tuple[int, str, str, int]: + """Per-bitness pointer-slot constants used by data + text segments: + (size_bytes, struct_fmt, asm_directive, hex_print_width). 
+ + PE32 uses 4-byte slots emitted as `.long 0xXXXXXXXX`; PE32+ uses + 8-byte slots emitted as `.quad 0xXXXXXXXXXXXXXXXX`.""" + if is_pe32_plus: + return 8, " Tuple[int, int, str]: + """Per-bitness thunk-array constants: (size_bytes, ordinal_flag, struct_fmt). + PE32 thunks are 32-bit DWORDs with the ordinal flag at bit 31; + PE32+ thunks are 64-bit QWORDs with the ordinal flag at bit 63. + Signature matches `ptr_layout` for symmetry — both take a bool + rather than a full PEInfo object.""" + if is_pe32_plus: + return 8, 1 << 63, " None: + """Walk a NULL-terminated array of import thunks (used by both + eager and delay-load import descriptors). Each thunk is either an + ordinal value (high-bit set) or an RVA pointing at a hint/name + record. Resolved entries are appended to `dest` as PEImport + records keyed by the corresponding IAT slot RVA.""" + for _ in range(65536): + if thunk_off + thunk_size > len(data): + break + thunk = struct.unpack_from(thunk_fmt, data, thunk_off)[0] + if thunk == 0: + break + name: Optional[str] = None + ordinal: Optional[int] = None + if thunk & ordinal_flag: + ordinal = thunk & 0xFFFF + else: + hint_off = pe.rva_to_file_offset(thunk & 0x7FFFFFFF) + if hint_off is not None and hint_off + 2 < len(data): + # Capture the 16-bit hint as a fallback ordinal when + # the name string at hint+2 is empty (stripped binary + # or hand-crafted IAT). + hint = struct.unpack_from(" None: + """Populate `pe.imports` from data directory 1 (the Import Table). + + Reads IMAGE_IMPORT_DESCRIPTOR entries until the null terminator. For + each DLL walks the Import Lookup Table (OriginalFirstThunk) — falling + back to the IAT (FirstThunk) when the ILT is absent — and records the + DLL name, imported symbol, and the IAT slot RVA so call sites that + reach the IAT can be tagged with the imported name. 
+ """ + if len(pe.data_directories) <= DIR_IMPORT: + return + imp_rva, imp_size = pe.data_directories[DIR_IMPORT] + if imp_rva == 0 or imp_size == 0: + return + imp_off = pe.rva_to_file_offset(imp_rva) + if imp_off is None: + return + + thunk_size, ordinal_flag, thunk_fmt = _thunk_layout(pe.is_pe32_plus) + + desc_off = imp_off + # Cap descriptor count at a sane maximum to bound parse cost on + # malformed binaries. + for _desc_i in range(4096): + if desc_off + 20 > len(data): + break + ilt_rva, _ts, _fwd, dll_name_rva, iat_rva = struct.unpack_from( + " None: + """Parse data directory 13 (Delay Import). MSVC `__declspec(dllimport)` + with `/DELAYLOAD` produces a separate import table for lazy resolution. + + Each IMAGE_DELAYLOAD_DESCRIPTOR is 32 bytes; entries are NULL-terminated. + `Attributes` flag bit 0 indicates RVA-based fields (v2); else they're + VAs that need ImageBase subtraction (v1, legacy).""" + if len(pe.data_directories) <= DIR_DELAY_IMPORT: + return + di_rva, di_size = pe.data_directories[DIR_DELAY_IMPORT] + if di_rva == 0 or di_size == 0: + return + di_off = pe.rva_to_file_offset(di_rva) + if di_off is None: + return + + thunk_size, ordinal_flag, thunk_fmt = _thunk_layout(pe.is_pe32_plus) + + desc_off = di_off + end = di_off + di_size + for _desc_i in range(4096): + if desc_off + 32 > end or desc_off + 32 > len(data): + break + ( + attrs, + dll_name_field, + _module_handle, + iat_field, + int_field, + _bound_iat, + _unload_iat, + _ts, + ) = struct.unpack_from(" int: + if attrs & 1: + return field + return field - pe.image_base if field >= pe.image_base else field + + dll_name_rva = to_rva(dll_name_field) + iat_rva = to_rva(iat_field) + int_rva = to_rva(int_field) if int_field else iat_rva + + dll_off = pe.rva_to_file_offset(dll_name_rva) + dll_name = _read_cstr(data, dll_off) if dll_off is not None else "?" 
+ + int_off = pe.rva_to_file_offset(int_rva) + if int_off is None or iat_rva == 0: + continue + + _walk_thunk_array( + data, + pe, + int_off, + iat_rva, + thunk_size, + thunk_fmt, + ordinal_flag, + dll_name, + pe.delay_imports, + ) + + +def parse_bound_imports(data: bytes, pe: PEInfo) -> None: + """Parse data directory 11 (Bound Import Table). + + Unlike the regular import table, bound-import entries reference DLL + names by an offset relative to the start of the bound-import directory + itself (NOT an RVA). Entries are 8-byte IMAGE_BOUND_IMPORT_DESCRIPTOR + structs terminated by an all-zero entry, optionally followed by + forwarder-ref descriptors.""" + if len(pe.data_directories) <= DIR_BOUND_IMPORT: + return + bi_rva, bi_size = pe.data_directories[DIR_BOUND_IMPORT] + if bi_rva == 0 or bi_size == 0: + return + bi_off = pe.rva_to_file_offset(bi_rva) + if bi_off is None: + return + end = bi_off + bi_size + + cur = bi_off + for _bi_i in range(4096): + if cur + 8 > end or cur + 8 > len(data): + break + ts, name_off, n_fwd = struct.unpack_from(" len(data) or cur + 8 > end: + break + _ts, fname_off, _rsv = struct.unpack_from(" None: + """Parse data directory 10 (Load Config). Extracts: + + - SecurityCookie VA (`/GS` cookie used to detect stack-buffer overruns) + - SEHandlerTable RVA + count: array of valid SEH handler RVAs the + MSVC `/SAFESEH` linker switch produced for PE32 binaries. + """ + if len(pe.data_directories) <= DIR_LOAD_CONFIG: + return + lc_rva, lc_size = pe.data_directories[DIR_LOAD_CONFIG] + if lc_rva == 0 or lc_size == 0: + return + lc_off = pe.rva_to_file_offset(lc_rva) + if lc_off is None: + return + + if pe.is_pe32_plus: + # PE32+ layout: SecurityCookie at +0x58 (QWORD); CFG fields at + # +0x70/+0x78/+0x80/+0x88/+0x90. 
+ if lc_off + 0x60 > len(data): + return + pe.security_cookie_va = struct.unpack_from(" len(data): + return + pe.security_cookie_va = struct.unpack_from(" len(data): + break + pe.safe_seh_handlers.append( + struct.unpack_from(" None: + """Read a GuardCFFunctionTable. Each entry is at least 4 bytes (RVA); + the high bits of `GuardFlags` indicate optional metadata bytes that + follow each RVA. We compute the per-entry stride and harvest only the + RVA from each slot.""" + if not table_va or not count: + return + table_off = pe.va_to_file_offset(table_va) + if table_off is None: + return + # GuardFlags bits 28..31 hold the count of extra metadata bytes per + # entry, capped to 7. Stride is 4 + extra_bytes. + extra = (pe.cfg_flags >> 28) & 0x0F + stride = 4 + min(extra, 7) + # /guard:cf tables can be large (Windows 10 ntdll has ~40k entries) + # but a megabyte of guarded functions is well beyond realistic. + for i in range(min(count, 1_000_000)): + entry_off = table_off + i * stride + if entry_off + 4 > len(data): + break + rva = struct.unpack_from(" None: + """Parse data directory 3 (Exception Table). For PE32+ this is an + array of RUNTIME_FUNCTION entries (12 bytes each) giving definitive + function boundaries — useful both for surfacing real function starts + and for navigating SEH unwind data.""" + if len(pe.data_directories) <= DIR_EXCEPTION: + return + et_rva, et_size = pe.data_directories[DIR_EXCEPTION] + if et_rva == 0 or et_size == 0: + return + et_off = pe.rva_to_file_offset(et_rva) + if et_off is None: + return + end = min(et_off + et_size, len(data)) + # Bound at ~1M RUNTIME_FUNCTION entries: more than any realistic PE. + max_entries = min((end - et_off) // 12, 1_000_000) + for i in range(max_entries): + cur = et_off + i * 12 + if cur + 12 > end: + break + begin, fin, uw = struct.unpack_from(" None: + """Populate `pe.pointer_rvas` from data directory 5 (the Base Relocation + Table). 
Each block applies to one 4 KB page; entries of type 3 + (HIGHLOW, 32-bit) or 10 (DIR64, 64-bit) mark RVAs of absolute pointers + that the PE loader needs to rebase. Padding entries (type 0) are + skipped.""" + if len(pe.data_directories) <= DIR_BASERELOC: + return + rel_rva, rel_size = pe.data_directories[DIR_BASERELOC] + if rel_rva == 0 or rel_size == 0: + return + rel_off = pe.rva_to_file_offset(rel_rva) + if rel_off is None: + return + + end = rel_off + rel_size + accept = {RELOC_TYPE_HIGHLOW, RELOC_TYPE_DIR64} + block = rel_off + while block + 8 <= end and block + 8 <= len(data): + page_rva, block_size = struct.unpack_from(" end: + break + entries_end = block + block_size + entry = block + 8 + while entry + 2 <= entries_end: + word = struct.unpack_from("> 12 + if kind == RELOC_TYPE_ABSOLUTE: + continue + if kind not in accept: + continue + pe.pointer_rvas.add(page_rva + (word & 0x0FFF)) + block += block_size + + +def parse_debug(data: bytes, pe: PEInfo) -> None: + """Populate `pe.pdb_path` from data directory 6 (Debug). 
Walks the + IMAGE_DEBUG_DIRECTORY array, looking for a CodeView (type 2) entry that + embeds either an `RSDS`/`NB10` record with a trailing PDB filename.""" + if len(pe.data_directories) <= DIR_DEBUG: + return + dbg_rva, dbg_size = pe.data_directories[DIR_DEBUG] + if dbg_rva == 0 or dbg_size == 0: + return + dbg_off = pe.rva_to_file_offset(dbg_rva) + if dbg_off is None: + return + + end = dbg_off + dbg_size + entry = dbg_off + while entry + 28 <= end and entry + 28 <= len(data): + ( + _chars, + _ts, + _vmaj, + _vmin, + entry_type, + size_of_data, + _addr_of_raw, + ptr_to_raw, + ) = struct.unpack_from(" len(data): + continue + cv = data[ptr_to_raw : ptr_to_raw + size_of_data] + magic = cv[:4] + if magic == b"RSDS" and len(cv) >= 24: + # Layout: magic(4) + GUID(16) + age(4) + name + g0, g1, g2, g3 = struct.unpack_from("= 16: + # Layout: magic(4) + offset(4) + signature(4) + age(4) + name + sig = struct.unpack_from(" None: + """Populate `pe.tls_callback_vas` from data directory 9 (TLS). + `IMAGE_TLS_DIRECTORY` for PE32 has AddressOfCallBacks at offset 0x0C + (PE32+ at 0x18). The pointer dereferences to a NULL-terminated array + of callback VAs.""" + if len(pe.data_directories) <= DIR_TLS: + return + tls_rva, tls_size = pe.data_directories[DIR_TLS] + if tls_rva == 0 or tls_size == 0: + return + tls_off = pe.rva_to_file_offset(tls_rva) + if tls_off is None: + return + + if pe.is_pe32_plus: + if tls_off + 0x20 > len(data): + return + cb_va = struct.unpack_from(" len(data): + return + cb_va = struct.unpack_from(" len(data): + break + v = struct.unpack_from(ptr_fmt, data, cb_off)[0] + if v == 0: + break + pe.tls_callback_vas.append(v) + cb_off += ptr_size + + +def linker_version_label(major: int, minor: int) -> str: + """Translate an `IMAGE_OPTIONAL_HEADER.MajorLinkerVersion` value into a + rough Visual C++ / linker product name. 
Real binaries are usually + produced by Microsoft's `link.exe`; the major number tracks the MSVC + release closely enough to surface as a hint.""" + mapping = { + 2: "MSVC 2.x", + 3: "MSVC 4.x", + 4: "MSVC 4.x", + 5: "MSVC 5.0", + 6: "MSVC 6.0", + 7: "MSVC 7.0 / VS .NET 2002", + 8: "MSVC 8.0 / VS 2005", + 9: "MSVC 9.0 / VS 2008", + 10: "MSVC 10.0 / VS 2010", + 11: "MSVC 11.0 / VS 2012", + 12: "MSVC 12.0 / VS 2013", + 14: "MSVC 14.x / VS 2015-2022", + } + return mapping.get(major, f"linker v{major}.{minor:02d}") + + +RESOURCE_TYPE_NAMES = { + 1: "CURSOR", + 2: "BITMAP", + 3: "ICON", + 4: "MENU", + 5: "DIALOG", + 6: "STRING", + 7: "FONTDIR", + 8: "FONT", + 9: "ACCELERATOR", + 10: "RCDATA", + 11: "MESSAGETABLE", + 12: "GROUP_CURSOR", + 14: "GROUP_ICON", + 16: "VERSION", + 17: "DLGINCLUDE", + 19: "PLUGPLAY", + 20: "VXD", + 21: "ANICURSOR", + 22: "ANIICON", + 23: "HTML", + 24: "MANIFEST", +} + + +def _read_resource_name(data: bytes, name_field: int, root_off: int) -> object: + """Return either the integer ID, or the decoded UTF-16 name string.""" + if name_field & 0x80000000: + name_off = root_off + (name_field & 0x7FFFFFFF) + if name_off + 2 > len(data): + return name_field & 0x7FFFFFFF + length = struct.unpack_from(" len(data): + return name_field & 0x7FFFFFFF + return data[text_off:text_end].decode("utf-16-le", errors="replace") + return name_field + + +def parse_resources(data: bytes, pe: PEInfo) -> None: + """Walk the .rsrc tree (3 nominal levels: type → name → language) and + record each leaf in `pe.resources`.""" + if len(pe.data_directories) <= DIR_RESOURCE: + return + rsrc_rva, rsrc_size = pe.data_directories[DIR_RESOURCE] + if rsrc_rva == 0 or rsrc_size == 0: + return + root_off = pe.rva_to_file_offset(rsrc_rva) + if root_off is None: + return + + def walk_dir(dir_off: int, depth: int, path: tuple) -> None: + # Bound the recursion: a valid resource tree has only 3 levels + # (type → name → language). 
Anything deeper indicates a circular + # reference or malformed data — bail out rather than recurse. + if depth > 8: + return + if dir_off + 16 > len(data): + return + ( + _chars, + _ts, + _vmaj, + _vmin, + n_named, + n_id, + ) = struct.unpack_from(" len(data): + return + name_field, data_field = struct.unpack_from(" len(data): + continue + leaf_rva, leaf_size, _cp, _rsv = struct.unpack_from( + "= 2: + rtype, rid = path[0], path[1] + else: + rtype, rid = path[0], None + pe.resources.append( + PEResource( + rtype=rtype, + rid=rid, + language=name_val if isinstance(name_val, int) else 0, + rva=leaf_rva, + size=leaf_size, + ) + ) + + walk_dir(root_off, 0, ()) + + +def _align4(off: int) -> int: + return (off + 3) & ~3 + + +def _read_wstr(blob: bytes, off: int) -> tuple: + """Read a UTF-16 NUL-terminated string starting at `off`. Returns + (decoded_string, next_offset_past_terminator).""" + end = off + while end + 1 < len(blob): + if blob[end] == 0 and blob[end + 1] == 0: + break + end += 2 + text = blob[off:end].decode("utf-16-le", errors="replace") + return text, end + 2 + + +def _walk_versioninfo_node(blob: bytes, off: int, out: dict, base_off: int) -> int: + """Walk one VS_VERSIONINFO-style node starting at `off`. Recurses into + children. Strings are recorded into `out`. Returns the offset just past + this node (already aligned).""" + if off + 6 > len(blob): + return len(blob) + w_length, w_value_length, w_type = struct.unpack_from(" len(blob): + return len(blob) + key, body_off = _read_wstr(blob, off + 6) + body_off = _align4(body_off - base_off) + base_off + + value_end = body_off + if ( + w_value_length > 0 + and body_off + (w_value_length * (2 if w_type == 1 else 1)) <= node_end + ): + if w_type == 1: # text — value is UTF-16 + value = blob[body_off : body_off + w_value_length * 2] + # Strip trailing NUL WCHAR(s) without splitting on misaligned + # zero pairs. 
+ chars = [ + value[i : i + 2] for i in range(0, len(value) - (len(value) & 1), 2) + ] + text_chars = [] + for wch in chars: + if wch == b"\x00\x00": + break + text_chars.append(wch) + value_text = b"".join(text_chars).decode("utf-16-le", errors="replace") + if key not in {"VS_VERSION_INFO", "StringFileInfo", "VarFileInfo"}: + out[key] = value_text + value_end = body_off + w_value_length * 2 + else: + # Binary value. The VarFileInfo's "Translation" child carries + # an array of (LANGID, codepage) WORD pairs as binary data — + # one entry per locale supported by the version resource. + if key == "Translation" and w_value_length >= 4: + pairs = [] + pair_off = body_off + while pair_off + 4 <= node_end: + langid, codepage = struct.unpack_from("= w_value_length: + break + # Render as a comma-separated list of `0xLLLL/0xCCCC` + # so the version_info dict stays str→str. + out["Translation"] = ", ".join( + f"0x{lid:04X}/0x{cp:04X}" for lid, cp in pairs + ) + value_end = body_off + w_value_length + value_end = _align4(value_end - base_off) + base_off + + # Recurse into children, if any space remains. + child = value_end + while child < node_end: + next_child = _walk_versioninfo_node(blob, child, out, base_off) + if next_child <= child: + break + child = _align4(next_child - base_off) + base_off + + return node_end + + +_UNWIND_OP_NAMES = { + 0: "PUSH_NONVOL", + 1: "ALLOC_LARGE", + 2: "ALLOC_SMALL", + 3: "SET_FPREG", + 4: "SAVE_NONVOL", + 5: "SAVE_NONVOL_FAR", + 6: "EPILOG", + 7: "SPARE_CODE", + 8: "SAVE_XMM128", + 9: "SAVE_XMM128_FAR", + 10: "PUSH_MACHFRAME", +} + + +def parse_clr_header(data: bytes, pe: PEInfo) -> None: + """Parse data directory 14 (CLR Runtime Header) when present. 
+ Identifies the binary as a .NET assembly and surfaces metadata / + entry-point / strong-name fields so the analyst doesn't have to + chase down the assembly's structure manually.""" + if len(pe.data_directories) <= DIR_COM_DESCRIPTOR: + return + clr_rva, clr_size = pe.data_directories[DIR_COM_DESCRIPTOR] + if clr_rva == 0 or clr_size == 0: + return + f_off = pe.rva_to_file_offset(clr_rva) + if f_off is None or f_off + 72 > len(data): + return + ( + cb_size, + rt_major, + rt_minor, + md_rva, + md_size, + flags, + entry_tok, + res_rva, + res_size, + sn_rva, + sn_size, + ) = struct.unpack_from(" None: + """Decode each PE32+ RUNTIME_FUNCTION's IMAGE_UNWIND_INFO record. + + The Microsoft x64 SEH spec lays UNWIND_INFO out as: + +0 : byte Version (low 3 bits) | Flags (high 5 bits) + +1 : byte SizeOfProlog + +2 : byte CountOfUnwindCodes + +3 : byte FrameRegister (low 4 bits) | FrameRegOffset*16 (high 4) + +4 : code[] CountOfUnwindCodes × 2 bytes + + : padding to QWORD + + : optional handler / chain-info (per flags) + + Each unwind code is a `(prolog_offset, opcode, info)` triple. We + only decode the ops + chained-record pointer; exception-handler + data isn't surfaced (rarely useful from a disassembly viewpoint). 
+ """ + if not pe.is_pe32_plus: + return + seen: Set[int] = set() + for begin, _end, raw_uw in pe.runtime_functions: + uw = raw_uw & 0x7FFFFFFF + if not uw or uw in seen: + continue + seen.add(uw) + f_off = pe.rva_to_file_offset(uw) + if f_off is None or f_off + 4 > len(data): + continue + b0, b1, n_codes, b3 = struct.unpack_from("> 3 + prolog_size = b1 + frame_register = b3 & 0x0F + frame_register_offset = (b3 >> 4) * 16 + + codes_off = f_off + 4 + codes_end = codes_off + n_codes * 2 + if codes_end > len(data): + continue + codes: List[Tuple[int, str, int]] = [] + i = 0 + while i < n_codes: + code_off = codes_off + i * 2 + offset_in_prolog = data[code_off] + packed = data[code_off + 1] + op = packed & 0x0F + info = packed >> 4 + codes.append((offset_in_prolog, _UNWIND_OP_NAMES.get(op, f"op{op}"), info)) + # Ops 1, 4, 5, 8, 9 carry extra slots — skip them so we + # don't misread the next code's prolog_offset. + extra_slots = {1: 1 + (1 if info else 0), 4: 1, 5: 2, 8: 1, 9: 2}.get(op, 0) + i += 1 + extra_slots + + chained_rva: Optional[int] = None + if flags & 0x04: # UNW_FLAG_CHAININFO + # The chained RUNTIME_FUNCTION starts at the byte + # immediately after the unwind codes, aligned to DWORD. + chain_off = (codes_end + 3) & ~3 + if chain_off + 12 <= len(data): + chained_rva = struct.unpack_from(" None: + """Parse the deprecated COFF symbol table when the optional header + points at one. Modern MSVC binaries leave PointerToSymbolTable + zero and ship debug info via PDB; this parser exists so vintage + MSVC 4-6 binaries (and some object-file-style PEs) get their + embedded symbol records surfaced as `pe.coff_symbols`. 
+ + Each IMAGE_SYMBOL record is 18 bytes: + 0 : 8 bytes Name (zero-terminated; if first 4 bytes are 0 + the next 4 bytes are a string-table offset) + 8 : 4 bytes Value + 12 : 2 bytes SectionNumber (signed: 0/-1/-2 are special) + 14 : 2 bytes Type + 16 : 1 byte StorageClass + 17 : 1 byte NumberOfAuxSymbols + + The string table immediately follows the symbol records; its + leading DWORD is its total length. + """ + if not pe.coff_symtab_ptr or not pe.coff_num_symbols: + return + base = pe.coff_symtab_ptr + n_syms = pe.coff_num_symbols + end = base + n_syms * 18 + if end > len(data): + return + str_table_off = end + + def _read_name(record_off: int) -> str: + name_bytes = data[record_off : record_off + 8] + # If the first 4 bytes are zero, the next 4 are the string- + # table offset (relative to the string table base). + if name_bytes[:4] == b"\x00\x00\x00\x00": + str_off = struct.unpack_from("= len(data): + return "" + return _read_cstr(data, abs_off) + return name_bytes.split(b"\x00", 1)[0].decode("ascii", errors="replace") + + # Cap iteration the same way other parsers do. + i = 0 + while i < min(n_syms, 1_000_000): + rec = base + i * 18 + if rec + 18 > len(data): + break + name = _read_name(rec) + value, section_number, sym_type, storage_class, aux = struct.unpack_from( + " None: + """Decode the VS_VERSIONINFO StringTable entries from every VERSION + resource (`rtype == 16`). Populates `pe.version_info` with keys like + `CompanyName`, `FileVersion`, `ProductName`, `OriginalFilename`.""" + for r in pe.resources: + if not (isinstance(r.rtype, int) and r.rtype == 16): + continue + f_off = pe.rva_to_file_offset(r.rva) + if f_off is None: + continue + if f_off + r.size > len(data): + continue + blob = data[f_off : f_off + r.size] + try: + _walk_versioninfo_node(blob, 0, pe.version_info, 0) + except Exception: + # Malformed VERSIONINFO; leave whatever we already extracted. 
+ continue + + +def init(target_bytes: bytes): + """Splat platform entry point — called once per run with the full + target file bytes. Parses the PE, stashes the result in the + module-level `info` and `raw_image` globals (consulted by every + segtype + the disassembler), and rejects architectures we don't + have a Capstone backend for (ARM32 / ARM64 / unsupported machines) + with a friendly error pointing the user at the limitation.""" + global info, raw_image + info = parse_pe(target_bytes) + raw_image = target_bytes + if info.machine in (MACHINE_ARM64, MACHINE_ARM32): + log.error( + f"win32 target uses ARM architecture (machine 0x{info.machine:04X}); " + "the splat win32 platform currently only supports x86 / x86_64 " + "(Capstone-driven disassembly). PE structures parse cleanly but " + "instruction decode would need a separate backend." + ) + if info.machine not in (MACHINE_I386, MACHINE_AMD64): + log.error( + f"win32 target has unsupported machine type 0x{info.machine:04X} " + "(only i386 / amd64 are recognized)" + ) diff --git a/src/splat/scripts/create_config.py b/src/splat/scripts/create_config.py index 56ef8652..5b29e626 100644 --- a/src/splat/scripts/create_config.py +++ b/src/splat/scripts/create_config.py @@ -5,7 +5,7 @@ from pathlib import Path import subprocess import sys -from typing import Optional +from typing import List, Optional from ..util.n64 import find_code_length, rominfo from ..util.psx import psxexeinfo @@ -36,6 +36,16 @@ def main(file_path: Path, objcopy: Optional[str]): do_elf(file_path, file_bytes, objcopy) return + # Check for Win32 PE + if file_bytes[0:2] == b"MZ" and len(file_bytes) >= 0x40: + pe_off = int.from_bytes(file_bytes[0x3C:0x40], "little") + if ( + pe_off + 4 <= len(file_bytes) + and file_bytes[pe_off : pe_off + 4] == b"PE\x00\x00" + ): + create_win32_config(file_path, file_bytes) + return + log.error(f"create_config does not support the file format of '{file_path}'") @@ -374,6 +384,466 @@ def create_psx_config(exe_path: 
Path, exe_bytes: bytes): file_presets.write_all_files() +def create_win32_config(exe_path: Path, exe_bytes: bytes): + from ..platforms import win32 as _w32 + from ..platforms.win32 import ( + parse_pe, + SCN_CNT_CODE, + SCN_CNT_UNINITIALIZED_DATA, + SCN_MEM_EXECUTE, + SCN_MEM_WRITE, + ) + + pe = parse_pe(exe_bytes) + basename = exe_path.name.replace(" ", "").lower() + cleaned_basename = remove_invalid_path_characters(basename) + if not cleaned_basename: + # Pathological filename (all spaces / all invalid chars stripped + # to empty) would produce a bare ".yaml" / ".ld" output and a + # YAML basename: '' that splat rejects. Fall back to a synthetic + # placeholder so generated artefacts still have names. + cleaned_basename = "pe_target" + basename = cleaned_basename + + sha1 = hashlib.sha1(exe_bytes).hexdigest() + + # Quote paths to survive YAML special characters (spaces, ':', '#' + # are all syntactically meaningful when unquoted). + def _yaml_quote(s: object) -> str: + text = str(s) + return '"' + text.replace("\\", "\\\\").replace('"', '\\"') + '"' + + # Pick a compiler tag from telltale PE characteristics. The bulk of + # MSVC-linked binaries are identified via MajorLinkerVersion; MinGW + # and Clang-LLD are recognised via distinct fingerprints — MinGW + # builds typically import from `msvcrt.dll` or `libgcc_s_*.dll`, + # ship a `.idata`/`.CRT`/`.bss` section layout, and use linker + # version 2.x or 1.x. LLD-linked PEs identify themselves through + # a "Rich"-less DOS stub plus a `.rdata$zzzdebug` section, but we + # rely on the simpler heuristic: any import of `libc++.dll` or a + # GCC runtime stamps the binary as MinGW. 
+ _LINKER_TO_MSVC = { + 2: "MSVC2", + 3: "MSVC4", + 4: "MSVC4", + 5: "MSVC5", + 6: "MSVC6", + 7: "MSVC7", + 8: "MSVC8", + 9: "MSVC9", + 10: "MSVC10", + 11: "MSVC11", + 12: "MSVC12", + 14: "MSVC14", + } + + def _detect_compiler() -> str: + dlls_lower = {imp.dll.lower() for imp in pe.imports} + # MinGW (gcc-linked) signatures: links to libgcc, libstdc++, + # libwinpthread, or has a .CRT section. + mingw_dlls = { + "libgcc_s_dw2-1.dll", + "libgcc_s_seh-1.dll", + "libstdc++-6.dll", + "libwinpthread-1.dll", + "libgcc_s.dll", + "libssp-0.dll", + } + section_names = {s.name for s in pe.sections} + if dlls_lower & mingw_dlls or ".CRT" in section_names: + return "MINGW" + # LLD signature: linker_major 14 but characteristics differ. + # Conservative: only flag if .text$mn or .rdata$zzzdebug are + # present (LLD-specific section grouping). + if any( + s.startswith(".text$") or s.startswith(".rdata$") for s in section_names + ): + return "CLANG_LLD" + return _LINKER_TO_MSVC.get(pe.linker_major, "MSVC6") + + compiler_tag = _detect_compiler() + + header = f"""\ +# name: {exe_path.name} +sha1: {sha1} +options: + basename: {basename} + target_path: {_yaml_quote(exe_path)} + base_path: . + platform: win32 + compiler: {compiler_tag} + + # asm_path: asm + # src_path: src + # build_path: build + + ld_script_path: {cleaned_basename}.ld + ld_dependencies: True + + o_as_suffix: True + + section_order: [".header", ".text", ".rdata", ".data", ".pdata", ".rodata", ".bss"] + + symbol_addrs_path: + - symbol_addrs.txt + + # undefined_funcs_auto_path: undefined_funcs_auto.txt + # undefined_syms_auto_path: undefined_syms_auto.txt + + extensions_path: tools/splat_ext + + string_encoding: ASCII + data_string_encoding: ASCII +""" + + # A section with raw_size > 0 but raw_pointer == 0 is loader-treated + # as uninitialised at runtime (the file simply doesn't back any bytes + # for it). 
Group those with the BSS bucket so we don't generate a + # file-backed segment pointing at offset 0 (the DOS header). + def _is_file_backed(s) -> bool: + return s.raw_size > 0 and s.raw_pointer > 0 + + segments = "\nsegments:\n" + segments += """\ + - name: header + type: header + start: 0x0 + +""" + + # Order sections by file position so segments stay monotonically + # increasing in rom_start (splat requires this). + file_sections = sorted( + (s for s in pe.sections if _is_file_backed(s)), + key=lambda s: s.raw_pointer, + ) + bss_sections = [ + s + for s in pe.sections + if not _is_file_backed(s) + and (s.characteristics & SCN_CNT_UNINITIALIZED_DATA or s.virtual_size > 0) + ] + + # Disambiguate duplicate section names (PE spec doesn't require + # uniqueness; some packers and hand-crafted images have repeats). + seen_names: dict = {} + + def _unique_name(raw: str) -> str: + n = remove_invalid_path_characters(raw.lstrip(".") or "section") + # GAS labels can't start with a digit (PuTTY's `.00cfg`, MSVC's + # `.rdata$zzzdebug` numeric subsection, etc.). Prefix with `_` + # to keep the resulting `_main` global label valid. + if n and n[0].isdigit(): + n = "_" + n + count = seen_names.get(n, 0) + seen_names[n] = count + 1 + return n if count == 0 else f"{n}_{count}" + + for s in file_sections: + # Derive a sensible subsegment type. Special-case `.pdata` to + # the dedicated Win32SegPdata so RUNTIME_FUNCTION rows render + # structured instead of as opaque byte runs; treat `.reloc` / + # `.rsrc` as opaque binary since they hold structured loader + # data, not GAS-meaningful pointers or strings. 
+ if pe.is_pe32_plus and s.name == ".pdata": + sub_type = "pdata" + elif s.name in (".reloc", ".rsrc"): + sub_type = "bin" + elif s.characteristics & (SCN_CNT_CODE | SCN_MEM_EXECUTE): + sub_type = "text" + elif s.characteristics & SCN_MEM_WRITE: + sub_type = "data" + else: + sub_type = "rodata" + + safe_name = _unique_name(s.name) + vram = pe.image_base + s.virtual_address + segments += f"""\ + - name: {safe_name} + type: code + start: 0x{s.raw_pointer:X} + vram: 0x{vram:08X} + subsegments: + - [0x{s.raw_pointer:X}, {sub_type}, {safe_name}_main] + +""" + + # Virtual-only tail: file-backed section that extends in memory + # past its raw bytes (MSVC zero-init globals). Model as a BSS + # segment so the linker layout matches the runtime image. + # Virtual-only tail: file-backed section that extends in memory + # past its raw bytes — loader zero-fills the tail. Applies to + # both writable .data (MSVC zero-init globals) and any other + # section with VirtualSize > SizeOfRawData (occasionally seen on + # .rdata when constants are aligned past the file boundary). + if s.virtual_size > s.raw_size: + tail_vram = pe.image_base + s.virtual_address + s.raw_size + tail_size = s.virtual_size - s.raw_size + segments += f"""\ + - {{ name: {safe_name}_bss, type: bss, vram: 0x{tail_vram:08X}, bss_size: 0x{tail_size:X} }} + +""" + + for s in bss_sections: + # Sections claiming "uninitialized data" with VirtualSize 0 carry + # no runtime footprint — skip rather than emit `bss_size: 0x0` + # which splat treats as a malformed segment. + if s.virtual_size == 0: + continue + safe_name = _unique_name(s.name or "bss") + vram = pe.image_base + s.virtual_address + segments += f"""\ + - {{ name: {safe_name}, type: bss, vram: 0x{vram:08X}, bss_size: 0x{s.virtual_size:X} }} + +""" + + # Tack on a `bin` segment for the COFF symbol table if the optional + # header points at one. 
Modern MSVC binaries don't emit it (PDB + # replaces it) but vintage MSVC 4-6 binaries still ship it past the + # last raw-data section. The trailing `[len(exe_bytes)]` entry + # delimits its end. + # Post-section appendages (COFF symtab, Authenticode signature) sit + # past the last section's raw bytes. Collect them, sort by file + # offset, and emit in order — splat requires segments to be + # monotonically increasing by rom_start. + tail_segs: List[tuple] = [] + if ( + pe.coff_symtab_ptr + and pe.coff_num_symbols + and pe.coff_symtab_ptr < len(exe_bytes) + ): + tail_segs.append((pe.coff_symtab_ptr, "coff_symtab")) + if len(pe.data_directories) > 4: + cert_ptr, cert_size = pe.data_directories[_w32.DIR_CERTIFICATE] + # Authenticode signature: directory entry 4 (Certificate Table) + # is a FILE offset / size pair (unlike the RVA-based entries). + if cert_ptr and cert_size and cert_ptr < len(exe_bytes): + tail_segs.append((cert_ptr, "signature")) + # Post-section appendages have file offsets but no defined load + # VAs (the PE loader doesn't map them). Splat needs *some* VMA + # for each segment, so pin them at a high reserved range — well + # past the last section's VirtualAddress + VirtualSize — to keep + # the linker from assigning overlapping addresses. + tail_vma = pe.image_base + 0x10000000 + for start, name in sorted(tail_segs, key=lambda t: t[0]): + segments += ( + f" - {{ name: {name}, type: bin, " + f"start: 0x{start:X}, vram: 0x{tail_vma:X} }}\n\n" + ) + tail_vma += 0x100000 + + segments += f" - [0x{len(exe_bytes):X}]\n" + + out_file = Path(f"{cleaned_basename}.yaml") + with out_file.open("w", encoding="utf-8", newline="\n") as f: + print(f"Writing config to {out_file}") + f.write(header) + f.write(segments) + + conf.load([out_file]) + file_presets.write_all_files() + + # Stash the entry point and any exported function as known symbols so + # the disassembly labels them. 
Some DLLs are built without DllMain and + # leave AddressOfEntryPoint = 0 — skip the entrypoint symbol in that + # case so we don't emit a label pointing at the PE header. + _sanitize_id = _w32.sanitize_label + + symbol_addrs: List[str] = [] + if pe.entry_point_rva: + symbol_addrs.append(f"entrypoint = 0x{pe.entry_point_va:08X}; // type:func") + export_labels = _w32.compute_export_labels( + pe, reserved={"entrypoint"} if pe.entry_point_rva else set() + ) + # Build a {ordinal: label} lookup so we can emit them in iteration + # order while still using the centralised dedup-aware map. + va_to_label = export_labels + # Only print the "// Exports from X" header when there's at least one + # non-forwarder export — DLLs that re-export everything (e.g. + # apisetschema, downlevel shims) would otherwise emit a header with + # zero following rows. + named_exports = [e for e in pe.exports if e.forwarder is None] + if named_exports: + symbol_addrs.append("") + symbol_addrs.append(f"// Exports from {pe.export_dll_name or exe_path.name}") + for exp in named_exports: + va = pe.image_base + exp.rva + safe = va_to_label.get(va) + if safe is None: + continue + trailing = f"// type:func -- ordinal {exp.ordinal}" + if exp.name and safe != exp.name: + trailing += f" (original {exp.name})" + symbol_addrs.append(f"{safe} = 0x{va:08X}; {trailing}") + forwarders = [e for e in pe.exports if e.forwarder is not None] + if forwarders: + symbol_addrs.append("") + symbol_addrs.append("// Forwarded exports (live outside this DLL)") + for exp in forwarders: + name = exp.name or f"export_{exp.ordinal}" + safe = _sanitize_id(name) + # No real VA — record as a comment so users see the mapping. 
+ symbol_addrs.append( + f"// {safe} -> {exp.forwarder} (ordinal {exp.ordinal})" + ) + + iat_labels = _w32.compute_iat_labels(pe) + if pe.imports: + symbol_addrs.append("") + symbol_addrs.append( + "// IAT slots (each `call dword ptr []` reaches one of these)" + ) + for imp in pe.imports: + slot_va = pe.image_base + imp.iat_rva + full = iat_labels.get(slot_va) + if full is None or not full.startswith("imp_"): + continue + trailing = f"// type:u32 -- import from {imp.dll}" + if imp.ordinal is not None: + trailing += f" ordinal {imp.ordinal}" + symbol_addrs.append(f"{full} = 0x{slot_va:08X}; {trailing}") + + if pe.delay_imports: + symbol_addrs.append("") + symbol_addrs.append( + "// Delay-load IAT slots (resolved on first call via __delayLoadHelper2)" + ) + for imp in pe.delay_imports: + slot_va = pe.image_base + imp.iat_rva + full = iat_labels.get(slot_va) + if full is None or not full.startswith("dimp_"): + continue + trailing = f"// type:u32 -- delay-loaded import from {imp.dll}" + if imp.ordinal is not None: + trailing += f" ordinal {imp.ordinal}" + symbol_addrs.append(f"{full} = 0x{slot_va:08X}; {trailing}") + + if pe.clr_header is not None and pe.clr_header.metadata_rva: + symbol_addrs.append("") + symbol_addrs.append( + "// .NET CLR header — points at the assembly's metadata " + "(ECMA-335) and entry-point token." 
+ ) + md_va = pe.image_base + pe.clr_header.metadata_rva + symbol_addrs.append(f"clr_metadata = 0x{md_va:08X}; // type:u8") + if pe.clr_header.strong_name_signature_rva: + sn_va = pe.image_base + pe.clr_header.strong_name_signature_rva + symbol_addrs.append( + f"clr_strong_name_signature = 0x{sn_va:08X}; // type:u8" + ) + if pe.clr_header.resources_rva: + res_va = pe.image_base + pe.clr_header.resources_rva + symbol_addrs.append(f"clr_resources = 0x{res_va:08X}; // type:u8") + + if pe.security_cookie_va: + symbol_addrs.append("") + symbol_addrs.append( + "// /GS security cookie (xor'd with frame pointer at function entry)" + ) + symbol_addrs.append( + f"security_cookie = 0x{pe.security_cookie_va:08X}; // type:u32" + ) + + if pe.tls_callback_vas: + symbol_addrs.append("") + symbol_addrs.append( + "// TLS callbacks (run by the loader before DllMain / entrypoint)" + ) + for idx, cb_va in enumerate(pe.tls_callback_vas): + symbol_addrs.append(f"tls_callback_{idx} = 0x{cb_va:08X}; // type:func") + + if pe.safe_seh_handlers: + symbol_addrs.append("") + symbol_addrs.append( + "// SafeSEH handlers (registered exception filter functions)" + ) + for idx, rva in enumerate(pe.safe_seh_handlers): + va = pe.image_base + rva + symbol_addrs.append(f"safeseh_{idx} = 0x{va:08X}; // type:func") + + if pe.runtime_functions: + symbol_addrs.append("") + symbol_addrs.append( + "// UNWIND_INFO blobs — each RUNTIME_FUNCTION's UnwindInfoAddress " + "points at a (variable-length) IMAGE_UNWIND_INFO record." + ) + unwind_cap = 2048 + seen_unwind: set = set() + for begin, _end, uw in pe.runtime_functions[:unwind_cap]: + # The high bit of the UnwindInfoAddress flags a chained record — + # the same target then naturally collides with itself. Mask off + # before symbol emission so multiple chained refs share one + # `unwind_` label. 
+ base_uw = uw & 0x7FFFFFFF + if base_uw == 0 or base_uw in seen_unwind: + continue + seen_unwind.add(base_uw) + va = pe.image_base + base_uw + symbol_addrs.append(f"unwind_{va:X} = 0x{va:08X}; // type:u8") + + if pe.cfg_function_rvas: + symbol_addrs.append("") + symbol_addrs.append( + "// /guard:cf valid indirect-call targets — every entry is " + "a function the loader's CFG bitmap whitelists." + ) + # CFG tables in real binaries can be huge (ntdll ~40k). Cap the + # symbol emission at 1024 to keep symbol_addrs.txt readable; the + # text.py call-target seed already covers all entries for label + # emission. Pass --full-cfg if you want every row. + cfg_cap = 1024 + shown = pe.cfg_function_rvas[:cfg_cap] + for idx, rva in enumerate(shown): + va = pe.image_base + rva + symbol_addrs.append(f"cfg_target_{idx} = 0x{va:08X}; // type:func") + if len(pe.cfg_function_rvas) > cfg_cap: + symbol_addrs.append( + f"// ... and {len(pe.cfg_function_rvas) - cfg_cap} more CFG " + "targets omitted; bump the cap in create_win32_config to list all." + ) + + with Path("symbol_addrs.txt").open("w", encoding="utf-8", newline="\n") as f: + print("Writing symbol_addrs.txt") + f.write( + "// Visit https://github.com/ethteck/splat/wiki/Adding-Symbols for documentation about this file\n" + f"// Generated from {exe_path.name} (sha1 {sha1[:12]}...) by create_win32_config.\n" + "// Edits are preserved across re-runs only via merging in a separate symbols file.\n" + ) + body = "\n".join(symbol_addrs) + f.write(body) + # POSIX convention: text files end with a newline. Avoid the + # "missing newline at end of file" lint when symbol_addrs.txt + # has no body entries (resource-only DLL, all-forwarder shim). + if not body.endswith("\n"): + f.write("\n") + + # One-line summary of the corpus so the user knows at-a-glance what + # auto-config found in their PE. 
+ parts = [ + f"{len(pe.sections)} sections", + f"{len(pe.exports)} exports" if pe.exports else None, + f"{len(pe.imports)} imports" if pe.imports else None, + f"{len(pe.delay_imports)} delay-imports" if pe.delay_imports else None, + f"{len(pe.tls_callback_vas)} TLS callbacks" if pe.tls_callback_vas else None, + f"{len(pe.safe_seh_handlers)} SafeSEH handlers" + if pe.safe_seh_handlers + else None, + f"{len(pe.cfg_function_rvas)} CFG targets" if pe.cfg_function_rvas else None, + f"{len(pe.runtime_functions)} RUNTIME_FUNCTIONs" + if pe.runtime_functions + else None, + f"{len(pe.unwind_info)} unwind records" if pe.unwind_info else None, + f"{len(pe.coff_symbols)} COFF symbols" if pe.coff_symbols else None, + f".NET v{pe.clr_header.runtime_major}.{pe.clr_header.runtime_minor}" + if pe.clr_header + else None, + ] + summary = ", ".join(p for p in parts if p) + print(f"Detected: {summary}.") + if pe.pdb_path: + print(f"PDB hint: {pe.pdb_path}") + + def do_elf(elf_path: Path, elf_bytes: bytes, objcopy: Optional[str]): elf = ps2elfinfo.Ps2Elf.get_info(elf_path, elf_bytes) if elf is None: @@ -568,7 +1038,7 @@ def run_objcopy(objcopy_name: str, elf_path: str, rom: str) -> list[str]: def add_arguments_to_parser(parser: argparse.ArgumentParser): parser.add_argument( "file", - help="Path to a .z64/.n64 ROM, PSX executable or PS2 ELF", + help="Path to a .z64/.n64 ROM, PSX executable, PS2 ELF, or Win32 PE", type=Path, ) parser.add_argument( @@ -582,7 +1052,9 @@ def process_arguments(args: argparse.Namespace): main(args.file, args.objcopy) -script_description = "Create a splat config from an N64 ROM or PSX executable." +script_description = ( + "Create a splat config from an N64 ROM, PSX executable, PS2 ELF, or Win32 PE." 
+) def add_subparser(subparser: argparse._SubParsersAction): diff --git a/src/splat/scripts/win32_reassemble.py b/src/splat/scripts/win32_reassemble.py new file mode 100644 index 00000000..96eb6ca5 --- /dev/null +++ b/src/splat/scripts/win32_reassemble.py @@ -0,0 +1,270 @@ +"""Reassemble a splat-split win32 PE back into a single PE/EXE/DLL. + +Pipeline: + + 1. Run `as` on every .s under asm_path / data_path → .o files placed + at the build_path layout the splat-generated linker script + expects (build/asm/.s.o). + 2. Wrap any .bin assets into ELF objects via `objcopy -I binary -O + elf32-i386|elf64-x86-64` so they can be linked in. + 3. Invoke `ld -T ` from the base_path to produce an ELF + image whose section layout matches the original PE. + 4. Run `objcopy -O pei-i386|pei-x86-64` to convert the ELF to a PE. + +Output defaults to `.reasm` next to the original target +binary so an accidental run doesn't clobber the source. +""" + +from __future__ import annotations + +import argparse +import hashlib +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import List, Optional + +import yaml + +from ..util import log + + +def _which(cmd: str) -> str: + found = shutil.which(cmd) + if not found: + log.error( + f"win32_reassemble: required tool '{cmd}' not on PATH — " + "install binutils (provides as / ld / objcopy)" + ) + return found + + +def _read_yaml(yaml_path: Path) -> dict: + return yaml.safe_load(yaml_path.read_text(encoding="utf-8")) + + +def _detect_bitness(yaml_path: Path, conf: dict) -> bool: + """Return True if the source PE is PE32+ (x86_64). 
Inspect the + `target_path` binary's optional-header magic.""" + target = conf["options"].get("target_path") + if not target: + log.error("win32_reassemble: YAML missing options.target_path") + opts = conf["options"] + base_path = (yaml_path.parent / opts.get("base_path", ".")).resolve() + target_path = (base_path / target).resolve() + if not target_path.exists(): + log.error(f"win32_reassemble: target binary not found at {target_path}") + data = target_path.read_bytes() + if len(data) < 0x100 or data[:2] != b"MZ": + log.error(f"win32_reassemble: {target_path} is not a PE") + pe_off = int.from_bytes(data[0x3C:0x40], "little") + magic = int.from_bytes(data[pe_off + 0x18 : pe_off + 0x1A], "little") + return magic == 0x20B + + +def _run(cmd: List[str], verbose: bool, cwd: Optional[Path] = None) -> None: + if verbose: + prefix = f"(cd {cwd}) " if cwd else "" + print(f"$ {prefix}" + " ".join(str(c) for c in cmd)) + r = subprocess.run(cmd, capture_output=True, cwd=str(cwd) if cwd else None) + if r.returncode != 0: + sys.stderr.write(r.stderr.decode(errors="replace")) + log.error(f"win32_reassemble: command failed: {cmd[0]}") + + +def _collect_sources( + asm_path: Path, data_path: Path, asset_path: Path +) -> "tuple[List[Path], List[Path]]": + """Return (.s sources, .bin assets) under the splat-configured + source directories.""" + s_paths: List[Path] = [] + seen = set() + for root in (asm_path, data_path): + if not root.exists(): + continue + for p in sorted(root.rglob("*.s")): + if p in seen: + continue + seen.add(p) + s_paths.append(p) + bin_paths: List[Path] = [] + if asset_path.exists(): + bin_paths = sorted(asset_path.rglob("*.bin")) + return s_paths, bin_paths + + +def reassemble(yaml_path: Path, out_path: Path, verbose: bool = False) -> Path: + """Drive the full assemble + link + PE-convert pipeline for a + splat-generated win32 config. 
Returns the path to the produced PE.""" + conf = _read_yaml(yaml_path) + opts = conf["options"] + base_path = (yaml_path.parent / opts.get("base_path", ".")).resolve() + asm_path = (base_path / opts.get("asm_path", "asm")).resolve() + data_path = (base_path / opts.get("data_path", "data")).resolve() + asset_path = (base_path / opts.get("asset_path", "assets")).resolve() + build_path = (base_path / opts.get("build_path", "build")).resolve() + ld_path = base_path / opts.get("ld_script_path", "") + if not ld_path.exists(): + log.error( + f"win32_reassemble: linker script not found at {ld_path} — " + "run `python -m splat split ` first" + ) + + is_pe32_plus = _detect_bitness(yaml_path, conf) + mode_flag = "--64" if is_pe32_plus else "--32" + ld_emulation = "elf_x86_64" if is_pe32_plus else "elf_i386" + bin_obj_fmt = "elf64-x86-64" if is_pe32_plus else "elf32-i386" + bin_obj_arch = "i386:x86-64" if is_pe32_plus else "i386" + + asm_tool = _which("as") + ld_tool = _which("ld") + objcopy = _which("objcopy") + + s_paths, bin_paths = _collect_sources(asm_path, data_path, asset_path) + if not s_paths and not bin_paths: + log.error( + "win32_reassemble: no .s or .bin sources found — run " + "`python -m splat split ` first" + ) + + # The splat-generated linker script references object files at + # `/`. With splat's default + # `o_as_suffix: False` the suffix is `.s.o` (`.s.o`); with + # `o_as_suffix: True` it's just `.o` (`.o`). Match + # whichever the YAML opted into. 
+ use_o_as_suffix = bool(opts.get("o_as_suffix", False)) + + def _obj_for(src: Path) -> Path: + for root in (asm_path, data_path): + try: + rel = src.relative_to(root) + if use_o_as_suffix: + rel = rel.with_suffix(".o") + else: + rel = rel.with_suffix(rel.suffix + ".o") + return build_path / "asm" / rel + except ValueError: + continue + return src.with_suffix(src.suffix + ".o") + + for s_path in s_paths: + o_path = _obj_for(s_path) + o_path.parent.mkdir(parents=True, exist_ok=True) + _run([asm_tool, mode_flag, str(s_path), "-o", str(o_path)], verbose) + + # Wrap .bin assets so ld can link them. Splat's linker script + # references each bin as `/assets/.o` (no .bin + # suffix), pulling the `.data` section that `objcopy -I binary` + # populates by default. Run `objcopy` from `bin_path.parent` so + # the embedded `_binary__start` symbols come out + # deterministic regardless of where the source file lives. + for bin_path in bin_paths: + try: + rel = bin_path.relative_to(asset_path) + except ValueError: + rel = Path(bin_path.name) + o_rel = rel.with_suffix(".o") + o_path = build_path / "assets" / o_rel + o_path.parent.mkdir(parents=True, exist_ok=True) + # objcopy from CWD=bin_path.parent so the auto-generated + # `_binary__start` symbol uses just the filename. + _run( + [ + objcopy, + "-I", + "binary", + "-O", + bin_obj_fmt, + "-B", + bin_obj_arch, + bin_path.name, + str(o_path.resolve()), + ], + verbose, + cwd=bin_path.parent, + ) + + # Link via the splat-generated linker script. Run from base_path + # so the script's `build/asm/...` references resolve. + with tempfile.TemporaryDirectory(prefix="splat-reasm-") as td: + elf_path = Path(td) / "linked.elf" + # -N (omagic): produce an ELF without page-aligned segments — + # the splat .ld layout packs sections contiguously by LMA and + # would otherwise blow past the program-header capacity. 
+ _run( + [ + ld_tool, + "-m", + ld_emulation, + "-N", + "-T", + ld_path.name, + "-o", + str(elf_path), + ], + verbose, + cwd=base_path, + ) + # Force alloc/load on splat's custom .header section so the + # binary extraction includes it. GAS marks .header as + # READONLY-only because there's no exec/write flag in the + # `.section .header` line; that's enough for the linker but + # makes `-O binary` skip the bytes. + _run( + [ + objcopy, + "--set-section-flags", + ".header=alloc,load,data", + str(elf_path), + ], + verbose, + ) + # Extract the loaded image as a raw byte blob — the splat + # `.header` section already contains the full PE header + # (DOS stub + COFF + optional header + section table) and + # every other section is positioned at its file-offset by + # the linker script. Wrapping with `-O pei-*` would prepend + # a second PE header; we just want the bytes verbatim. + out_path.parent.mkdir(parents=True, exist_ok=True) + _run( + [objcopy, "-O", "binary", str(elf_path), str(out_path)], + verbose, + ) + + if verbose: + sha = hashlib.sha1(out_path.read_bytes()).hexdigest() + print(f"Produced {out_path} ({len(out_path.read_bytes())} bytes, sha1 {sha})") + + return out_path + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Reassemble a splat-split Win32 PE back into a single .exe/.dll" + ) + parser.add_argument("yaml", type=Path, help="splat YAML config") + parser.add_argument( + "--out", + type=Path, + default=None, + help="Output PE path (defaults to .reasm)", + ) + parser.add_argument("-v", "--verbose", action="store_true") + args = parser.parse_args() + + conf = _read_yaml(args.yaml) + target = conf["options"].get("target_path") + out_path = args.out + if out_path is None: + if not target: + log.error("win32_reassemble: YAML has no target_path; pass --out") + base_path = (args.yaml.parent / conf["options"].get("base_path", ".")).resolve() + out_path = Path(str((base_path / target).resolve()) + ".reasm") + + reassemble(args.yaml, 
out_path, verbose=args.verbose) + + +if __name__ == "__main__": + main() diff --git a/src/splat/segtypes/__init__.py b/src/splat/segtypes/__init__.py index 8204998a..a7cfd8be 100644 --- a/src/splat/segtypes/__init__.py +++ b/src/splat/segtypes/__init__.py @@ -6,3 +6,4 @@ from . import ps2 as ps2 from . import psx as psx from . import psp as psp +from . import win32 as win32 diff --git a/src/splat/segtypes/win32/__init__.py b/src/splat/segtypes/win32/__init__.py new file mode 100644 index 00000000..a548542c --- /dev/null +++ b/src/splat/segtypes/win32/__init__.py @@ -0,0 +1,8 @@ +from . import header as header +from . import text as text +from . import asm as asm +from . import data as data +from . import rodata as rodata +from . import bss as bss +from . import bin as bin +from . import pdata as pdata diff --git a/src/splat/segtypes/win32/asm.py b/src/splat/segtypes/win32/asm.py new file mode 100644 index 00000000..f4661c9c --- /dev/null +++ b/src/splat/segtypes/win32/asm.py @@ -0,0 +1,14 @@ +"""`type: asm` alias for the win32 text segment. + +Lets win32 YAML use the more conventional `asm` segtype name (matching the +other platforms) instead of `text`. Same behaviour as `Win32SegText`.""" + +from .text import Win32SegText + + +class Win32SegAsm(Win32SegText): + """Alias for Win32SegText so YAML can use `type: asm` (the + convention on other splat platforms) instead of `type: text`. + No behavioural difference.""" + + pass diff --git a/src/splat/segtypes/win32/bin.py b/src/splat/segtypes/win32/bin.py new file mode 100644 index 00000000..8f6f3d29 --- /dev/null +++ b/src/splat/segtypes/win32/bin.py @@ -0,0 +1,17 @@ +"""Win32 binary blob segment — reuse the common bin segment for things like +.rsrc / .reloc / .idata / coff_symtab / signature where the section's +bytes are structured loader-time data rather than meaningful code or +labelled pointers. 
class Win32SegBss(CommonSegment):
    """Uninitialised data (`.bss`) segment for win32 targets.

    The generated assembly contains the section line, a global label
    named after the segment and — when the reserved size is positive — a
    single `.space` directive. The segment is marked NOLOAD, so no bytes
    from the ROM are written out.
    """

    @staticmethod
    def is_noload() -> bool:
        """This segment type never has on-file contents."""
        return True

    def get_linker_section(self) -> str:
        """Name of the linker section this segment belongs to."""
        return ".bss"

    def get_section_flags(self) -> Optional[str]:
        """Section flags used for the section line."""
        return "wa"

    def out_path(self) -> Path:
        """Location of the generated `.s` file under the data path."""
        return options.opts.data_path / self.dir / f"{self.name}.s"

    @property
    def reserved_size(self) -> int:
        """Number of bytes to reserve.

        Resolution order: an explicit `bss_size` key in a dict-style
        YAML entry, then `vram_end - vram_start` when both are known,
        otherwise 0.
        """
        explicit = self.yaml.get("bss_size") if isinstance(self.yaml, dict) else None
        if explicit is not None:
            return int(explicit)
        if self.vram_start is None or self.vram_end is None:
            return 0
        return self.vram_end - self.vram_start

    def should_split(self) -> bool:
        """Split only when extraction is enabled and the `code` mode is active."""
        return self.extract and options.opts.is_mode_active("code")

    def split(self, rom_bytes: bytes):
        """Write the reservation to `out_path()`; `rom_bytes` is not read."""
        dest = self.out_path()
        dest.parent.mkdir(parents=True, exist_ok=True)

        nbytes = self.reserved_size
        with dest.open("w", encoding="utf-8", newline="\n") as fh:
            chunks = []
            header = options.opts.generated_s_preamble
            if header:
                chunks.append(header + "\n\n")
            chunks.append(self.get_section_asm_line() + "\n\n")
            chunks.append(f".global {self.name}\n")
            chunks.append(f"{self.name}:\n")
            # A zero-sized reservation emits the label only.
            if nbytes > 0:
                chunks.append(f"    .space 0x{nbytes:X}\n")
            fh.write("".join(chunks))

        self.log(f"Wrote {self.name} to {dest}")
+STRING_MIN_LEN = 4 + + +def _is_string_byte(b: int) -> bool: + # ASCII control/printable subset OR Latin-1 Supplement printables. + # Mirrors the wide-string scanner; covers localised ANSI resources + # written in Western European code pages. + return b == 0x09 or b == 0x0A or b == 0x0D or 0x20 <= b <= 0x7E or 0xA0 <= b <= 0xFF + + +def _escape_string(raw: bytes) -> str: + out = [] + for b in raw: + if b == ord('"'): + out.append('\\"') + elif b == ord("\\"): + out.append("\\\\") + elif b == 0x0A: + out.append("\\n") + elif b == 0x0D: + out.append("\\r") + elif b == 0x09: + out.append("\\t") + elif 0x20 <= b <= 0x7E: + out.append(chr(b)) + else: + out.append(f"\\x{b:02x}") + return "".join(out) + + +def _scan_string(data: bytes, start: int) -> Optional[int]: + """If a printable run beginning at `start` and ending in a NUL byte is + at least STRING_MIN_LEN characters long, return the end offset (one + past the NUL). Otherwise return None.""" + i = start + while i < len(data) and _is_string_byte(data[i]): + i += 1 + if i >= len(data) or data[i] != 0: + return None + if (i - start) < STRING_MIN_LEN: + return None + return i + 1 + + +# Minimum length in WCHARs (excluding the WCHAR NUL terminator) for a +# UTF-16LE string to be recognised as such. +WIDE_STRING_MIN_LEN = 4 + + +def _scan_wide_string(data: bytes, start: int) -> Optional[int]: + """Detect a UTF-16LE printable run terminated by `\\x00\\x00`. Returns + the end offset (one past the terminating WCHAR), or None when no + valid wide string of sufficient length is present. + + Only matches at even offsets — WCHAR strings are 2-byte aligned.""" + if start & 1: + return None + i = start + count = 0 + while i + 1 < len(data): + lo, hi = data[i], data[i + 1] + if hi != 0: + # Cautious: reject non-Latin-1 to avoid false positives. + return None + if lo == 0: + # WCHAR terminator. + break + # ASCII control/printable subset OR Latin-1 supplement printables + # (0xA0–0xFF, NBSP through ÿ). 
Covers German umlauts, accented + # French chars, Spanish ñ, etc. — common in localised resources. + if not (_is_string_byte(lo) or 0xA0 <= lo <= 0xFF): + return None + count += 1 + i += 2 + if i + 1 >= len(data): + return None + if data[i] != 0 or data[i + 1] != 0: + return None + if count < WIDE_STRING_MIN_LEN: + return None + return i + 2 + + +def _decode_wide(raw: bytes) -> str: + """Decode a WCHAR string body (no terminator) into a printable form + using the same escapes as `_escape_string`.""" + try: + s = raw.decode("utf-16-le", errors="replace") + except Exception: + s = "" + out = [] + for ch in s: + b = ord(ch) + if b == ord('"'): + out.append('\\"') + elif b == ord("\\"): + out.append("\\\\") + elif b == 0x0A: + out.append("\\n") + elif b == 0x0D: + out.append("\\r") + elif b == 0x09: + out.append("\\t") + elif 0x20 <= b <= 0x7E: + out.append(chr(b)) + else: + out.append(f"\\u{b:04x}") + return "".join(out) + + +class Win32SegData(CommonSegment): + """Writable initialised data segment (`.data` in MASM lingo). + + Emits a `.byte` / `.long` / `.quad` representation of the + section bytes. Detects: + - pointer slots flagged by base-relocations (and synthesises a + `func_` / `D_` label for the target); + - NUL-terminated printable strings → `.asciz`; + - UTF-16LE wide-string runs → preserved as raw bytes with a + `/* L"..." */` preview comment; + - long zero runs → collapsed into `.space N` directives. + `exact_encoding: true` in the YAML disables every heuristic so + bytes pass through verbatim.""" + + LINKER_SECTION = ".data" + SECTION_FLAGS = "wa" + # Detect printable NUL-terminated runs and emit them as `.asciz`. + # Enabled by default; .data has plenty of CRT strings, source paths, + # and format strings worth surfacing. The min-length filter (see + # `data._scan_string`) keeps the false-positive rate low. + DETECT_STRINGS = True + # When the PE has no .reloc table (RELOCS_STRIPPED EXEs) we have no + # ground truth for what bytes are pointers. 
Subclasses that opt in get + # a heuristic scan: any 4-byte-aligned word whose value falls inside an + # image section is treated as a pointer. Off by default to avoid + # rewriting integer data as bogus pointers. + HEURISTIC_POINTERS = False + + @property + def exact_encoding(self) -> bool: + """When enabled, pointer slots emit raw `.long 0xN` / `.quad 0xN` + instead of `.long