diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 000000000..a3f04c60a
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,17 @@
+# Force the WASM bundle back to size-optimized codegen.
+#
+# `[profile.release]` (in the workspace Cargo.toml) is `opt-level = 3` so native
+# binaries (CLI, server, MCP, pysimlin's libsimlin, C FFI) optimize for speed.
+# The browser WASM bundle is dominated by download size, not CPU, so we override
+# the opt-level for the wasm32 target here. Keying this on the *target* (rather
+# than passing a flag in each build script) means every wasm build path -- the
+# `cargo build --target wasm32-unknown-unknown` in src/engine/build.sh, any
+# future wasm-pack invocation, etc. -- stays size-optimized automatically.
+#
+# Caveats: target rustflags are not profile-scoped, so non-release (dev) wasm
+# builds also compile at opt-level=z. And a `RUSTFLAGS` *environment variable*
+# replaces these target rustflags (Cargo does not merge the two). Today only the
+# asan test scripts set RUSTFLAGS, and they build for the host, so the wasm bundle
+# is unaffected. Do not set RUSTFLAGS during a wasm release build (-> opt-level 3).
+[target.wasm32-unknown-unknown]
+rustflags = ["-C", "opt-level=z"]
diff --git a/Cargo.lock b/Cargo.lock
index 41cce9ee3..ee1694b26 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2076,6 +2076,15 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 
+[[package]]
+name = "libmimalloc-sys"
+version = "0.1.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2892ae4ea6fa2cb7acb0e236a6880d39523239cd9089de71d220910ccc806790"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.12.1"
@@ -2316,6 +2325,15 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "mimalloc"
+version = "0.1.51"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebca48a43116bc25f18a61360f1be98412f50cc218f5e52c823086b999a4a21a"
+dependencies = [
+ "libmimalloc-sys",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
@@ -3768,6 +3786,7 @@ name = "simlin"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "mimalloc",
  "prost",
  "serde",
  "serde_json",
@@ -3824,6 +3843,7 @@ name = "simlin-mcp"
 version = "0.1.6"
 dependencies = [
  "anyhow",
+ "mimalloc",
  "rmcp",
  "serde_json",
  "serde_yaml",
@@ -3862,6 +3882,7 @@ dependencies = [
  "hyper-util",
  "ignore",
  "loro",
+ "mimalloc",
  "mime_guess",
  "notify-debouncer-full",
  "open",
diff --git a/Cargo.toml b/Cargo.toml
index aa66574c1..c2a44e799 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,8 +11,14 @@ members = [
     "src/xmutil",
 ]
 
+# Native release builds optimize for speed (opt-level = 3). The WASM bundle,
+# where download size dominates, is forced back to opt-level = "z" via
+# `.cargo/config.toml` ([target.wasm32-unknown-unknown] rustflags), which is
+# keyed on the target rather than the build invocation so every wasm build path
+# stays size-optimized. Measured on C-LEARN: opt-level 3 vs "z" is ~-30% compile
+# and ~-41% simulate on native (see docs/design/engine-performance.md).
 [profile.release]
-opt-level = "z"
+opt-level = 3
 lto = true
 panic = "abort"
 strip = true
diff --git a/docs/README.md b/docs/README.md
index a6954f1fc..ea9715bb8 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -4,6 +4,7 @@
 
 - [architecture.md](architecture.md) -- Component descriptions, dependency graph, project structure
 - [design/2026-02-21-incremental-compilation.md](design/2026-02-21-incremental-compilation.md) -- Incremental compilation via salsa: symbolic bytecode, per-variable tracking, LTM integration
+- [design/engine-performance.md](design/engine-performance.md) -- Engine compile/simulate profile (C-LEARN), implemented optimizations, and remaining proposals
 - [design/ltm--loops-that-matter.md](design/ltm--loops-that-matter.md) -- LTM implementation design: data structures, synthetic variables, module handling
 - [design/mdl-parser.md](design/mdl-parser.md) -- Vensim MDL parser design history and implementation notes
 - [design/vdf.md](design/vdf.md) -- VDF binary format specification and parser design
diff --git a/docs/design/engine-performance.md b/docs/design/engine-performance.md
new file mode 100644
index 000000000..7d13a15e4
--- /dev/null
+++ b/docs/design/engine-performance.md
@@ -0,0 +1,314 @@
+# Engine performance: profile and optimization opportunities
+
+Status: analysis + first clear wins landed. 2026-05-19.
+
+This documents an empirical CPU/memory profile of **compiling and simulating the
+C-LEARN hero model** (the largest model we have: ~53k MDL lines / 1.4 MB, 934
+datamodel variables, 5726 root slots, 162 graphical functions, 1000 Euler
+timesteps), the clear-win optimizations already implemented on top of it, and a
+set of larger proposals grounded in the measured data.
+
+## Methodology
+
+- Harness: `src/simlin-engine/examples/clearn_profile.rs` — times each pipeline
+  stage (parse → compile-via-salsa → `Vm::new` → `run_to_end`) and, with
+  `CLEARN_COUNT_ALLOCS=1`, reports allocation counts / peak live bytes per stage
+  via a gated counting global allocator. With high `CLEARN_COMPILE_ITERS` /
+  `CLEARN_RUN_ITERS` it is a focused `perf record` / `callgrind` target.
+- `CompiledSimulation::bytecode_profile()` — opcode histogram + table sizes.
+- CPU: `perf record -g --call-graph dwarf` and `valgrind --tool=callgrind`
+  (exact call counts). Memory: the counting allocator. Machine: Ryzen 9950X.
+- Numbers below use the pre-change `[profile.release]` (`opt-level="z"`, LTO)
+  unless noted. Profile builds add `CARGO_PROFILE_RELEASE_DEBUG=1
+  CARGO_PROFILE_RELEASE_STRIP=false`.
+
+## Measured baseline (before this work)
+
+| Phase | Wall (per iter) | Allocations | Dominant costs |
+|---|---|---|---|
+| parse (`open_vensim`) | ~69 ms | 0.82M | MDL lexer/parser/convert |
+| **compile (salsa)** | **~3574 ms** | **73M (8.9 GiB churned, 3.3 MiB retained)** | ~30% raw `malloc`/`free`; `reconstruct_variable` 6.4%; `canonicalize`+`to_lowercase` ~3.8% (6.1M `to_lowercase` calls); parse front-end ~4% (3.86M `parse_app`) |
+| `Vm::new` | ~0.6 ms | 7.8k | buffer allocation |
+| **run (`run_to_end`)** | **~342 ms** | **2.9M (~2944/timestep)** | `eval_bytecode` 35%; **~15% `make_module_key` clone + `HashMap<ModuleKey>` SipHash inside `EvalModule`**; `RuntimeView` machinery ~9% |
+
+Two structural facts dominate:
+
+1. **Compile is ~10× the run and is allocation-bound.** ~30% of compile
+   instructions are in glibc `malloc`/`free`, churning millions of tiny,
+   short-lived allocations (AST `Box` nodes, `canonicalize` `String`s, repeated
+   `datamodel::Variable` reconstruction). The front-end node count is amplified
+   because arrayed equations are parsed per declared element.
+2. **The run's entire per-timestep allocation churn was one thing:** the
+   `EvalModule` opcode rebuilt a `(String, BTreeSet<String>)` module key and
+   SipHashed it for a `HashMap` lookup on every module evaluation, every step
+   (~1344 `EvalModule` × 1000 steps ≈ 1.34M key constructions, each ≥2 heap
+   allocations).
+
+Bytecode shape (unchanged by this work): 64420 opcodes (8 B each = 503 KiB);
+34673 are flow (the hot per-step program = 271 KiB). Flow histogram: `LoadVar`
+32.8%, `Op2` 18.9%, `LoadConstant` 12.1%, `AssignCurr` 6.8%, `If`/`SetCond` 4.7%
+each. So ~70% of executed opcodes are load / store / binary-op.
+
+## Clear wins implemented
+
+All three are behavior-preserving: the 3530 engine lib tests, 91 `simulate`
+integration tests, and the `clearn_residual_exactness` guard (C-LEARN matches
+Vensim's `Ref.vdf` byte-for-byte) all pass, and the compiled bytecode is
+byte-identical (64420 opcodes).
+
+### 1. `EvalModule` index dispatch (run −17%, run allocations → 0)
+
+`make_module_key` cloned a `String` + `BTreeSet<String>` and the `EvalModule`
+opcode SipHashed it for a `HashMap<ModuleKey, _>` lookup, every module-eval every
+timestep. Replaced the three keyed maps (`flow_modules` / `stock_modules` /
+`initial_modules`) with a single `Vec<ResolvedModule>` indexed by integer, plus a
+`child_targets: Vec<u32>` per module resolving each `EvalModule` declaration to
+its child's index **once** at `Vm::new`. The eval recursion threads a
+`module_idx` and array-indexes; the `ModuleKey` map survives only for the cold
+`set_value` / `clear_values` literal-override paths.
+
+- **run 342 → 283 ms (−17%)**; `run_to_end` allocations **2.94M → 0**.
+- Post-change profile: `eval_bytecode` 35% → 46% (now the real work), the ~15%
+  SipHash cost gone entirely.
+
+### 2. Allocation-free 0-arity-builtin check (compile −3%, −1.45M allocs)
+
+`Expr0::reify_0_arity_builtins` called `id.as_str().to_lowercase()` (a heap
+allocation) on **every** variable reference just to test membership in a
+9-element ASCII set. Added `builtins::is_0_arity_builtin_fn_ci` (ASCII
+case-insensitive, allocation-free) and only materialize the lowercased name in
+the rare case a genuine `pi`/`time`/etc. reference is reified.
+
+- **compile 3574 → 3458 ms (−3.2%)**, −1.45M allocations.
+
+### 3. Cached project dims in `compile_var_fragment` (−130k allocs)
+
+`compile_var_fragment` (salsa-tracked, once per variable) rebuilt the full
+datamodel dimension `Vec` via `source_dims_to_datamodel(project.dimensions(db))`
+per variable; switched to the already-cached `project_datamodel_dims` query
+(`returns(ref)`). Provably equivalent (the cached query is defined as exactly
+that call). Marginal on C-LEARN (only 18 dims) but strictly correct and removes a
+redundant per-variable rebuild.
+
+## Build-level levers (measured, near-free, the biggest wins) — IMPLEMENTED
+
+These need no engine-code changes and dwarf the code-level compile work. Both are
+**native-only**: the WASM bundle (built via `cargo build --target
+wasm32-unknown-unknown --release`) keeps `opt-level="z"` for download size and
+never links mimalloc.
+
+### A. `opt-level = 3` for native (compile −30%, run −41%)
+
+`[profile.release]` is now `opt-level = 3`. The WASM bundle is forced back to
+`opt-level=z` by `.cargo/config.toml` (`[target.wasm32-unknown-unknown] rustflags
+= ["-C", "opt-level=z"]`) — keyed on the target, so every wasm build path stays
+size-optimized regardless of invocation (verified: wasm bundle 7.19 MB at z vs
+9.75 MB at 3). Measured on C-LEARN (with the code wins in):
+
+| | opt="z" | opt=3 | delta |
+|---|---|---|---|
+| compile | 3485 ms | 2450 ms | **−30%** |
+| run | 283 ms | 168 ms | **−41%** |
+
+Caveat documented in `.cargo/config.toml`: a `RUSTFLAGS` *env var* replaces the
+target rustflags, so don't set `RUSTFLAGS` during a wasm release build.
+
+### B. mimalloc for native (compile −40% on top of opt=3)
+
+Compile is allocation-bound, so a faster allocator pays off directly:
+
+| | system malloc | mimalloc | delta |
+|---|---|---|---|
+| compile | 2450 ms | 1459 ms | **−40%** |
+| run | 168 ms | 167 ms | none (run is allocation-free post-win #1) |
+
+Wiring: `simlin-serve` and `simlin-mcp` set
+`#[global_allocator] mimalloc::MiMalloc` in their `main.rs` (native binaries,
+never wasm). `libsimlin` (the cdylib used by pysimlin via cffi and by C/C++ FFI,
+*and* the wasm crate) gates it behind an opt-in, off-by-default `mimalloc`
+feature that is additionally `cfg(not(target_arch = "wasm32"))`; pysimlin's
+build (`Makefile`, `scripts/build_wheels.py`) enables `--features mimalloc`.
+`simlin-cli` links libsimlin, so it enables that feature on its `simlin`
+dependency instead of declaring a second `#[global_allocator]` (a compile error).
+
+**Cumulative compile: 3574 → 1459 ms (−59%)** via code wins + opt=3 + mimalloc.
+**Cumulative run: 342 → 168 ms (−51%)** via code win + opt=3.
+
+## Run-side proposals (post-win hot path: `eval_bytecode` 46%, `RuntimeView` ~20%)
+
+### R1. Bounds-check elimination on `curr`/`next` indexing — INVESTIGATED, not worth it
+
+The hot opcodes index `curr[module_off + off]`, `next[...]`,
+`bytecode.literals[id]`, and `context.graphical_functions[gf]`. Disassembly
+confirms `eval_bytecode` carries 127 `panic_bounds_check` sites, so LLVM is not
+eliding them. An earlier draft of this doc proposed `get_unchecked` here as "the
+biggest code-level run win" — direct measurement disproves that.
+
+**Measured ceiling: ~0.** Replacing the bounds checks on the hottest scalar arms
+(`LoadVar`, `LoadConstant`, `LoadGlobalVar`, `AssignCurr`/`Next`,
+`AssignConstCurr`, `BinOpAssignCurr`/`Next`) *and* the dispatch `code[pc]` access
+with `get_unchecked` moved the C-LEARN run by less than run-to-run noise (165–172
+ms across runs, vs ~167 ms checked). On a modern out-of-order core at
+`opt-level=3` an always-in-bounds check is a perfectly-predicted, never-taken
+branch with an out-of-line cold panic path — effectively free. (The ~10% in
+`RuntimeView::flat_offset` is a per-element `SmallVec` rebuild + linear sparse
+search, *not* a bounds check — see R4.)
+
+**Can safe code eliminate them (the optimizer-coaxing question)?**
+- The dispatch index is *already* check-free in safe code: `while pc <
+  code.len() { match &code[pc] }` — the loop guard dominates the access with the
+  identical bound, so LLVM proves it in range. This is the canonical safe-BCE
+  pattern (the Go equivalent is the elision after `for i := 0; i < len(s);
+  i++`). Confirmed: `get_unchecked` on `code[pc]` made no difference.
+- The data-driven indices cannot be made check-free in safe code. `off` is `u16`
+  opcode data and `module_off` is a runtime module base; the in-range invariant
+  is established by a separate validation pass and is not re-derivable at the
+  access site from types or local control flow. The safe idioms that *do* elide
+  don't fit: sequential iteration / `chunks`/`windows` (this is random access);
+  fixed-size `[T; N]` (n_slots is runtime); power-of-two masking `i & (len-1)`
+  (needs a compile-time-constant power-of-two length); a hoisted `assert!(i <
+  len)` (that *is* the check, relocated — `i` is per-opcode so it can't hoist out
+  of the loop). Removing them would require `unsafe` `get_unchecked` + a static
+  validation pass (the `Stack` pattern), verifiable under miri — and miri detects
+  OOB at runtime, it does not remove checks.
+
+**Decision: do not implement.** `unsafe` in a `#![deny(unsafe_code)]` crate, plus
+a validation pass and a miri burden, is not justified for a sub-noise gain. The
+run's *instruction count*, not its bounds checks, is the lever — that is R2. The
+"bytecode density / dcache" intuition is also a non-issue: the program streams
+linearly (prefetcher-friendly) and is already 8 B/opcode.
+
+### R2. 3-address binop fusion — IMPLEMENTED (run −6.8% on C-LEARN)
+
+~70% of executed opcodes are load/store/binop. A stack VM evaluates `a op b` as
+`LoadX; LoadY; Op2` (3 dispatches); folding the leaf operand loads into the op
+makes it 1. Crucially **the `curr[]` slot array is already the register file** —
+variables live at fixed offsets — so the fused ops read operands straight from
+`curr[]`/`literals` (or pop one from the stack), and the stack carries only
+nested subexpression results.
+
+**Opcode budget forced a 2-operand design.** A full 3-operand `dst = a op b`
+(3×u16 + Op2 = 7-byte payload → 10-byte enum) blows the asserted 8-byte `Opcode`.
+So the fused ops are 2-operand *pushing* forms (≤6 bytes): `BinVarVar`,
+`BinVarConst`, `BinConstVar` (both operands are leaves; fuse `Load; Load; Op2`,
+3→1) and `BinStackVar`, `BinStackConst` (lhs already on the stack; fuse `Load;
+Op2`, 2→1). A leaf *assignment* `dst = a op b` keeps the existing
+`BinOpAssignCurr` for the store (so it stays 3 ops, not 1) — those are a minority
+(`BinOpAssignCurr` ≪ `Op2`).
+
+**Where it runs.** A late `ByteCode::fuse_three_address` pass applied to the Vm's
+flow/stock execution bytecode at `Vm::new`, reusing `peephole_optimize`'s
+jump-target guard + old→new PC remap and preserving `max_stack_depth`. It runs at
+`Vm::new` rather than compile time deliberately: the `CompiledSimulation` stays a
+pure, *symbolizable*, salsa-cached artifact (the symbolic roundtrip tests
+symbolize it; the fused opcodes have no symbolic form), and the `Vm`'s execution
+copy is where the optimization lives. Per-`Vm` fusion is a linear scan, negligible
+vs a run. Initials are left unfused (run once; `extract_assign_curr_offsets` reads
+their `AssignCurr` targets).
+
+**Result.** Flow opcodes 34673 → 26539 on C-LEARN (−23.5%); run 166.8 → 155.4 ms
+(−6.8%). The opcode reduction outweighs the runtime gain because the f64
+arithmetic, stock phase, save/copy, and array machinery (`flat_offset`, R4) are
+untouched — only the scalar *dispatch* shrinks. Scalar-heavy models benefit more
+than array-heavy C-LEARN. Behavior-preserving: full suite + `clearn_residual_
+exactness` pass, with dedicated fusion-pass and operand-order unit tests.
+
+A true register VM with a scratch-register file and a 3-operand instruction set
+(register allocation over each expression DAG) would cut more, but is a large
+codegen rewrite touching the symbolic/incremental layer; the 2-operand fusion
+captures most of the dispatch win at a fraction of the risk.
+
+### R3. Faster dispatch
+
+The dispatch is `while pc < len { match &code[pc] { … } }`, which LLVM lowers to a
+jump table (one indirect branch whose target is data-dependent → BTB-unfriendly).
+Classic threaded dispatch (computed-goto / guaranteed tail calls) would spread the
+indirect branch across handlers for better prediction, but **stable Rust offers
+neither computed-goto nor guaranteed TCO** (the `become` keyword is unstable).
+Portable options:
+
+- **More superinstructions** for the top remaining opcode bigrams/trigrams (R2
+  already fuses `Load; Load; Op2` and `Load; Op2`). Each fused opcode removes a
+  dispatch; incremental and low-risk. This is the portable lever today.
+- Revisit explicit tail-call dispatch if/when `become` stabilizes.
+- A full register VM (see R2) reduces dispatch count more than any dispatch-mechanism change.
+
+### R4. `RuntimeView` allocation + `flat_offset` (~20% of post-win run)
+
+`PushVarView`/`PushTempView` rebuild `SmallVec`s (dims, strides, dim_ids) on every
+execution; `flat_offset` (10.3%) recomputes row-major offsets per element. For
+arrayed models this is now the #2 run cost.
+
+Proposal: (a) push more views through the compile-time `PushStaticView` path
+(precomputed `StaticArrayView`) and store dynamic view descriptors in the
+`ByteCodeContext` referenced by id (as `dim_lists` already does for dim ids),
+eliminating per-op `SmallVec` construction; (b) ensure the `is_contiguous` fast
+path in iteration/reduction is taken for the common dense case so `flat_offset`'s
+general strided arithmetic only runs for transposed/sparse views.
+
+- Effort: medium. Risk: low–medium (array semantics are well-tested by
+  `array_tests`).
+
+## Compile-side proposals (the bigger pie — but build levers A+B capture most of it)
+
+After opt=3 + mimalloc the compile is ~1.46 s (from ~3.49 s) with **no code
+changes**. The following are second-order and worth it only if compile latency
+remains a UX problem after the build levers (it matters for the salsa
+*incremental* edit loop more than cold compile).
+
+### C1. Arena-allocate the transient parse AST
+
+The equation parser builds `Expr0` with `Box` children + `Vec` args — 3.86M+
+transient heap allocations, all lowered to `VariableStage0` and dropped.
+`bumpalo` is already a dependency. Allocating the AST in a per-parse arena turns
+these into pointer bumps. The constraint: the salsa-cached result
+(`ParsedVariableResult`) must be owned/`'static`, so the arena can only back the
+transient parse→lower step, with the cached value being the owned lowered form.
+Much of this benefit is captured more cheaply by mimalloc (B); pursue the arena
+only if profiling after B still shows the parser as a hotspot.
+
+- Effort: large (thread an arena through the parser; verify nothing cached
+  retains an arena reference). Risk: medium.
+
+### C2. Halve `reconstruct_variable` (6.4% of compile)
+
+`reconstruct_variable` rebuilds a full `datamodel::Variable` (ident/equation/
+inflows/outflows/compat clones) and is called ~2× per variable: once in the
+per-variable parse, and once in `module_ident_context_for_model` →
+`collect_module_idents`. The latter only needs each variable's `(ident, kind,
+is-module-call)` — a lighter projection straight from `SourceVariable` would
+avoid ~half the full reconstructions (and their clones).
+
+- Effort: medium. Risk: low–medium (changes the `collect_module_idents` input
+  type; behavior must stay identical).
+
+### C3. `canonicalize` ASCII fast-path + ident interning
+
+6.1M `to_lowercase` calls; ~4.6M are the `canonicalize` slow path (Vensim names
+have spaces/capitals so they don't hit the alloc-free fast path). Two levers:
+(a) lowercase ASCII in place into the output buffer instead of allocating a
+per-part intermediate `String` (careful: keep Unicode correctness — the function
+has extensive idempotence tests, #559); (b) **intern** canonical idents so
+repeated canonicalization of the same name is a hashmap hit rather than a
+re-derivation. (b) is broader but touches many call sites.
+
+- Effort: (a) small/careful, (b) medium–large. Risk: (a) medium (correctness-
+  critical function), (b) medium.
+
+## Suggested ordering
+
+1. ~~**Build levers A (opt=3 native) + B (mimalloc native)**~~ — DONE. Measured
+   −58% compile (3485 → 1459 ms) / −41% run for ~no engine code and near-zero risk
+   (`[profile.release] opt-level=3` + `.cargo/config.toml` wasm override;
+   `mimalloc` global allocator on the native binaries + libsimlin's opt-in
+   feature). WASM stays on `z` and links no mimalloc.
+2. ~~**R1 (bounds-check elimination)**~~ — INVESTIGATED, dropped: measured
+   sub-noise (~0) ceiling; bounds checks are effectively free at opt-level=3.
+3. ~~**R2 (3-address binop fusion)**~~ — DONE. Flow opcodes −23.5%, run −6.8% on
+   C-LEARN; a late `fuse_three_address` pass at Vm::new (the `CompiledSimulation`
+   stays symbolizable). A full register VM would cut more but is a large rewrite.
+4. **R4 (RuntimeView)** — now the largest remaining run lever for arrayed models;
+   the ~10% `flat_offset` cost is a per-element `SmallVec` rebuild + sparse
+   search, not bounds checks.
+5. **R3 superinstructions** — incremental dispatch wins, low risk.
+6. **C2 / C3** — only if incremental-compile latency still bites after A+B.
diff --git a/src/libsimlin/Cargo.toml b/src/libsimlin/Cargo.toml
index 9f27ea2ff..1585091dd 100644
--- a/src/libsimlin/Cargo.toml
+++ b/src/libsimlin/Cargo.toml
@@ -15,6 +15,12 @@ default = []
 debug-derive = ["simlin-engine/debug-derive"]
 file_io = ["simlin-engine/file_io"]
 ext_data = ["simlin-engine/ext_data"]
+# Use mimalloc as the global allocator. Off by default; enabled by native
+# consumers of the cdylib/staticlib (pysimlin via cffi, C/C++ FFI) and by
+# simlin-cli via its `simlin` dependency, where the allocation-heavy engine
+# compile path benefits. Never enabled for the wasm32 bundle (the wasm build uses
+# --no-default-features); lib.rs also cfg's the global_allocator off for wasm32.
+mimalloc = ["dep:mimalloc"]
 
 [dependencies]
 simlin-engine = { version = "0.1", path = "../simlin-engine", default-features = false, features = ["png_render"] }
@@ -22,6 +28,7 @@ prost = "0.14"
 serde_json = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 anyhow = "1.0"
+mimalloc = { version = "0.1", optional = true }
 
 [dev-dependencies]
 
diff --git a/src/libsimlin/src/lib.rs b/src/libsimlin/src/lib.rs
index f9e45b1e1..11eaa6516 100644
--- a/src/libsimlin/src/lib.rs
+++ b/src/libsimlin/src/lib.rs
@@ -22,6 +22,17 @@
 //! Shared types (enums, structs, helpers) live here in `lib.rs` and are
 //! imported by the modules via `crate::`.
 
+// Native consumers of this cdylib/staticlib (pysimlin via cffi, C/C++ FFI) opt
+// into mimalloc with the `mimalloc` feature: the engine compile path is
+// allocation-heavy (millions of small, short-lived allocations) and mimalloc
+// roughly halves allocator time vs the system malloc. Never enabled for the
+// wasm32 bundle. See docs/design/engine-performance.md. This is the Rust global
+// allocator and is independent of the `simlin_malloc`/`simlin_free`
+// cross-boundary helpers in `memory`.
+#[cfg(all(feature = "mimalloc", not(target_arch = "wasm32")))]
+#[global_allocator]
+static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
 use anyhow::{Error as AnyError, Result};
 use simlin_engine::{self as engine};
 use std::collections::HashMap;
diff --git a/src/pysimlin/Makefile b/src/pysimlin/Makefile
index a23818997..311e02b74 100644
--- a/src/pysimlin/Makefile
+++ b/src/pysimlin/Makefile
@@ -1,8 +1,10 @@
 .PHONY: all build test clean install dev lint type-check
 
-# Build the libsimlin library
+# Build the libsimlin library. mimalloc: the engine compile path is
+# allocation-heavy and mimalloc roughly halves allocator time on native builds
+# (see docs/design/engine-performance.md).
 build-lib:
-	cd ../libsimlin && cargo build --release
+	cd ../libsimlin && cargo build --release --features mimalloc
 
 # Build the Python package
 build: build-lib
diff --git a/src/pysimlin/scripts/build_wheels.py b/src/pysimlin/scripts/build_wheels.py
index 89e1f115a..83c0d2edd 100644
--- a/src/pysimlin/scripts/build_wheels.py
+++ b/src/pysimlin/scripts/build_wheels.py
@@ -37,9 +37,11 @@ def build_libsimlin() -> Path:
     project_root = Path(__file__).parent.parent.parent.parent
     libsimlin_dir = project_root / "libsimlin"
     
-    # Build the library
+    # Build the library. The mimalloc feature swaps in mimalloc as the global
+    # allocator: the engine compile path is allocation-heavy and mimalloc roughly
+    # halves allocator time on native builds (docs/design/engine-performance.md).
     subprocess.run(
-        ["cargo", "build", "--release"],
+        ["cargo", "build", "--release", "--features", "mimalloc"],
         cwd=libsimlin_dir,
         check=True
     )
diff --git a/src/simlin-cli/Cargo.toml b/src/simlin-cli/Cargo.toml
index 22bfe7ed0..5216d9118 100644
--- a/src/simlin-cli/Cargo.toml
+++ b/src/simlin-cli/Cargo.toml
@@ -14,4 +14,8 @@ clap = { version = "4", features = ["derive"] }
 stringreader = "0.1"
 sha2 = "0.10"
 simlin-engine = { version = "0.1", path = "../simlin-engine", features = ["file_io"] }
-simlin = { version = "0.1", path = "../libsimlin" }
+# `mimalloc` installs mimalloc as the process global allocator (this binary runs
+# the allocation-heavy engine compile path). Routed through libsimlin rather than
+# a direct dep + local `#[global_allocator]` so there is exactly one global
+# allocator in this artifact even under `cargo clippy --all-features`.
+simlin = { version = "0.1", path = "../libsimlin", features = ["mimalloc"] }
diff --git a/src/simlin-cli/src/main.rs b/src/simlin-cli/src/main.rs
index 5eb52e924..de9f88260 100644
--- a/src/simlin-cli/src/main.rs
+++ b/src/simlin-cli/src/main.rs
@@ -2,6 +2,14 @@
 // Use of this source code is governed by the Apache License,
 // Version 2.0, that can be found in the LICENSE file.
 
+// mimalloc on native builds (the engine compile path is allocation-heavy;
+// mimalloc roughly halves allocator time -- see docs/design/engine-performance.md)
+// comes via the `simlin/mimalloc` feature on the libsimlin dependency, which
+// installs the global allocator. We deliberately do NOT declare a second
+// `#[global_allocator]` here: this binary links libsimlin, and two global
+// allocators in one artifact is a compile error (notably under
+// `cargo clippy --all-features`, which enables libsimlin's feature).
+
 use std::fs::File;
 use std::io::{BufRead, BufReader, Write};
 use std::path::PathBuf;
diff --git a/src/simlin-engine/examples/clearn_profile.rs b/src/simlin-engine/examples/clearn_profile.rs
new file mode 100644
index 000000000..7cc9a774b
--- /dev/null
+++ b/src/simlin-engine/examples/clearn_profile.rs
@@ -0,0 +1,264 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+//! Standalone profiling harness for the C-LEARN hero model.
+//!
+//! Times each pipeline stage (parse, compile-via-salsa, VM construction, run)
+//! and reports allocation counts / peak live bytes per stage via a counting
+//! global allocator. Designed as a focused `perf record` / heaptrack target:
+//! set `CLEARN_PROFILE=compile` or `CLEARN_PROFILE=run` and a high iteration
+//! count to give an external sampler sustained signal on one stage.
+//!
+//! Usage:
+//!   cargo run --release -p simlin-engine --example clearn_profile
+//!   CLEARN_COMPILE_ITERS=20 CLEARN_PROFILE=compile \
+//!     perf record -g -- target/release/examples/clearn_profile
+//!   CLEARN_RUN_ITERS=200 CLEARN_PROFILE=run \
+//!     perf record -g -- target/release/examples/clearn_profile
+//!
+//! Environment:
+//!   CLEARN_MODEL          override the .mdl path
+//!   CLEARN_COMPILE_ITERS  extra compile-only iterations (default 0)
+//!   CLEARN_RUN_ITERS      extra run-only iterations (default 0)
+//!   CLEARN_PROFILE        extra-iteration loop(s) to run: "compile" | "run" | "both" (default)
+//!   CLEARN_COUNT_ALLOCS   any value but "0" enables allocation counting (distorts timing)
+
+use std::alloc::{GlobalAlloc, Layout, System as Backing};
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::time::Instant;
+
+use simlin_engine::db::{SimlinDb, compile_project_incremental, sync_from_datamodel_incremental};
+use simlin_engine::{CompiledSimulation, Vm, open_vensim};
+
+// --- Counting allocator -----------------------------------------------------
+//
+// Tracks cumulative allocation calls/bytes plus live bytes and a high-water
+// mark. compile_project_incremental can fan out across rayon threads, so all
+// counters are atomic and the peak is maintained with a CAS loop. The default
+// GlobalAlloc::realloc routes through our alloc/dealloc, so realloc is counted
+// without an explicit override.
+
+struct Counting;
+
+static COUNTING_ON: AtomicBool = AtomicBool::new(false);
+static ALLOC_CALLS: AtomicUsize = AtomicUsize::new(0);
+static ALLOC_BYTES: AtomicUsize = AtomicUsize::new(0);
+static LIVE_BYTES: AtomicUsize = AtomicUsize::new(0);
+static PEAK_BYTES: AtomicUsize = AtomicUsize::new(0);
+
+unsafe impl GlobalAlloc for Counting {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        let p = unsafe { Backing.alloc(layout) };
+        // Counting is gated so the default run measures true wall-clock without
+        // per-allocation atomic overhead. Enable with CLEARN_COUNT_ALLOCS=1 to
+        // get allocation counts (at the cost of distorted timing).
+        if !p.is_null() && COUNTING_ON.load(Ordering::Relaxed) {
+            ALLOC_CALLS.fetch_add(1, Ordering::Relaxed);
+            ALLOC_BYTES.fetch_add(layout.size(), Ordering::Relaxed);
+            let live = LIVE_BYTES.fetch_add(layout.size(), Ordering::Relaxed) + layout.size();
+            let mut peak = PEAK_BYTES.load(Ordering::Relaxed);
+            while live > peak {
+                match PEAK_BYTES.compare_exchange_weak(
+                    peak,
+                    live,
+                    Ordering::Relaxed,
+                    Ordering::Relaxed,
+                ) {
+                    Ok(_) => break,
+                    Err(observed) => peak = observed,
+                }
+            }
+        }
+        p
+    }
+
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        unsafe { Backing.dealloc(ptr, layout) };
+        if COUNTING_ON.load(Ordering::Relaxed) {
+            LIVE_BYTES.fetch_sub(layout.size(), Ordering::Relaxed);
+        }
+    }
+}
+
+#[global_allocator]
+static GLOBAL: Counting = Counting;
+
+#[derive(Clone, Copy)]
+struct Snap {
+    calls: usize,
+    bytes: usize,
+    live: usize,
+}
+
+fn snap() -> Snap {
+    Snap {
+        calls: ALLOC_CALLS.load(Ordering::Relaxed),
+        bytes: ALLOC_BYTES.load(Ordering::Relaxed),
+        live: LIVE_BYTES.load(Ordering::Relaxed),
+    }
+}
+
+/// Reset the peak high-water mark to the current live bytes so the next phase's
+/// peak is measured relative to its own starting point.
+fn reset_peak() {
+    PEAK_BYTES.store(LIVE_BYTES.load(Ordering::Relaxed), Ordering::Relaxed);
+}
+
+fn mib(bytes: usize) -> f64 {
+    bytes as f64 / (1024.0 * 1024.0)
+}
+
+/// Run `f` as a measured phase: report wall time, allocation calls/bytes during
+/// the phase, net retained (live) bytes, and peak live bytes reached.
+fn phase<T>(name: &str, f: impl FnOnce() -> T) -> T {
+    reset_peak();
+    let before = snap();
+    let t0 = Instant::now();
+    let out = f();
+    let elapsed = t0.elapsed();
+    let after = snap();
+    let peak = PEAK_BYTES.load(Ordering::Relaxed);
+
+    let calls = after.calls - before.calls;
+    let bytes = after.bytes - before.bytes;
+    let retained = after.live as i64 - before.live as i64;
+
+    println!(
+        "{name:<22} {:>9.2} ms | allocs {:>10} | alloc'd {:>9.1} MiB | retained {:>+8.1} MiB | peak {:>8.1} MiB",
+        elapsed.as_secs_f64() * 1000.0,
+        calls,
+        mib(bytes),
+        retained as f64 / (1024.0 * 1024.0),
+        mib(peak),
+    );
+    out
+}
+
+fn model_path() -> String {
+    if let Ok(p) = std::env::var("CLEARN_MODEL") {
+        return p;
+    }
+    format!(
+        "{}/../../test/xmutil_test_models/C-LEARN v77 for Vensim.mdl",
+        env!("CARGO_MANIFEST_DIR")
+    )
+}
+
+fn compile_once(datamodel: &simlin_engine::datamodel::Project) -> CompiledSimulation {
+    let mut db = SimlinDb::default();
+    let sync = sync_from_datamodel_incremental(&mut db, datamodel, None);
+    compile_project_incremental(&db, sync.project, "main").unwrap()
+}
+
+fn env_usize(key: &str, default: usize) -> usize {
+    std::env::var(key)
+        .ok()
+        .and_then(|v| v.parse().ok())
+        .unwrap_or(default)
+}
+
+fn main() {
+    let path = model_path();
+    let compile_iters = env_usize("CLEARN_COMPILE_ITERS", 0);
+    let run_iters = env_usize("CLEARN_RUN_ITERS", 0);
+    let which = std::env::var("CLEARN_PROFILE").unwrap_or_else(|_| "both".to_string());
+    if std::env::var("CLEARN_COUNT_ALLOCS").is_ok_and(|v| v != "0") {
+        COUNTING_ON.store(true, Ordering::Relaxed);
+    }
+
+    println!("model: {path}");
+
+    let contents = phase("read_file", || {
+        std::fs::read_to_string(&path).unwrap_or_else(|e| panic!("failed to read {path}: {e}"))
+    });
+    println!(
+        "  source: {} bytes, {} lines",
+        contents.len(),
+        contents.lines().count()
+    );
+
+    let datamodel = phase("parse (open_vensim)", || open_vensim(&contents).unwrap());
+    let n_models = datamodel.models.len();
+    let n_vars: usize = datamodel.models.iter().map(|m| m.variables.len()).sum();
+    println!(
+        "  models: {n_models}, datamodel variables: {n_vars}, dims: {}",
+        datamodel.dimensions.len()
+    );
+
+    let compiled = phase("compile (salsa)", || compile_once(&datamodel));
+    println!("  n_slots (root): {}", compiled.n_slots());
+
+    let prof = compiled.bytecode_profile();
+    println!(
+        "  bytecode: {} opcodes ({:.1} KiB @ 8B) = {} flow + {} stock + {} initial ({} initials)",
+        prof.total_opcodes,
+        (prof.total_opcodes * 8) as f64 / 1024.0,
+        prof.flow_opcodes,
+        prof.stock_opcodes,
+        prof.initial_opcodes,
+        prof.n_initials,
+    );
+    println!(
+        "  flow opcodes after 3-address fusion (est): {} -> {} ({:.1}% reduction)",
+        prof.flow_opcodes,
+        prof.flow_opcodes_after_fusion,
+        100.0 * (prof.flow_opcodes - prof.flow_opcodes_after_fusion) as f64
+            / prof.flow_opcodes.max(1) as f64, // max(1): print 0.0%, not NaN, with no flow opcodes
+    );
+    println!(
+        "  tables: {} literals, {} GFs / {} points, {} temp slots, {} dims, {} static_views, {} dim_lists, {} names, {} modules",
+        prof.total_literals,
+        prof.graphical_functions,
+        prof.graphical_function_points,
+        prof.temp_storage_slots,
+        prof.dimensions,
+        prof.static_views,
+        prof.dim_lists,
+        prof.names,
+        prof.n_modules,
+    );
+    let mut hist: Vec<_> = prof.histogram.iter().collect();
+    hist.sort_by(|a, b| b.1.cmp(a.1));
+    println!("  opcode histogram (top 25 of {}):", prof.histogram.len());
+    for (name, count) in hist.iter().take(25) {
+        let pct = **count as f64 / prof.total_opcodes as f64 * 100.0;
+        println!("    {name:<22} {count:>9}  {pct:>5.1}%");
+    }
+
+    let mut vm = phase("Vm::new", || Vm::new(compiled.clone()).unwrap());
+    println!("  variables (offsets): {}", vm.names_as_strs().len());
+
+    phase("run_to_end", || vm.run_to_end().unwrap());
+    let results = vm.into_results();
+    println!(
+        "  result slots/step: {}, saved steps: {}",
+        results.step_size, results.step_count
+    );
+
+    // Extra-iteration loops for external samplers (perf/heaptrack). Kept out of
+    // the per-phase accounting above; these print only aggregate timing.
+    let do_compile = which == "both" || which == "compile";
+    let do_run = which == "both" || which == "run";
+
+    if compile_iters > 0 && do_compile {
+        let t0 = Instant::now();
+        for _ in 0..compile_iters {
+            std::hint::black_box(compile_once(&datamodel));
+        }
+        let per = t0.elapsed().as_secs_f64() * 1000.0 / compile_iters as f64;
+        println!("compile x{compile_iters}: {per:.2} ms/iter");
+    }
+
+    if run_iters > 0 && do_run {
+        // Reuse `compiled` from above: a fresh compile here would pollute a run-only profile.
+        let t0 = Instant::now();
+        for _ in 0..run_iters {
+            let mut vm = Vm::new(compiled.clone()).unwrap();
+            vm.run_to_end().unwrap();
+            std::hint::black_box(&vm);
+        }
+        let per = t0.elapsed().as_secs_f64() * 1000.0 / run_iters as f64;
+        println!("run x{run_iters}: {per:.2} ms/iter (incl. Vm::new + clone)");
+    }
+}
diff --git a/src/simlin-engine/src/ast/expr0.rs b/src/simlin-engine/src/ast/expr0.rs
index 901469fa7..96c2cc295 100644
--- a/src/simlin-engine/src/ast/expr0.rs
+++ b/src/simlin-engine/src/ast/expr0.rs
@@ -2,7 +2,7 @@
 // Use of this source code is governed by the Apache License,
 // Version 2.0, that can be found in the LICENSE file.
 
-use crate::builtins::{Loc, UntypedBuiltinFn, is_0_arity_builtin_fn};
+use crate::builtins::{Loc, UntypedBuiltinFn, is_0_arity_builtin_fn_ci};
 use crate::common::{EquationError, RawIdent};
 use crate::lexer::LexerType;
 use std::result::Result as StdResult;
@@ -180,9 +180,13 @@ impl Expr0 {
     fn reify_0_arity_builtins(self) -> Self {
         match self {
             Expr0::Var(ref id, loc) => {
-                // Check for 0-arity builtins using lowercase version
-                let lowercase_id = id.as_str().to_lowercase();
-                if is_0_arity_builtin_fn(&lowercase_id) {
+                // Allocation-free membership test first: the vast majority of
+                // variable references are not 0-arity builtins, so we avoid the
+                // per-reference to_lowercase() heap allocation on the hot parse
+                // path and only materialize the lowercased name in the rare case
+                // a genuine `pi`/`time`/etc. reference must be reified.
+                if is_0_arity_builtin_fn_ci(id.as_str()) {
+                    let lowercase_id = id.as_str().to_lowercase();
                     Expr0::App(UntypedBuiltinFn(lowercase_id, vec![]), loc)
                 } else {
                     self
diff --git a/src/simlin-engine/src/builtins.rs b/src/simlin-engine/src/builtins.rs
index 3edcd630f..4b5e4e4bc 100644
--- a/src/simlin-engine/src/builtins.rs
+++ b/src/simlin-engine/src/builtins.rs
@@ -366,6 +366,30 @@ pub fn is_0_arity_builtin_fn(name: &str) -> bool {
     )
 }
 
+/// ASCII case-insensitive, allocation-free variant of [`is_0_arity_builtin_fn`].
+///
+/// The 0-arity builtin names are all ASCII, so a name containing any non-ASCII
+/// byte cannot match, and ASCII case-folding yields the same membership verdict
+/// as Unicode lowercasing for this fixed ASCII set. Used on the hot parse path
+/// (`Expr0::reify_0_arity_builtins`), which previously allocated a `String` via
+/// `to_lowercase()` for *every* variable reference just to test membership.
+pub fn is_0_arity_builtin_fn_ci(name: &str) -> bool {
+    const NAMES: [&str; 9] = [
+        "inf",
+        "pi",
+        "time",
+        "time_step",
+        "dt",
+        "initial_time",
+        "starttime",
+        "final_time",
+        "stoptime",
+    ];
+    NAMES
+        .iter()
+        .any(|candidate| name.eq_ignore_ascii_case(candidate))
+}
+
 /// Returns true if `func_name` (already lowercased) names a function that
 /// expands to a stdlib module: the canonical names in `MODEL_NAMES` plus
 /// the alias forms `delay`, `delayn`, and `smthn`.
@@ -546,6 +570,54 @@ fn test_is_0_arity_builtin_fn() {
     assert!(is_0_arity_builtin_fn("time"));
 }
 
+#[test]
+fn test_is_0_arity_builtin_fn_ci() {
+    const NAMES: [&str; 9] = [
+        "inf",
+        "pi",
+        "time",
+        "time_step",
+        "dt",
+        "initial_time",
+        "starttime",
+        "final_time",
+        "stoptime",
+    ];
+    for name in NAMES {
+        assert!(is_0_arity_builtin_fn_ci(name), "lowercase {name}");
+        assert!(
+            is_0_arity_builtin_fn_ci(&name.to_uppercase()),
+            "uppercase {name}"
+        );
+    }
+    assert!(is_0_arity_builtin_fn_ci("Time"));
+    assert!(is_0_arity_builtin_fn_ci("Final_Time"));
+    assert!(!is_0_arity_builtin_fn_ci("lookup"));
+    assert!(!is_0_arity_builtin_fn_ci("times"));
+    assert!(!is_0_arity_builtin_fn_ci(""));
+    // A non-ASCII name can never match (every builtin name is ASCII).
+    assert!(!is_0_arity_builtin_fn_ci("pï"));
+
+    // Equivalent to to_lowercase() + is_0_arity_builtin_fn for any ASCII input,
+    // which is the behavior the hot-path caller relies on.
+    for s in [
+        "TIME",
+        "Pi",
+        "Dt",
+        "Final_Time",
+        "STOPTIME",
+        "foo",
+        "lookuptable",
+        "timestep",
+    ] {
+        assert_eq!(
+            is_0_arity_builtin_fn_ci(s),
+            is_0_arity_builtin_fn(&s.to_lowercase()),
+            "ci/lowercase mismatch for {s}"
+        );
+    }
+}
+
 #[test]
 fn test_name() {
     enum TestExpr {}
diff --git a/src/simlin-engine/src/bytecode.rs b/src/simlin-engine/src/bytecode.rs
index 77bf10ea9..71d157041 100644
--- a/src/simlin-engine/src/bytecode.rs
+++ b/src/simlin-engine/src/bytecode.rs
@@ -645,6 +645,44 @@ pub(crate) enum Opcode {
         off: VariableOffset,
     },
 
+    // === 3-ADDRESS BINARY OPS (R2) ===
+    // Fold the leaf operand load(s) of a binary op into the op itself, so a
+    // subexpression `a op b` dispatches once instead of 3 times (two loads + Op2)
+    // or once instead of twice (one load + Op2). Each pushes its result. `curr[]` is
+    // effectively the register file: these read operands straight from it (or
+    // from `literals`) with no intervening stack push/pop. Created only by the
+    // late `fuse_three_address` pass on final concrete bytecode -- they never
+    // enter the symbolic/incremental layer. A 3-operand `dst = a op b` would
+    // exceed the 8-byte Opcode budget, so the assign stays a separate op.
+    /// Push `curr[module_off + l] op curr[module_off + r]`.
+    BinVarVar {
+        l: VariableOffset,
+        r: VariableOffset,
+        op: Op2,
+    },
+    /// Push `curr[module_off + l] op literals[r]`.
+    BinVarConst {
+        l: VariableOffset,
+        r: LiteralId,
+        op: Op2,
+    },
+    /// Push `literals[l] op curr[module_off + r]`.
+    BinConstVar {
+        l: LiteralId,
+        r: VariableOffset,
+        op: Op2,
+    },
+    /// Pop `lhs`; push `lhs op curr[module_off + r]`.
+    BinStackVar {
+        r: VariableOffset,
+        op: Op2,
+    },
+    /// Pop `lhs`; push `lhs op literals[r]`.
+    BinStackConst {
+        r: LiteralId,
+        op: Op2,
+    },
+
     // =========================================================================
     // ARRAY SUPPORT (new)
     // =========================================================================
@@ -979,6 +1017,14 @@ impl Opcode {
             Opcode::BinOpAssignCurr { .. } => (2, 0), // pops 2, assigns directly
             Opcode::BinOpAssignNext { .. } => (2, 0), // pops 2, assigns directly
 
+            // 3-address binops: the *Var/*Const forms read both operands from
+            // curr/literals and push (0 pops, 1 push); the Stack* forms pop the
+            // lhs and push the result (1 pop, 1 push).
+            Opcode::BinVarVar { .. } | Opcode::BinVarConst { .. } | Opcode::BinConstVar { .. } => {
+                (0, 1)
+            }
+            Opcode::BinStackVar { .. } | Opcode::BinStackConst { .. } => (1, 1),
+
             // View stack ops don't touch arithmetic stack
             Opcode::PushVarView { .. }
             | Opcode::PushTempView { .. }
@@ -1042,6 +1088,81 @@ impl Opcode {
             Opcode::NextBroadcastOrJump { .. } => (0, 0),
         }
     }
+
+    /// Static variant name, independent of payload. Used for bytecode-composition
+    /// profiling (opcode histograms) and human-readable diagnostics without
+    /// depending on the optional `debug-derive` Debug impl.
+    pub(crate) fn name(&self) -> &'static str {
+        match self {
+            Opcode::Op2 { .. } => "Op2",
+            Opcode::Not {} => "Not",
+            Opcode::LoadConstant { .. } => "LoadConstant",
+            Opcode::LoadVar { .. } => "LoadVar",
+            Opcode::LoadGlobalVar { .. } => "LoadGlobalVar",
+            Opcode::LoadPrev { .. } => "LoadPrev",
+            Opcode::LoadInitial { .. } => "LoadInitial",
+            Opcode::PushSubscriptIndex { .. } => "PushSubscriptIndex",
+            Opcode::LoadSubscript { .. } => "LoadSubscript",
+            Opcode::SetCond {} => "SetCond",
+            Opcode::If {} => "If",
+            Opcode::Ret => "Ret",
+            Opcode::LoadModuleInput { .. } => "LoadModuleInput",
+            Opcode::EvalModule { .. } => "EvalModule",
+            Opcode::AssignCurr { .. } => "AssignCurr",
+            Opcode::AssignNext { .. } => "AssignNext",
+            Opcode::Apply { .. } => "Apply",
+            Opcode::Lookup { .. } => "Lookup",
+            Opcode::AssignConstCurr { .. } => "AssignConstCurr",
+            Opcode::BinVarVar { .. } => "BinVarVar",
+            Opcode::BinVarConst { .. } => "BinVarConst",
+            Opcode::BinConstVar { .. } => "BinConstVar",
+            Opcode::BinStackVar { .. } => "BinStackVar",
+            Opcode::BinStackConst { .. } => "BinStackConst",
+            Opcode::BinOpAssignCurr { .. } => "BinOpAssignCurr",
+            Opcode::BinOpAssignNext { .. } => "BinOpAssignNext",
+            Opcode::PushVarView { .. } => "PushVarView",
+            Opcode::PushTempView { .. } => "PushTempView",
+            Opcode::PushStaticView { .. } => "PushStaticView",
+            Opcode::PushVarViewDirect { .. } => "PushVarViewDirect",
+            Opcode::ViewSubscriptConst { .. } => "ViewSubscriptConst",
+            Opcode::ViewSubscriptDynamic { .. } => "ViewSubscriptDynamic",
+            Opcode::ViewRange { .. } => "ViewRange",
+            Opcode::ViewRangeDynamic { .. } => "ViewRangeDynamic",
+            Opcode::ViewStarRange { .. } => "ViewStarRange",
+            Opcode::ViewWildcard { .. } => "ViewWildcard",
+            Opcode::ViewTranspose {} => "ViewTranspose",
+            Opcode::PopView {} => "PopView",
+            Opcode::DupView {} => "DupView",
+            Opcode::LoadTempConst { .. } => "LoadTempConst",
+            Opcode::LoadTempDynamic { .. } => "LoadTempDynamic",
+            Opcode::BeginIter { .. } => "BeginIter",
+            Opcode::LoadIterElement {} => "LoadIterElement",
+            Opcode::LoadIterTempElement { .. } => "LoadIterTempElement",
+            Opcode::LoadIterViewTop {} => "LoadIterViewTop",
+            Opcode::LoadIterViewAt { .. } => "LoadIterViewAt",
+            Opcode::StoreIterElement {} => "StoreIterElement",
+            Opcode::NextIterOrJump { .. } => "NextIterOrJump",
+            Opcode::EndIter {} => "EndIter",
+            Opcode::ArraySum {} => "ArraySum",
+            Opcode::ArrayMax {} => "ArrayMax",
+            Opcode::ArrayMin {} => "ArrayMin",
+            Opcode::ArrayMean {} => "ArrayMean",
+            Opcode::ArrayStddev {} => "ArrayStddev",
+            Opcode::ArraySize {} => "ArraySize",
+            Opcode::VectorSelect {} => "VectorSelect",
+            Opcode::VectorElmMap { .. } => "VectorElmMap",
+            Opcode::VectorSortOrder { .. } => "VectorSortOrder",
+            Opcode::Rank { .. } => "Rank",
+            Opcode::LookupArray { .. } => "LookupArray",
+            Opcode::AllocateAvailable { .. } => "AllocateAvailable",
+            Opcode::AllocateByPriority { .. } => "AllocateByPriority",
+            Opcode::BeginBroadcastIter { .. } => "BeginBroadcastIter",
+            Opcode::LoadBroadcastElement { .. } => "LoadBroadcastElement",
+            Opcode::StoreBroadcastElement {} => "StoreBroadcastElement",
+            Opcode::NextBroadcastOrJump { .. } => "NextBroadcastOrJump",
+            Opcode::EndBroadcastIter {} => "EndBroadcastIter",
+        }
+    }
 }
 
 // ============================================================================
@@ -1256,6 +1377,45 @@ impl ByteCode {
         }
         max_depth
     }
+
+    /// Estimate the opcode count after a 3-address fusion pass that folds leaf
+    /// operand loads into the binary op that consumes them (R2). Greedy,
+    /// post-peephole semantics: `LoadX; LoadY; Op2` counts as 3->1 and
+    /// `LoadX; Op2` (lhs already on the stack) as 2->1, where a leaf load is
+    /// `LoadVar`, `LoadGlobalVar`, or `LoadConstant`. A sizing estimate, not a
+    /// mirror of `fuse_three_address`: that pass skips `LoadGlobalVar` and
+    /// const/const triples and never absorbs a jump target; none is modelled here.
+    pub(crate) fn estimate_fused_len(&self) -> usize {
+        fn is_leaf_load(op: &Opcode) -> bool {
+            matches!(
+                op,
+                Opcode::LoadVar { .. } | Opcode::LoadGlobalVar { .. } | Opcode::LoadConstant { .. }
+            )
+        }
+        let code = &self.code;
+        let mut i = 0;
+        let mut emitted = 0;
+        while i < code.len() {
+            if i + 2 < code.len()
+                && is_leaf_load(&code[i])
+                && is_leaf_load(&code[i + 1])
+                && matches!(code[i + 2], Opcode::Op2 { .. })
+            {
+                emitted += 1;
+                i += 3;
+            } else if i + 1 < code.len()
+                && is_leaf_load(&code[i])
+                && matches!(code[i + 1], Opcode::Op2 { .. })
+            {
+                emitted += 1;
+                i += 2;
+            } else {
+                emitted += 1;
+                i += 1;
+            }
+        }
+        emitted
+    }
 }
 
 #[cfg_attr(feature = "debug-derive", derive(Debug))]
@@ -1403,6 +1563,145 @@ impl ByteCode {
 
         self.code = optimized;
     }
+
+    /// Late 3-address fusion pass (R2): fold the leaf operand load(s) of a
+    /// binary op into the op itself, so `LoadX; LoadY; Op2` becomes one
+    /// `BinXY` (3->1) and `LoadX; Op2` (lhs already on the stack) becomes one
+    /// `BinStackX` (2->1).
+    ///
+    /// MUST run only on FINAL concrete bytecode -- after `peephole_optimize`
+    /// and, for the incremental path, after `resolve` -- because the fused
+    /// opcodes deliberately do not exist in the symbolic/incremental layer.
+    /// Greedy, longest-match-first. Reuses the same jump-target guard and
+    /// old->new PC remap as `peephole_optimize`: a run is only fused when the
+    /// instructions it *absorbs* (the second, and for a triple the third) are
+    /// not jump targets, so no jump can land mid-fusion; a jump to the first
+    /// instruction still lands on the fused opcode at the same new PC.
+    /// Stack-effect-preserving (the fused ops carry the net effect of the
+    /// sequence they replace), so the `max_stack_depth` safety proof is
+    /// unchanged.
+    pub(crate) fn fuse_three_address(&mut self) {
+        if self.code.is_empty() {
+            return;
+        }
+
+        // 1. Build set of PCs that are jump targets.
+        let mut jump_targets = vec![false; self.code.len()];
+        for (pc, op) in self.code.iter().enumerate() {
+            if let Some(offset) = op.jump_offset() {
+                let target = (pc as isize + offset as isize) as usize;
+                assert!(
+                    target < jump_targets.len(),
+                    "jump at pc {pc} targets {target}, out of bounds (len {})",
+                    self.code.len()
+                );
+                jump_targets[target] = true;
+            }
+        }
+
+        // 2. Greedy fuse, building an old_pc -> new_pc map (one entry per
+        //    original instruction) for jump fixup.
+        let mut optimized: Vec<Opcode> = Vec::with_capacity(self.code.len());
+        let mut pc_map: Vec<usize> = Vec::with_capacity(self.code.len() + 1);
+        let mut i = 0;
+        while i < self.code.len() {
+            let new_pc = optimized.len();
+
+            // 3-window: [leaf load, leaf load, Op2]. Both absorbed instructions
+            // (i+1, i+2) must not be jump targets.
+            let three = i + 2 < self.code.len()
+                && !jump_targets[i + 1]
+                && !jump_targets[i + 2]
+                && matches!(self.code[i + 2], Opcode::Op2 { .. });
+            let fused3 = if three {
+                match (&self.code[i], &self.code[i + 1], &self.code[i + 2]) {
+                    (
+                        Opcode::LoadVar { off: l },
+                        Opcode::LoadVar { off: r },
+                        Opcode::Op2 { op },
+                    ) => Some(Opcode::BinVarVar {
+                        l: *l,
+                        r: *r,
+                        op: *op,
+                    }),
+                    (
+                        Opcode::LoadVar { off: l },
+                        Opcode::LoadConstant { id: r },
+                        Opcode::Op2 { op },
+                    ) => Some(Opcode::BinVarConst {
+                        l: *l,
+                        r: *r,
+                        op: *op,
+                    }),
+                    (
+                        Opcode::LoadConstant { id: l },
+                        Opcode::LoadVar { off: r },
+                        Opcode::Op2 { op },
+                    ) => Some(Opcode::BinConstVar {
+                        l: *l,
+                        r: *r,
+                        op: *op,
+                    }),
+                    _ => None,
+                }
+            } else {
+                None
+            };
+            if let Some(op) = fused3 {
+                optimized.push(op);
+                pc_map.push(new_pc); // old i
+                pc_map.push(new_pc); // old i+1
+                pc_map.push(new_pc); // old i+2
+                i += 3;
+                continue;
+            }
+
+            // 2-window: [leaf load, Op2] with the lhs already on the stack.
+            let two = i + 1 < self.code.len()
+                && !jump_targets[i + 1]
+                && matches!(self.code[i + 1], Opcode::Op2 { .. });
+            let fused2 = if two {
+                match (&self.code[i], &self.code[i + 1]) {
+                    (Opcode::LoadVar { off: r }, Opcode::Op2 { op }) => {
+                        Some(Opcode::BinStackVar { r: *r, op: *op })
+                    }
+                    (Opcode::LoadConstant { id: r }, Opcode::Op2 { op }) => {
+                        Some(Opcode::BinStackConst { r: *r, op: *op })
+                    }
+                    _ => None,
+                }
+            } else {
+                None
+            };
+            if let Some(op) = fused2 {
+                optimized.push(op);
+                pc_map.push(new_pc); // old i
+                pc_map.push(new_pc); // old i+1
+                i += 2;
+                continue;
+            }
+
+            // No fusion: copy as-is.
+            pc_map.push(new_pc);
+            optimized.push(self.code[i]);
+            i += 1;
+        }
+        pc_map.push(optimized.len());
+
+        // 3. Fix up jump offsets via the old_pc -> new_pc map.
+        for (old_pc, op) in self.code.iter().enumerate() {
+            let Some(jump_back) = op.jump_offset() else {
+                continue;
+            };
+            let new_pc = pc_map[old_pc];
+            let old_target = (old_pc as isize + jump_back as isize) as usize;
+            let new_target = pc_map[old_target];
+            let new_jump_back = (new_target as isize - new_pc as isize) as PcOffset;
+            *optimized[new_pc].jump_offset_mut().unwrap() = new_jump_back;
+        }
+
+        self.code = optimized;
+    }
 }
 
 #[cfg(test)]
@@ -3099,6 +3398,224 @@ mod tests {
             assert_eq!(ids[0], i);
         }
     }
+
+    // === 3-address fusion (R2) ===
+
+    #[test]
+    fn test_fuse_var_var() {
+        // `a + b` collapses into one BinVarVar. The trailing assign is not
+        // folded in (a 3-operand op would exceed the 8-byte budget); it is left
+        // to the existing BinOpAssignCurr fusion.
+        let code = vec![
+            Opcode::LoadVar { off: 0 },
+            Opcode::LoadVar { off: 1 },
+            Opcode::Op2 { op: Op2::Add },
+            Opcode::AssignCurr { off: 2 },
+        ];
+        let mut bc = ByteCode {
+            literals: vec![],
+            code,
+        };
+        bc.fuse_three_address();
+
+        assert_eq!(bc.code.len(), 2);
+        let Opcode::BinVarVar { l, r, op } = bc.code[0] else {
+            panic!("expected BinVarVar at pc 0");
+        };
+        assert_eq!((l, r), (0, 1));
+        assert!(matches!(op, Op2::Add));
+        assert!(matches!(bc.code[1], Opcode::AssignCurr { off: 2 }));
+    }
+
+    #[test]
+    fn test_fuse_var_const_preserves_operand_order() {
+        // `a - 5` has the var as lhs and the const as rhs. Sub is non-commutative,
+        // so an encoding that swapped them would be a silent miscompile.
+        let code = vec![
+            Opcode::LoadVar { off: 7 },
+            Opcode::LoadConstant { id: 0 },
+            Opcode::Op2 { op: Op2::Sub },
+        ];
+        let mut bc = ByteCode {
+            literals: vec![5.0],
+            code,
+        };
+        bc.fuse_three_address();
+
+        assert_eq!(bc.code.len(), 1);
+        let Opcode::BinVarConst { l, r, op } = bc.code[0] else {
+            panic!("expected BinVarConst at pc 0");
+        };
+        // `l` carries the variable offset, `r` the literal id.
+        assert_eq!((l, r), (7, 0));
+        assert!(matches!(op, Op2::Sub));
+    }
+
+    #[test]
+    fn test_fuse_const_var_preserves_operand_order() {
+        // `5 - a` is the mirrored case: const as lhs, var as rhs.
+        let code = vec![
+            Opcode::LoadConstant { id: 0 },
+            Opcode::LoadVar { off: 7 },
+            Opcode::Op2 { op: Op2::Sub },
+        ];
+        let mut bc = ByteCode {
+            literals: vec![5.0],
+            code,
+        };
+        bc.fuse_three_address();
+
+        assert_eq!(bc.code.len(), 1);
+        let Opcode::BinConstVar { l, r, op } = bc.code[0] else {
+            panic!("expected BinConstVar at pc 0");
+        };
+        // `l` carries the literal id, `r` the variable offset.
+        assert_eq!((l, r), (0, 7));
+        assert!(matches!(op, Op2::Sub));
+    }
+
+    #[test]
+    fn test_fuse_greedy_triple_then_stack_var() {
+        // ((a + b) + c): greedy matching prefers the 3-window, so the leaf
+        // triple becomes BinVarVar. The outer `+ c` then has its lhs on the
+        // stack, and the load of c is folded into a BinStackVar.
+        let code = vec![
+            Opcode::LoadVar { off: 0 },
+            Opcode::LoadVar { off: 1 },
+            Opcode::Op2 { op: Op2::Add },
+            Opcode::LoadVar { off: 2 },
+            Opcode::Op2 { op: Op2::Add },
+        ];
+        let mut bc = ByteCode {
+            literals: vec![],
+            code,
+        };
+        bc.fuse_three_address();
+
+        assert_eq!(bc.code.len(), 2);
+        let Opcode::BinVarVar { l, r, op } = bc.code[0] else {
+            panic!("expected BinVarVar at pc 0");
+        };
+        assert_eq!((l, r), (0, 1));
+        assert!(matches!(op, Op2::Add));
+        let Opcode::BinStackVar { r, op } = bc.code[1] else {
+            panic!("expected BinStackVar at pc 1");
+        };
+        assert_eq!(r, 2);
+        assert!(matches!(op, Op2::Add));
+    }
+
+    #[test]
+    fn test_fuse_stack_const() {
+        // (a + b) * 2: the leaf triple becomes BinVarVar, and the outer `* 2`
+        // (lhs already on the stack) becomes BinStackConst.
+        let code = vec![
+            Opcode::LoadVar { off: 0 },
+            Opcode::LoadVar { off: 1 },
+            Opcode::Op2 { op: Op2::Add },
+            Opcode::LoadConstant { id: 0 },
+            Opcode::Op2 { op: Op2::Mul },
+        ];
+        let mut bc = ByteCode {
+            literals: vec![2.0],
+            code,
+        };
+        bc.fuse_three_address();
+
+        assert_eq!(bc.code.len(), 2);
+        assert!(matches!(bc.code[0], Opcode::BinVarVar { .. }));
+        let tail_ok = matches!(bc.code[1], Opcode::BinStackConst { r: 0, op: Op2::Mul });
+        assert!(tail_ok);
+    }
+
+    #[test]
+    fn test_fuse_noop_without_op2_and_empty() {
+        // No Op2 means nothing to fold into; an empty program stays empty too.
+        let code = vec![
+            Opcode::LoadConstant { id: 0 },
+            Opcode::AssignCurr { off: 0 },
+        ];
+        let mut bc = ByteCode {
+            literals: vec![1.0],
+            code,
+        };
+        bc.fuse_three_address();
+        let len_and_head_ok = matches!(bc.code.as_slice(), [Opcode::LoadConstant { id: 0 }, _]);
+        assert!(len_and_head_ok);
+        let mut blank = ByteCode::default();
+        blank.fuse_three_address();
+        assert_eq!(blank.code.len(), 0);
+    }
+
+    #[test]
+    fn test_fuse_preserves_max_stack_depth() {
+        // x = (a + b) * (c + d): peak depth 3. Fusion folds loads into ops, so depth
+        // can only hold or shrink, never grow (the Stack-safety proof must survive it).
+        let code = vec![
+            Opcode::LoadVar { off: 0 },
+            Opcode::LoadVar { off: 1 },
+            Opcode::Op2 { op: Op2::Add },
+            Opcode::LoadVar { off: 2 },
+            Opcode::LoadVar { off: 3 },
+            Opcode::Op2 { op: Op2::Add },
+            Opcode::Op2 { op: Op2::Mul },
+            Opcode::AssignCurr { off: 4 },
+        ];
+        let mut bc = ByteCode {
+            literals: vec![],
+            code,
+        };
+        let depth_before = bc.max_stack_depth();
+        bc.fuse_three_address();
+        assert!(depth_before >= bc.max_stack_depth());
+    }
+
+    #[test]
+    fn test_fuse_triple_with_jump_target_at_first_instruction() {
+        // The backward jump lands on the first instruction of a fusable triple.
+        // Fusion still applies -- the fused op takes over that first PC -- and
+        // the jump offset is rewritten so it lands on the fused op.
+        let code = vec![
+            Opcode::LoadVar { off: 0 },               // [0] <- jump target
+            Opcode::LoadVar { off: 1 },               // [1]
+            Opcode::Op2 { op: Op2::Add },             // [2]
+            Opcode::NextIterOrJump { jump_back: -3 }, // [3] -> [0]
+        ];
+        let mut bc = ByteCode {
+            literals: vec![],
+            code,
+        };
+        bc.fuse_three_address();
+
+        assert_eq!(bc.code.len(), 2);
+        assert!(matches!(bc.code[0], Opcode::BinVarVar { .. }));
+        let jump_ok = matches!(bc.code[1], Opcode::NextIterOrJump { jump_back: -1 });
+        assert!(jump_ok);
+    }
+
+    #[test]
+    fn test_fuse_blocked_when_absorbed_instruction_is_jump_target() {
+        // The jump lands on the Op2, i.e. on an instruction the triple would
+        // absorb. Fusing would strand the jump mid-fusion, so nothing may change.
+        let code = vec![
+            Opcode::LoadVar { off: 0 },               // [0]
+            Opcode::LoadVar { off: 1 },               // [1]
+            Opcode::Op2 { op: Op2::Add },             // [2] <- jump target
+            Opcode::NextIterOrJump { jump_back: -1 }, // [3] -> [2]
+        ];
+        let mut bc = ByteCode {
+            literals: vec![],
+            code,
+        };
+        bc.fuse_three_address();
+
+        assert_eq!(bc.code.len(), 4);
+        assert!(matches!(bc.code[0], Opcode::LoadVar { off: 0 }));
+        assert!(matches!(bc.code[1], Opcode::LoadVar { off: 1 }));
+        assert!(matches!(bc.code[2], Opcode::Op2 { op: Op2::Add }));
+        let jump_ok = matches!(bc.code[3], Opcode::NextIterOrJump { jump_back: -1 });
+        assert!(jump_ok);
+    }
 }
 
 /// A single variable's compiled initial-value bytecode, along with the
diff --git a/src/simlin-engine/src/compiler/symbolic.rs b/src/simlin-engine/src/compiler/symbolic.rs
index a2a9c6edf..66361f621 100644
--- a/src/simlin-engine/src/compiler/symbolic.rs
+++ b/src/simlin-engine/src/compiler/symbolic.rs
@@ -518,6 +518,17 @@ pub(crate) fn symbolize_opcode(
             op: *op,
             var: rmap.lookup(u32::from(*off))?,
         }),
+        // The 3-address fused binops are created by `ByteCode::fuse_three_address`,
+        // which runs only on FINAL concrete bytecode (after `resolve`), strictly
+        // after symbolization. They therefore never reach this function; seeing
+        // one means the fusion ran before symbolize, which is a compiler bug.
+        Opcode::BinVarVar { .. }
+        | Opcode::BinVarConst { .. }
+        | Opcode::BinConstVar { .. }
+        | Opcode::BinStackVar { .. }
+        | Opcode::BinStackConst { .. } => {
+            unreachable!("3-address fused binop reached symbolize_opcode")
+        }
         Opcode::PushVarView {
             base_off,
             dim_list_id,
@@ -1168,6 +1179,10 @@ pub(crate) fn resolve_module(
         })
         .collect::<Result<Vec<_>, String>>()?;
 
+    // `resolve_module` is a pure symbolic<->concrete primitive (the roundtrip
+    // tests symbolize its output again), so the 3-address fusion (R2) is NOT
+    // applied here -- it is applied later, at `Vm::new`, to the Vm's execution
+    // copy of the flows/stocks bytecode, which is never re-symbolized.
     let compiled_flows = resolve_bytecode(&sym.compiled_flows, layout)?;
     let compiled_stocks = resolve_bytecode(&sym.compiled_stocks, layout)?;
 
diff --git a/src/simlin-engine/src/db.rs b/src/simlin-engine/src/db.rs
index af019760a..74eef73da 100644
--- a/src/simlin-engine/src/db.rs
+++ b/src/simlin-engine/src/db.rs
@@ -3734,8 +3734,11 @@ pub fn compile_var_fragment(
     let var_ident_canonical: Ident<Canonical> = Ident::new(&var_ident);
 
     // Caller-owned, lowering-independent context (built only from
-    // project/variable data, never from the lowered equation).
-    let dm_dims = source_dims_to_datamodel(project.dimensions(db));
+    // project/variable data, never from the lowered equation). Use the
+    // salsa-cached project dims (returns(ref)) rather than re-running
+    // source_dims_to_datamodel on every variable -- this fragment compiler is
+    // invoked once per variable, and the datamodel dims are project-global.
+    let dm_dims = project_datamodel_dims(db, project);
     let dim_context = crate::dimensions::DimensionsContext::from(dm_dims.as_slice());
     let converted_dims: Vec<crate::dimensions::Dimension> = dm_dims
         .iter()
@@ -5337,7 +5340,10 @@ pub fn assemble_module(
         dim_lists: merged.dim_lists,
     };
 
-    // Resolve symbolic -> concrete offsets
+    // Resolve symbolic -> concrete offsets. The CompiledModule stays a pure,
+    // symbolizable artifact (the symbolic roundtrip tests symbolize it again,
+    // and salsa caches it); the 3-address fusion (R2) is applied later, at
+    // Vm::new, to the execution copy of the bytecode.
     resolve_module(&sym_module, layout).inspect_err(|msg| {
         try_accumulate_diagnostic(
             db,
diff --git a/src/simlin-engine/src/lib.rs b/src/simlin-engine/src/lib.rs
index 571d80c29..25478b323 100644
--- a/src/simlin-engine/src/lib.rs
+++ b/src/simlin-engine/src/lib.rs
@@ -108,6 +108,9 @@ mod units_infer;
 mod variable;
 pub mod vdf;
 mod vm;
+// Bytecode-composition profiling for CompiledSimulation; a diagnostics-only
+// sibling of `vm` kept separate purely for the per-file line cap.
+mod vm_profile;
 mod vm_vector_elm_map;
 mod vm_vector_sort_order;
 pub mod xmile;
@@ -123,6 +126,7 @@ pub use self::variable::{
     DepClassification, Variable, classify_dependencies, identifier_set, previous_referenced_idents,
 };
 pub use self::vm::{CompiledSimulation, Vm};
+pub use self::vm_profile::BytecodeProfile;
 
 // Re-export compat functions at the crate root for convenience
 #[cfg(feature = "xmutil")]
diff --git a/src/simlin-engine/src/vm.rs b/src/simlin-engine/src/vm.rs
index e7d29cb16..58bb75386 100644
--- a/src/simlin-engine/src/vm.rs
+++ b/src/simlin-engine/src/vm.rs
@@ -9,8 +9,8 @@ use smallvec::SmallVec;
 
 use crate::alloc::allocate_available;
 use crate::bytecode::{
-    BuiltinId, ByteCode, ByteCodeContext, CompiledInitial, CompiledModule, DimId, LookupMode,
-    ModuleId, Op2, Opcode, RuntimeView, STACK_CAPACITY, TempId,
+    BuiltinId, ByteCode, ByteCodeContext, CompiledInitial, CompiledModule, DimId, LookupMode, Op2,
+    Opcode, RuntimeView, STACK_CAPACITY, TempId,
 };
 use crate::common::{Canonical, Error, ErrorCode, ErrorKind, Ident, Result};
 use crate::dimensions::match_dimensions_two_pass;
@@ -169,22 +169,38 @@ impl CompiledSimulation {
     }
 }
 
-/// Per-module compiled initials with the shared ByteCodeContext needed to eval them.
+/// One unique compiled module (a distinct `(model_name, input_set)`), holding
+/// its three phase programs plus the resolved child-module indices for its
+/// `EvalModule` opcodes.
+///
+/// `child_targets[decl_id]` is the index into `CompiledSlicedSimulation.modules`
+/// of the module that `context.modules[decl_id]` instantiates. Resolving these
+/// once at `Vm::new` lets the `EvalModule` opcode do a plain array index in the
+/// hot loop instead of cloning a `(String, BTreeSet<String>)` key and SipHashing
+/// it for a `HashMap` lookup on every module evaluation, every timestep.
 #[cfg_attr(feature = "debug-derive", derive(Debug))]
 #[derive(Clone)]
-struct CompiledModuleInitials {
+struct ResolvedModule {
     #[allow(dead_code)]
     ident: Ident<Canonical>,
     context: Arc<ByteCodeContext>,
     initials: Arc<Vec<CompiledInitial>>,
+    flows: Arc<ByteCode>,
+    stocks: Arc<ByteCode>,
+    child_targets: Vec<u32>,
 }
 
 #[cfg_attr(feature = "debug-derive", derive(Debug))]
 #[derive(Clone)]
 struct CompiledSlicedSimulation {
-    initial_modules: HashMap<ModuleKey, CompiledModuleInitials>,
-    flow_modules: HashMap<ModuleKey, CompiledModuleSlice>,
-    stock_modules: HashMap<ModuleKey, CompiledModuleSlice>,
+    /// All unique compiled modules, indexed by the integer ids stored in
+    /// `child_targets` (and `root_idx`).
+    modules: Vec<ResolvedModule>,
+    root_idx: usize,
+    /// `ModuleKey` -> module index. Used only by the cold `set_value` /
+    /// `clear_values` literal-override paths (which still address modules by
+    /// key via `BytecodeLocation`); never consulted in the hot eval loop.
+    key_to_idx: HashMap<ModuleKey, u32>,
 }
 
 #[cfg_attr(feature = "debug-derive", derive(Debug))]
@@ -209,7 +225,6 @@ fn borrow_two(buf: &mut [f64], n_slots: usize, a: usize, b: usize) -> (&mut [f64
 #[derive(Clone)]
 pub struct Vm {
     specs: Specs,
-    root: ModuleKey,
     offsets: HashMap<Ident<Canonical>, usize>,
     sliced_sim: CompiledSlicedSimulation,
     n_slots: usize,
@@ -338,27 +353,65 @@ struct EvalState<'a> {
     use_prev_fallback: bool,
 }
 
-#[cfg_attr(feature = "debug-derive", derive(Debug))]
-#[derive(Clone)]
-struct CompiledModuleSlice {
-    #[allow(dead_code)]
-    ident: Ident<Canonical>,
-    context: Arc<ByteCodeContext>,
-    bytecode: Arc<ByteCode>,
-    part: StepPart,
-}
+impl CompiledSlicedSimulation {
+    /// Build the indexed module table from the keyed `CompiledModule` map,
+    /// resolving every module declaration's `(model_name, input_set)` key to a
+    /// child index so the hot eval loop never reconstructs or hashes a key.
+    fn build(modules: &HashMap<ModuleKey, CompiledModule>, root: &ModuleKey) -> Self {
+        // Stable, deterministic ordering so module indices don't depend on
+        // HashMap iteration order.
+        let mut keys: Vec<&ModuleKey> = modules.keys().collect();
+        keys.sort();
+
+        let key_to_idx: HashMap<ModuleKey, u32> = keys
+            .iter()
+            .enumerate()
+            .map(|(idx, key)| ((*key).clone(), idx as u32))
+            .collect();
 
-impl CompiledModuleSlice {
-    fn new(module: &CompiledModule, part: StepPart) -> Self {
-        CompiledModuleSlice {
-            ident: module.ident.clone(),
-            context: module.context.clone(),
-            bytecode: match part {
-                StepPart::Flows => module.compiled_flows.clone(),
-                StepPart::Stocks => module.compiled_stocks.clone(),
-                StepPart::Initials => unreachable!("initials use CompiledModuleInitials"),
-            },
-            part,
+        let resolved: Vec<ResolvedModule> = keys
+            .iter()
+            .map(|key| {
+                let m = &modules[*key];
+                // Resolve each child declaration's key to its module index.
+                let child_targets: Vec<u32> = m
+                    .context
+                    .modules
+                    .iter()
+                    .map(|decl| {
+                        let child_key = make_module_key(&decl.model_name, &decl.input_set);
+                        key_to_idx[&child_key]
+                    })
+                    .collect();
+                // 3-address fusion (R2): fold leaf operand loads into the
+                // binary ops of the per-timestep flows/stocks programs. Done
+                // on the Vm's execution copy (not the cached CompiledModule,
+                // which stays a pure symbolizable artifact) so the fused
+                // opcodes never re-enter the symbolic layer. make_mut clones
+                // the bytecode out of the shared Arc once per Vm; the scan is
+                // linear and cheap relative to a simulation run. Initials run
+                // once and their AssignCurr targets are read elsewhere, so they
+                // are left unfused.
+                let mut flows = m.compiled_flows.clone();
+                let mut stocks = m.compiled_stocks.clone();
+                Arc::make_mut(&mut flows).fuse_three_address();
+                Arc::make_mut(&mut stocks).fuse_three_address();
+                ResolvedModule {
+                    ident: m.ident.clone(),
+                    context: m.context.clone(),
+                    initials: m.compiled_initials.clone(),
+                    flows,
+                    stocks,
+                    child_targets,
+                }
+            })
+            .collect();
+
+        let root_idx = key_to_idx[root] as usize;
+        CompiledSlicedSimulation {
+            modules: resolved,
+            root_idx,
+            key_to_idx,
         }
     }
 }
@@ -541,36 +594,12 @@ impl Vm {
         };
         let rk_scratch = vec![0.0; stock_offsets.len() * 2];
 
+        let sliced_sim = CompiledSlicedSimulation::build(&sim.modules, &sim.root);
+
         Ok(Vm {
             specs: sim.specs,
-            root: sim.root,
             offsets: sim.offsets,
-            sliced_sim: CompiledSlicedSimulation {
-                initial_modules: sim
-                    .modules
-                    .iter()
-                    .map(|(id, m)| {
-                        (
-                            id.clone(),
-                            CompiledModuleInitials {
-                                ident: m.ident.clone(),
-                                context: m.context.clone(),
-                                initials: m.compiled_initials.clone(),
-                            },
-                        )
-                    })
-                    .collect(),
-                flow_modules: sim
-                    .modules
-                    .iter()
-                    .map(|(id, m)| (id.clone(), CompiledModuleSlice::new(m, StepPart::Flows)))
-                    .collect(),
-                stock_modules: sim
-                    .modules
-                    .iter()
-                    .map(|(id, m)| (id.clone(), CompiledModuleSlice::new(m, StepPart::Stocks)))
-                    .collect(),
-            },
+            sliced_sim,
             n_slots,
             n_chunks,
             data: Some(data),
@@ -613,8 +642,7 @@ impl Vm {
         self.stack.clear();
         let mut data = self.data.take().unwrap();
 
-        let module_flows = &self.sliced_sim.flow_modules[&self.root];
-        let module_stocks = &self.sliced_sim.stock_modules[&self.root];
+        let root_idx = self.sliced_sim.root_idx;
 
         self.view_stack.clear();
         self.iter_stack.clear();
@@ -673,14 +701,7 @@ impl Vm {
                     break;
                 }
 
-                Self::eval_step(
-                    &self.sliced_sim,
-                    &mut state,
-                    module_flows,
-                    module_stocks,
-                    curr,
-                    next,
-                );
+                Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next);
                 state.prev_values.copy_from_slice(curr);
                 state.use_prev_fallback = false;
                 self.prev_values_valid = true;
@@ -699,14 +720,7 @@ impl Vm {
                     let saved_time = curr[TIME_OFF];
 
                     // Stage 1: evaluate at (t, y)
-                    Self::eval_step(
-                        &self.sliced_sim,
-                        &mut state,
-                        module_flows,
-                        module_stocks,
-                        curr,
-                        next,
-                    );
+                    Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next);
                     for (i, &off) in stock_offsets.iter().enumerate() {
                         let s1 = next[off] - curr[off];
                         saved[i] = curr[off];
@@ -716,14 +730,7 @@ impl Vm {
                     curr[TIME_OFF] = saved_time + dt * 0.5;
 
                     // Stage 2: evaluate at (t + dt/2, y + s1/2)
-                    Self::eval_step(
-                        &self.sliced_sim,
-                        &mut state,
-                        module_flows,
-                        module_stocks,
-                        curr,
-                        next,
-                    );
+                    Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next);
                     for (i, &off) in stock_offsets.iter().enumerate() {
                         let s2 = next[off] - curr[off];
                         accum[i] += 2.0 * s2;
@@ -731,14 +738,7 @@ impl Vm {
                     }
 
                     // Stage 3: evaluate at (t + dt/2, y + s2/2)
-                    Self::eval_step(
-                        &self.sliced_sim,
-                        &mut state,
-                        module_flows,
-                        module_stocks,
-                        curr,
-                        next,
-                    );
+                    Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next);
                     for (i, &off) in stock_offsets.iter().enumerate() {
                         let s3 = next[off] - curr[off];
                         accum[i] += 2.0 * s3;
@@ -747,14 +747,7 @@ impl Vm {
                     curr[TIME_OFF] = saved_time + dt;
 
                     // Stage 4: evaluate at (t + dt, y + s3)
-                    Self::eval_step(
-                        &self.sliced_sim,
-                        &mut state,
-                        module_flows,
-                        module_stocks,
-                        curr,
-                        next,
-                    );
+                    Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next);
                     for (i, &off) in stock_offsets.iter().enumerate() {
                         let s4 = next[off] - curr[off];
                         accum[i] += s4;
@@ -776,7 +769,8 @@ impl Vm {
                     Self::eval(
                         &self.sliced_sim,
                         &mut state,
-                        module_flows,
+                        root_idx,
+                        StepPart::Flows,
                         0,
                         &[],
                         curr,
@@ -802,14 +796,7 @@ impl Vm {
                     let saved_time = curr[TIME_OFF];
 
                     // Stage 1: evaluate at (t, y)
-                    Self::eval_step(
-                        &self.sliced_sim,
-                        &mut state,
-                        module_flows,
-                        module_stocks,
-                        curr,
-                        next,
-                    );
+                    Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next);
                     for (i, &off) in stock_offsets.iter().enumerate() {
                         let s1 = next[off] - curr[off];
                         saved[i] = curr[off];
@@ -819,14 +806,7 @@ impl Vm {
                     curr[TIME_OFF] = saved_time + dt;
 
                     // Stage 2: evaluate at (t + dt, y + s1)
-                    Self::eval_step(
-                        &self.sliced_sim,
-                        &mut state,
-                        module_flows,
-                        module_stocks,
-                        curr,
-                        next,
-                    );
+                    Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next);
                     for (i, &off) in stock_offsets.iter().enumerate() {
                         let s2 = next[off] - curr[off];
                         accum[i] += s2;
@@ -842,7 +822,8 @@ impl Vm {
                     Self::eval(
                         &self.sliced_sim,
                         &mut state,
-                        module_flows,
+                        root_idx,
+                        StepPart::Flows,
                         0,
                         &[],
                         curr,
@@ -914,6 +895,18 @@ impl Vm {
         self.constant_info.contains_key(&off)
     }
 
+    /// Look up the module index for a `ModuleKey` (as carried by a
+    /// `BytecodeLocation` from the constant-info map). Panics if the key is
+    /// unknown. Cold path only: it serves the `set_value` / `clear_values`
+    /// literal-override machinery and never the hot eval loop.
+    fn module_idx_for(&self, module_key: &ModuleKey) -> usize {
+        let table = &self.sliced_sim.key_to_idx;
+        match table.get(module_key) {
+            Some(&idx) => idx as usize,
+            None => panic!("module key must exist"),
+        }
+    }
+
     /// Read the current value of a literal at a bytecode location.
     fn read_literal(&self, loc: &BytecodeLocation) -> f64 {
         match loc {
@@ -922,32 +915,21 @@ impl Vm {
                 part,
                 literal_id,
             } => {
-                let module = match part {
-                    StepPart::Flows => self
-                        .sliced_sim
-                        .flow_modules
-                        .get(module_key)
-                        .expect("module key must exist"),
-                    StepPart::Stocks => self
-                        .sliced_sim
-                        .stock_modules
-                        .get(module_key)
-                        .expect("module key must exist"),
+                let module = &self.sliced_sim.modules[self.module_idx_for(module_key)];
+                let bytecode = match part {
+                    StepPart::Flows => &module.flows,
+                    StepPart::Stocks => &module.stocks,
                     StepPart::Initials => unreachable!(),
                 };
-                module.bytecode.literals[*literal_id as usize]
+                bytecode.literals[*literal_id as usize]
             }
             BytecodeLocation::Initial {
                 module_key,
                 initial_index,
                 literal_id,
             } => {
-                let initials_module = self
-                    .sliced_sim
-                    .initial_modules
-                    .get(module_key)
-                    .expect("module key must exist");
-                initials_module.initials[*initial_index].bytecode.literals[*literal_id as usize]
+                let module = &self.sliced_sim.modules[self.module_idx_for(module_key)];
+                module.initials[*initial_index].bytecode.literals[*literal_id as usize]
             }
         }
     }
@@ -961,32 +943,23 @@ impl Vm {
                 part,
                 literal_id,
             } => {
-                let module = match part {
-                    StepPart::Flows => self
-                        .sliced_sim
-                        .flow_modules
-                        .get_mut(module_key)
-                        .expect("module key must exist"),
-                    StepPart::Stocks => self
-                        .sliced_sim
-                        .stock_modules
-                        .get_mut(module_key)
-                        .expect("module key must exist"),
+                let idx = self.module_idx_for(module_key);
+                let module = &mut self.sliced_sim.modules[idx];
+                let bytecode = match part {
+                    StepPart::Flows => &mut module.flows,
+                    StepPart::Stocks => &mut module.stocks,
                     StepPart::Initials => unreachable!(),
                 };
-                Arc::make_mut(&mut module.bytecode).literals[*literal_id as usize] = value;
+                Arc::make_mut(bytecode).literals[*literal_id as usize] = value;
             }
             BytecodeLocation::Initial {
                 module_key,
                 initial_index,
                 literal_id,
             } => {
-                let initials_module = self
-                    .sliced_sim
-                    .initial_modules
-                    .get_mut(module_key)
-                    .expect("module key must exist");
-                let initials = Arc::make_mut(&mut initials_module.initials);
+                let idx = self.module_idx_for(module_key);
+                let module = &mut self.sliced_sim.modules[idx];
+                let initials = Arc::make_mut(&mut module.initials);
                 initials[*initial_index].bytecode.literals[*literal_id as usize] = value;
             }
         }
@@ -1130,7 +1103,7 @@ impl Vm {
         Self::eval_initials(
             &self.sliced_sim,
             &mut state,
-            &self.root,
+            self.sliced_sim.root_idx,
             0,
             module_inputs,
             curr,
@@ -1186,48 +1159,20 @@ impl Vm {
         Some(series)
     }
 
-    /// Evaluate a submodule's initials.
-    #[allow(clippy::too_many_arguments)]
-    #[inline(never)]
-    fn eval_module_initials(
-        sliced_sim: &CompiledSlicedSimulation,
-        state: &mut EvalState<'_>,
-        parent_context: &ByteCodeContext,
-        parent_module_off: usize,
-        module_inputs: &[f64],
-        curr: &mut [f64],
-        next: &mut [f64],
-        id: ModuleId,
-    ) {
-        let new_module_decl = &parent_context.modules[id as usize];
-        let module_key = make_module_key(&new_module_decl.model_name, &new_module_decl.input_set);
-        let module_off = parent_module_off + new_module_decl.off;
-
-        Self::eval_initials(
-            sliced_sim,
-            state,
-            &module_key,
-            module_off,
-            module_inputs,
-            curr,
-            next,
-        );
-    }
-
     /// Run all per-variable initials for a module (in dependency order).
     #[allow(clippy::too_many_arguments)]
     fn eval_initials(
         sliced_sim: &CompiledSlicedSimulation,
         state: &mut EvalState<'_>,
-        module_key: &ModuleKey,
+        module_idx: usize,
         module_off: usize,
         module_inputs: &[f64],
         curr: &mut [f64],
         next: &mut [f64],
     ) {
-        let module_initials = &sliced_sim.initial_modules[module_key];
-        let context = &module_initials.context;
-        for compiled_initial in module_initials.initials.iter() {
+        let module = &sliced_sim.modules[module_idx];
+        let context = &module.context;
+        for compiled_initial in module.initials.iter() {
             Self::eval_bytecode(
                 sliced_sim,
                 state,
@@ -1235,6 +1180,7 @@ impl Vm {
                 &compiled_initial.bytecode,
                 StepPart::Initials,
                 module_off,
+                module_idx,
                 module_inputs,
                 curr,
                 next,
@@ -1244,17 +1190,35 @@ impl Vm {
 
     /// Evaluate one full integration step: compute all flows/auxes then
     /// update all stocks.  Used by each RK stage and the Euler loop.
+    /// Always evaluates the root module (`module_off == 0`).
     #[inline(always)]
     fn eval_step(
         sliced_sim: &CompiledSlicedSimulation,
         state: &mut EvalState<'_>,
-        module_flows: &CompiledModuleSlice,
-        module_stocks: &CompiledModuleSlice,
+        module_idx: usize,
         curr: &mut [f64],
         next: &mut [f64],
     ) {
-        Self::eval(sliced_sim, state, module_flows, 0, &[], curr, next);
-        Self::eval(sliced_sim, state, module_stocks, 0, &[], curr, next);
+        Self::eval(
+            sliced_sim,
+            state,
+            module_idx,
+            StepPart::Flows,
+            0,
+            &[],
+            curr,
+            next,
+        );
+        Self::eval(
+            sliced_sim,
+            state,
+            module_idx,
+            StepPart::Stocks,
+            0,
+            &[],
+            curr,
+            next,
+        );
     }
 
     #[allow(clippy::too_many_arguments)]
@@ -1262,19 +1226,27 @@ impl Vm {
     fn eval(
         sliced_sim: &CompiledSlicedSimulation,
         state: &mut EvalState<'_>,
-        module: &CompiledModuleSlice,
+        module_idx: usize,
+        part: StepPart,
         module_off: usize,
         module_inputs: &[f64],
         curr: &mut [f64],
         next: &mut [f64],
     ) {
+        let module = &sliced_sim.modules[module_idx];
+        let bytecode = match part {
+            StepPart::Flows => &module.flows,
+            StepPart::Stocks => &module.stocks,
+            StepPart::Initials => unreachable!("initials are evaluated via eval_initials"),
+        };
         Self::eval_bytecode(
             sliced_sim,
             state,
             &module.context,
-            &module.bytecode,
-            module.part,
+            bytecode,
+            part,
             module_off,
+            module_idx,
             module_inputs,
             curr,
             next,
@@ -1289,6 +1261,11 @@ impl Vm {
         bytecode: &ByteCode,
         part: StepPart,
         module_off: usize,
+        // Index of the module currently executing, into
+        // `sliced_sim.modules`. Used to resolve `EvalModule` child targets
+        // without reconstructing/hashing a module key. `context` is
+        // `&sliced_sim.modules[module_idx].context`.
+        module_idx: usize,
         module_inputs: &[f64],
         curr: &mut [f64],
         next: &mut [f64],
@@ -1416,35 +1393,29 @@ impl Vm {
                         prev_values,
                         use_prev_fallback,
                     };
+                    // Resolve the child module by precomputed index instead of
+                    // reconstructing + SipHashing a (model_name, input_set) key.
+                    let child_module_off = module_off + context.modules[*id as usize].off;
+                    let child_idx =
+                        sliced_sim.modules[module_idx].child_targets[*id as usize] as usize;
                     match part {
                         StepPart::Initials => {
-                            Self::eval_module_initials(
+                            Self::eval_initials(
                                 sliced_sim,
                                 &mut child_state,
-                                context,
-                                module_off,
+                                child_idx,
+                                child_module_off,
                                 &module_inputs,
                                 curr,
                                 next,
-                                *id,
                             );
                         }
                         StepPart::Flows | StepPart::Stocks => {
-                            let new_module_decl = &context.modules[*id as usize];
-                            let module_key = make_module_key(
-                                &new_module_decl.model_name,
-                                &new_module_decl.input_set,
-                            );
-                            let child_module_off = module_off + new_module_decl.off;
-                            let child_module = match part {
-                                StepPart::Flows => &sliced_sim.flow_modules[&module_key],
-                                StepPart::Stocks => &sliced_sim.stock_modules[&module_key],
-                                StepPart::Initials => unreachable!(),
-                            };
                             Self::eval(
                                 sliced_sim,
                                 &mut child_state,
-                                child_module,
+                                child_idx,
+                                part,
                                 child_module_off,
                                 &module_inputs,
                                 curr,
@@ -1495,6 +1466,35 @@ impl Vm {
                     next[module_off + *off as usize] = eval_op2(*op, l, r);
                     debug_assert_eq!(0, stack.len());
                 }
+                // === 3-ADDRESS BINARY OPS (R2) ===
+                // Operands are read straight from curr[]/literals; the *Stack*
+                // forms take the lhs from the arithmetic stack. Each pushes the
+                // result, replacing a Load;Load;Op2 or Load;Op2 sequence.
+                Opcode::BinVarVar { l, r, op } => {
+                    let lv = curr[module_off + *l as usize];
+                    let rv = curr[module_off + *r as usize];
+                    stack.push(eval_op2(*op, lv, rv));
+                }
+                Opcode::BinVarConst { l, r, op } => {
+                    let lv = curr[module_off + *l as usize];
+                    let rv = bytecode.literals[*r as usize];
+                    stack.push(eval_op2(*op, lv, rv));
+                }
+                Opcode::BinConstVar { l, r, op } => {
+                    let lv = bytecode.literals[*l as usize];
+                    let rv = curr[module_off + *r as usize];
+                    stack.push(eval_op2(*op, lv, rv));
+                }
+                Opcode::BinStackVar { r, op } => {
+                    let lv = stack.pop();
+                    let rv = curr[module_off + *r as usize];
+                    stack.push(eval_op2(*op, lv, rv));
+                }
+                Opcode::BinStackConst { r, op } => {
+                    let lv = stack.pop();
+                    let rv = bytecode.literals[*r as usize];
+                    stack.push(eval_op2(*op, lv, rv));
+                }
                 Opcode::Apply { func } => {
                     let time = curr[TIME_OFF];
                     let dt = curr[DT_OFF];
@@ -2679,16 +2679,22 @@ impl Vm {
 
     #[cfg(test)]
     pub fn debug_print_bytecode(&self, _model_name: &str) {
-        let mut module_keys: Vec<_> = self.sliced_sim.initial_modules.keys().collect();
-        module_keys.sort_unstable();
-        for module_key in module_keys {
+        // Iterate modules in key order for stable, readable output.
+        let mut keyed: Vec<(&ModuleKey, usize)> = self
+            .sliced_sim
+            .key_to_idx
+            .iter()
+            .map(|(k, &idx)| (k, idx as usize))
+            .collect();
+        keyed.sort_unstable_by(|a, b| a.0.cmp(b.0));
+        for (module_key, idx) in keyed {
             eprintln!("\n\nCOMPILED MODULE: {:?}", module_key);
 
-            let module_initials = &self.sliced_sim.initial_modules[module_key];
-            let flows_bc = &self.sliced_sim.flow_modules[module_key].bytecode;
-            let stocks_bc = &self.sliced_sim.stock_modules[module_key].bytecode;
+            let module = &self.sliced_sim.modules[idx];
+            let flows_bc = &module.flows;
+            let stocks_bc = &module.stocks;
 
-            for ci in module_initials.initials.iter() {
+            for ci in module.initials.iter() {
                 eprintln!("\ninitial '{}' literals:", ci.ident);
                 for (i, lit) in ci.bytecode.literals.iter().enumerate() {
                     eprintln!("\t{i}: {lit}");
@@ -3457,6 +3463,48 @@ mod vm_reset_and_run_initials_tests {
             .expect("incremental compile should succeed")
     }
 
+    /// Regression test for the 3-address fusion (R2), which rewrites the Vm's
+    /// flow/stock bytecode when the Vm is constructed. Every equation uses a
+    /// non-commutative operator (`-` or `/`), so a fused form that swaps its
+    /// operands produces a visibly wrong value instead of a silent miscompile.
+    /// `a`, `b` and `c` are distinct variables (not foldable into a literal),
+    /// so each equation compiles to loads + Op2, which the pass folds into
+    /// BinVarVar / BinVarConst / BinConstVar / BinStackVar / BinStackConst.
+    #[test]
+    fn test_fused_binops_preserve_operand_order() {
+        let project = TestProject::new("fusion_order")
+            .with_sim_time(0.0, 1.0, 1.0)
+            .aux("a", "20", None)
+            .aux("b", "5", None)
+            .aux("c", "2", None)
+            .aux("vv", "a - b", None) // BinVarVar
+            .aux("dvv", "a / b", None) // BinVarVar, division
+            .aux("vc", "a - 3", None) // BinVarConst
+            .aux("cv", "10 - a", None) // BinConstVar
+            .aux("sv", "(a - b) - c", None) // BinVarVar then BinStackVar
+            .aux("sc", "(a - b) - 4", None); // BinVarVar then BinStackConst
+
+        let mut vm = Vm::new(build_compiled(&project)).unwrap();
+        vm.run_to_end().unwrap();
+        let results = vm.into_results();
+
+        let expectations = [
+            ("vv", 15.0, "a - b"),
+            ("dvv", 4.0, "a / b"),
+            ("vc", 17.0, "a - 3"),
+            ("cv", -10.0, "10 - a"),
+            ("sv", 13.0, "(a - b) - c"),
+            ("sc", 11.0, "(a - b) - 4"),
+        ];
+        for (name, want, equation) in expectations {
+            let off = *results
+                .offsets
+                .get(&*canonicalize(name))
+                .unwrap_or_else(|| panic!("missing {name}"));
+            assert_eq!(results.data[off], want, "{equation}"); // step 0
+        }
+    }
+
     #[test]
     fn test_vm_reset_produces_identical_results() {
         let tp = pop_model();
@@ -4435,13 +4483,13 @@ mod superinstruction_tests {
 
     /// Helper: collect all opcodes from the flow bytecode of the root module.
     fn flow_opcodes(vm: &Vm) -> Vec<&Opcode> {
-        let bc = &vm.sliced_sim.flow_modules[&vm.root].bytecode;
+        let bc = &vm.sliced_sim.modules[vm.sliced_sim.root_idx].flows;
         bc.code.iter().collect()
     }
 
     /// Helper: collect all opcodes from the stock bytecode of the root module.
     fn stock_opcodes(vm: &Vm) -> Vec<&Opcode> {
-        let bc = &vm.sliced_sim.stock_modules[&vm.root].bytecode;
+        let bc = &vm.sliced_sim.modules[vm.sliced_sim.root_idx].stocks;
         bc.code.iter().collect()
     }
 
diff --git a/src/simlin-engine/src/vm_profile.rs b/src/simlin-engine/src/vm_profile.rs
new file mode 100644
index 000000000..7674072d2
--- /dev/null
+++ b/src/simlin-engine/src/vm_profile.rs
@@ -0,0 +1,86 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+//! Bytecode composition profiling for a compiled simulation.
+//!
+//! A diagnostics-only sibling of `vm.rs` (kept here purely for the per-file line
+//! cap): `CompiledSimulation::bytecode_profile` answers "how big and what shape
+//! is the compiled bytecode?" for the `clearn_profile` example and similar
+//! analysis, without exposing the private `Opcode` type.
+
+use std::collections::BTreeMap;
+
+use crate::bytecode::ByteCode;
+use crate::vm::CompiledSimulation;
+
+impl CompiledSimulation {
+    /// Build an aggregate profile of the compiled bytecode: per-phase opcode
+    /// counts, an opcode-name histogram, and the sizes of each module's tables.
+    pub fn bytecode_profile(&self) -> BytecodeProfile {
+        let mut profile = BytecodeProfile {
+            n_modules: self.modules.len(),
+            n_slots_root: self.n_slots(),
+            ..Default::default()
+        };
+
+        // Adds one bytecode block to the running totals; yields its opcode count.
+        let mut count_ops = |block: &ByteCode, seen: &mut BTreeMap<&'static str, usize>| {
+            for opcode in block.code.iter() {
+                *seen.entry(opcode.name()).or_default() += 1;
+            }
+            profile.total_literals += block.literals.len();
+            block.code.len()
+        };
+
+        for module in self.modules.values() {
+            // Opcode tallies, one phase at a time.
+            profile.flow_opcodes += count_ops(&module.compiled_flows, &mut profile.histogram);
+            profile.flow_opcodes_after_fusion += module.compiled_flows.estimate_fused_len();
+            profile.stock_opcodes += count_ops(&module.compiled_stocks, &mut profile.histogram);
+            for initial in module.compiled_initials.iter() {
+                profile.n_initials += 1;
+                profile.initial_opcodes += count_ops(&initial.bytecode, &mut profile.histogram);
+            }
+
+            let ctx = &module.context;
+            let gf_points: usize = ctx.graphical_functions.iter().map(|gf| gf.len()).sum();
+            profile.graphical_functions += ctx.graphical_functions.len();
+            profile.graphical_function_points += gf_points;
+            profile.temp_storage_slots += ctx.temp_total_size;
+            profile.dimensions += ctx.dimensions.len();
+            profile.static_views += ctx.static_views.len();
+            profile.dim_lists += ctx.dim_lists.len();
+            profile.names += ctx.names.len();
+        }
+
+        profile.total_opcodes =
+            profile.flow_opcodes + profile.stock_opcodes + profile.initial_opcodes;
+        profile
+    }
+}
+
+/// Aggregate composition of a compiled simulation's bytecode and side tables.
+/// Produced by [`CompiledSimulation::bytecode_profile`]. `histogram` maps each
+/// opcode variant name to its occurrence count across all modules and phases.
+#[derive(Debug, Default, Clone)]
+pub struct BytecodeProfile {
+    pub n_modules: usize,
+    pub n_slots_root: usize, // value of `CompiledSimulation::n_slots()`
+    pub total_opcodes: usize, // flow_opcodes + stock_opcodes + initial_opcodes
+    pub flow_opcodes: usize,
+    /// Estimated flow opcode count after a 3-address fusion pass (R2 sizing).
+    pub flow_opcodes_after_fusion: usize,
+    pub stock_opcodes: usize,
+    pub initial_opcodes: usize,
+    pub n_initials: usize,
+    pub total_literals: usize, // literals across flow, stock and initial bytecode
+    pub graphical_functions: usize,
+    pub graphical_function_points: usize, // sum of every graphical function's `len()`
+    pub temp_storage_slots: usize, // sum of each module context's `temp_total_size`
+    pub dimensions: usize,
+    pub static_views: usize,
+    pub dim_lists: usize,
+    pub names: usize,
+    pub histogram: BTreeMap<&'static str, usize>,
+}
diff --git a/src/simlin-mcp/Cargo.toml b/src/simlin-mcp/Cargo.toml
index 574369189..c68722d19 100644
--- a/src/simlin-mcp/Cargo.toml
+++ b/src/simlin-mcp/Cargo.toml
@@ -18,6 +18,7 @@ path = "src/main.rs"
 simlin-engine = { version = "0.1", path = "../simlin-engine", features = ["schema"] }
 simlin-mcp-core = { version = "0.1", path = "../simlin-mcp-core" }
 rmcp = { version = "1", features = ["server", "macros", "transport-io"] }
+mimalloc = "0.1"
 serde_json = "1"
 anyhow = "1"
 tokio = { version = "1", features = ["macros", "rt-multi-thread", "fs"] }
diff --git a/src/simlin-mcp/src/main.rs b/src/simlin-mcp/src/main.rs
index 3f06724e9..0c6241110 100644
--- a/src/simlin-mcp/src/main.rs
+++ b/src/simlin-mcp/src/main.rs
@@ -19,6 +19,12 @@
 //! simlin-mcp --version    # print version
 //! ```
 
+// mimalloc on native builds: the engine compile path is allocation-heavy
+// (millions of small, short-lived allocations); mimalloc roughly halves the
+// allocator time vs the system malloc. See docs/design/engine-performance.md.
+#[global_allocator]
+static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
 use rmcp::{ServiceExt, transport::stdio};
 use simlin_mcp::access::FileSystemAccess;
 use simlin_mcp_core::server::{ResourceContent, SimlinMcpServer};
diff --git a/src/simlin-serve/Cargo.toml b/src/simlin-serve/Cargo.toml
index 6c2d8b61a..ebd7cdff4 100644
--- a/src/simlin-serve/Cargo.toml
+++ b/src/simlin-serve/Cargo.toml
@@ -15,6 +15,7 @@ name = "simlin-serve"
 path = "src/main.rs"
 
 [dependencies]
+mimalloc = "0.1"
 tokio = { version = "1", features = ["full"] }
 axum = { version = "0.8", features = ["ws"] }
 tower-http = { version = "0.6", features = ["limit", "trace"] }
diff --git a/src/simlin-serve/src/main.rs b/src/simlin-serve/src/main.rs
index 1600a9a6d..c38c7b9c3 100644
--- a/src/simlin-serve/src/main.rs
+++ b/src/simlin-serve/src/main.rs
@@ -4,6 +4,13 @@
 
 #![deny(unsafe_code)]
 
+// mimalloc on native builds: the engine compile path is allocation-heavy
+// (millions of small, short-lived allocations); mimalloc roughly halves the
+// allocator time vs the system malloc. See docs/design/engine-performance.md.
+// `#[global_allocator]` is a safe item, so it stands under `deny(unsafe_code)`.
+#[global_allocator]
+static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
 use std::sync::Arc;
 
 use tracing_subscriber::EnvFilter;