diff --git a/CLAUDE.md b/CLAUDE.md
index 6a38b1dea..095264d43 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -95,6 +95,7 @@ IMPORTANT: If feedback seems non-actionable, it means you need comments explaini
 - Public Rust items and non-trivial internal functions should have concise rustdoc describing purpose, key assumptions, and side effects.
 - When behavior changes, update nearby comments in the same commit so docs and code stay aligned.
 - If you intentionally remove a comment block, replace it with an updated equivalent when the context is still non-obvious.
+- NEVER add a "Last updated" (or "Last verified") line to a `CLAUDE.md`: it is a perpetual rebase/merge-conflict magnet and goes stale immediately. Describe current state in prose; rely on `git log` / `git blame` for history.
 
 ## Development Standards
 
diff --git a/Cargo.lock b/Cargo.lock
index de6d8942e..84f27515e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -414,6 +414,16 @@ dependencies = [
  "rand_core 0.10.1",
 ]
 
+[[package]]
+name = "checked"
+version = "0.2.0"
+source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a"
+dependencies = [
+ "interop",
+ "linker",
+ "wasm-interpreter",
+]
+
 [[package]]
 name = "chrono"
 version = "0.4.44"
@@ -1864,6 +1874,14 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "interop"
+version = "0.2.0"
+source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a"
+dependencies = [
+ "wasm-interpreter",
+]
+
 [[package]]
 name = "intrusive-collections"
 version = "0.9.7"
@@ -2072,9 +2090,9 @@ checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
 
 [[package]]
 name = "libm"
-version = "0.2.16"
+version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
 
 [[package]]
 name = "libmimalloc-sys"
@@ -2085,6 +2103,14 @@ dependencies = [
  "cc",
 ]
 
+[[package]]
+name = "linker"
+version = "0.2.0"
+source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a"
+dependencies = [
+ "wasm-interpreter",
+]
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.12.1"
@@ -2112,6 +2138,11 @@ version = "0.4.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
 
+[[package]]
+name = "log_wrapper"
+version = "0.1.0"
+source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a"
+
 [[package]]
 name = "loom"
 version = "0.7.2"
@@ -3786,11 +3817,13 @@ name = "simlin"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "checked",
  "mimalloc",
  "prost",
  "serde",
  "serde_json",
  "simlin-engine",
+ "wasm-interpreter",
 ]
 
 [[package]]
@@ -3812,6 +3845,7 @@ dependencies = [
  "base64",
  "bumpalo",
  "calamine",
+ "checked",
  "criterion",
  "csv",
  "ed25519",
@@ -3836,6 +3870,8 @@ dependencies = [
  "tempfile",
  "test-generator",
  "unicode-xid",
+ "wasm-encoder",
+ "wasm-interpreter",
  "xmutil",
 ]
 
@@ -4783,6 +4819,15 @@ dependencies = [
  "wasmparser",
 ]
 
+[[package]]
+name = "wasm-interpreter"
+version = "0.2.0"
+source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a"
+dependencies = [
+ "libm",
+ "log_wrapper",
+]
+
 [[package]]
 name = "wasm-metadata"
 version = "0.244.0"
diff --git a/docs/README.md b/docs/README.md
index ea9715bb8..c0e7a573f 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -30,8 +30,10 @@
   - [design-plans/2026-05-11-ltm-arrays-hardening.md](design-plans/2026-05-11-ltm-arrays-hardening.md) -- Arrayed/cross-element LTM hardening: unify the reference-site walkers behind one classification IR (#520), then layer eight fixes (#487, #511, #510, #514, #515, #483, #502, #492)
   - [design-plans/2026-05-13-macros.md](design-plans/2026-05-13-macros.md) -- Vensim macro support: macros as a data-driven generalization of the stdlib module mechanism, persisted via a `MacroSpec` marker on `Model`; 7 implementation phases
   - [design-plans/2026-05-19-clearn-residual.md](design-plans/2026-05-19-clearn-residual.md) -- Close C-LEARN's residual (#590/#591) as general Vensim import/simulation primitives: arrayed inline graphical functions, import-time macro shadowing, user-macro INITIAL recurrence, residual attribution; 5 phases
+  - [design-plans/2026-05-20-wasm-backend.md](design-plans/2026-05-20-wasm-backend.md) -- WebAssembly code-generation backend: compile a model to one self-contained wasm module as an alternative to the bytecode VM (for fast interactive re-simulation), validated to full VM parity; 8 phases
 - [plans/](plans/README.md) -- Implementation plans (active and completed)
 - [test-plans/](test-plans/) -- Human verification plans for completed features
+  - [test-plans/2026-05-20-wasm-backend.md](test-plans/2026-05-20-wasm-backend.md) -- Manual verification for the WebAssembly simulation backend: the heavy `#[ignore]`d parity twins (C-LEARN vs `Ref.vdf`, WORLD3), driving the libsimlin FFI from a real host, and the AC3.3 deliberate-regression check (the bytecode VM is the automated oracle for everything else)
 - `implementation-plans/` -- Detailed phase-by-phase implementation plans, created during plan execution
 
 ## Security
diff --git a/docs/architecture.md b/docs/architecture.md
index 9772c6b9d..4693a7a61 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -14,6 +14,7 @@ Core simulation engine. Compiles, type-checks, unit-checks, and simulates SD mod
 - Primary compilation path is `db::compile_project_incremental()` using salsa tracked functions for fine-grained incrementality (`db.rs`, `db_analysis.rs`, `db_ltm.rs`, `db_ltm_ir.rs`)
 - Equation text is parsed via recursive descent parser (`parser/mod.rs`)
 - Simulations run on a stack-based bytecode VM (`vm.rs`) with `PREVIOUS`/`INIT` intrinsic opcodes
+- An alternative WebAssembly code-generation backend (`wasmgen/`) lowers a compiled model to one self-contained wasm module (no host imports) for fast repeated re-simulation; the VM stays the correctness oracle (every emitted module is checked against it). Surfaced through libsimlin `simlin_model_compile_to_wasm`
 - `builtins.rs` defines builtin functions (including `PREVIOUS`, `INIT`); stateful module functions (TREND, SMOOTH3) are model definitions in `stdlib/*.stmx`, generated into `stdlib.gen.rs`
 - Native Vensim MDL parser in `mdl/` (replaces C++ xmutil); see [docs/design/mdl-parser.md](/docs/design/mdl-parser.md)
 
diff --git a/docs/design-plans/2026-05-20-wasm-backend.md b/docs/design-plans/2026-05-20-wasm-backend.md
index 8cdb899f5..43f7cd420 100644
--- a/docs/design-plans/2026-05-20-wasm-backend.md
+++ b/docs/design-plans/2026-05-20-wasm-backend.md
@@ -125,7 +125,7 @@ Turn the validated proof-of-concept (branch `wasm-backend-poc`) into a full, cor
 
 ## Architecture
 
-The backend translates the engine's compiled bytecode into an equivalent WebAssembly module, mirroring the bytecode VM (`src/simlin-engine/src/vm.rs`) opcode-for-opcode. It consumes the public salsa output `compile_project_incremental(db, project, model) -> CompiledSimulation` (`vm.rs:134`) — the same value `Vm::new` consumes — so no salsa-internal queries are touched and all engine assembly (dependency ordering, model-global offset resolution, recurrence-SCC handling, graphical-function layout, module instantiation, implicit SMOOTH/DELAY variables) is inherited unchanged.
+The backend translates the engine's compiled bytecode into an equivalent WebAssembly module, mirroring the bytecode VM (`src/simlin-engine/src/vm.rs`) opcode-for-opcode. It consumes the public salsa output `compile_project_incremental(db, project, model) -> CompiledSimulation` (`db.rs:5886`, returning the `CompiledSimulation` defined at `vm.rs:134`) — the same value `Vm::new` consumes — so no salsa-internal queries are touched and all engine assembly (dependency ordering, model-global offset resolution, recurrence-SCC handling, graphical-function layout, module instantiation, implicit SMOOTH/DELAY variables) is inherited unchanged.
 
 `CompiledSimulation` is `{ modules: HashMap<ModuleKey, CompiledModule>, specs: Specs, root: ModuleKey, offsets: HashMap<Ident, usize> }`. Each `CompiledModule` (`bytecode.rs:4616`) holds three opcode programs (`compiled_initials`, `compiled_flows`, `compiled_stocks`), per-program `literals`, and a shared `ByteCodeContext` (`bytecode.rs:1585`: graphical-function tables, module declarations, dimensions, temp-array sizes, static array views). It is the *un-fused* form — the 3-address `fuse_three_address` pass runs later in `Vm::new` — so the backend translates the plain opcode set only.
 
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_01.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_01.md
new file mode 100644
index 000000000..e21a68276
--- /dev/null
+++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_01.md
@@ -0,0 +1,330 @@
+# WebAssembly Simulation Backend — Phase 1: Bytecode-to-wasm scalar core + parity harness
+
+**Goal:** Restructure the `wasmgen` proof-of-concept so it consumes the salsa-compiled bytecode (`CompiledSimulation`) instead of the monolithic `compiler::Module`/`Expr` IR, lower the scalar-core opcode set + the Euler integration loop to a self-contained wasm module, and stand up the dual VM-vs-wasm parity gate in `tests/simulate.rs`.
+
+**Architecture:** The bytecode VM (`src/simlin-engine/src/vm.rs`) is a stack machine over a flat f64 "slab" in linear memory; wasm is also a stack machine over linear memory, so each `Opcode` lowers to a short, mostly 1:1 wasm instruction sequence operating on the wasm operand stack. The backend walks the un-fused opcode programs of each `CompiledModule` (`compiled_initials`/`compiled_flows`/`compiled_stocks`) and emits three wasm functions, then a `run` function that seeds the reserved globals + initials and drives the Euler loop, writing step-major snapshots into a results region. The module exports `memory`, `run`, and three i32 geometry globals (`n_slots`, `n_chunks`, `results_offset`); a `WasmLayout` (variable-name→slot-offset map) is returned alongside the bytes for host-side by-name reads.
+
+**Tech Stack:** Rust; `wasm-encoder` 0.244 (module emission); the DLR-FT `wasm-interpreter` (`wasm::validate`) + `checked::Store` (host run) as the in-test execution oracle; the existing `compile_project_incremental` salsa pipeline; the `tests/simulate.rs` corpus harness.
+
+**Scope:** Phase 1 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`.
+
+**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`).
+
+---
+
+## Acceptance Criteria Coverage
+
+This phase implements and tests:
+
+### wasm-backend.AC1: The wasm backend reproduces the VM's simulation results
+- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` against the model's expected outputs at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.) *(Phase 1 covers scalar, Euler models; later phases widen the supported set.)*
+- **wasm-backend.AC1.4 Failure:** A model using a not-yet-supported construct returns `WasmGenError::Unsupported` — a clean error, never a panic or a silently wrong result.
+- **wasm-backend.AC1.5 Edge:** Empty-view reducers, out-of-bounds subscripts, and division-by-zero produce the same NaN / finite-`:NA:` / Inf values the VM produces. *(Phase 1 covers the division-by-zero portion — raw `Op2::Div`; the empty-reducer/OOB and finite-`:NA:`-vs-NaN portions complete in Phases 5 and 2.)*
+
+### wasm-backend.AC2: The backend consumes the salsa compiled bytecode
+- **wasm-backend.AC2.1 Success:** The wasm module is produced from `compile_project_incremental(...) -> CompiledSimulation`, not from the `Expr` IR or the monolithic `compiler::Module`.
+- **wasm-backend.AC2.2 Success:** The POC's `#[cfg(test)]` un-gating of the monolithic builder is reverted; the crate builds with `Module::new`/`build_metadata`/`calc_n_slots`/`calc_module_model_map` test-only again.
+
+### wasm-backend.AC3: simulate.rs runs the corpus through both backends
+- **wasm-backend.AC3.1 Success:** During rollout, each corpus model runs through the VM and (when supported) the wasm backend, comparing wasm-vs-VM; unsupported models are skipped (not failed) and counted against a monotonically rising floor.
+
+### wasm-backend.AC4: Self-describing results + efficient by-name retrieval
+- **wasm-backend.AC4.1 Success:** The blob exports `n_slots`/`n_chunks`/`results_offset` and writes step-major snapshots; a host locates and strides the results with no external metadata.
+
+### wasm-backend.AC7: Numeric-parity specifics
+- **wasm-backend.AC7.4 Success:** Euler, RK2, and RK4 each match the VM's saved samples (cadence and values); `PREVIOUS`/`INIT` match via the snapshot regions. *(Phase 1 establishes the Euler cadence/values portion only; RK2/RK4 + PREVIOUS/INIT complete this AC in Phase 4.)*
+
+### wasm-backend.AC8: Engineering quality (cross-cutting)
+- **wasm-backend.AC8.1 / AC8.2** are satisfied cross-cuttingly across every phase rather than headered per-phase: each functionality task is TDD with inline `#[cfg(test)]` unit tests that execute emitted wasm under the DLR-FT interpreter, each opcode/feature group is individually tested toward ≥95% coverage, and every phase ends with passing tests for the ACs it claims (its "Done When").
+
+---
+
+## Notes for the implementer (read first)
+
+- **The VM is the executable spec.** Every opcode's wasm lowering must reproduce the matching arm of `vm.rs`. Cite-and-mirror, do not invent. Key references confirmed during planning:
+  - The `Opcode` enum: `src/simlin-engine/src/bytecode.rs:561`. The scalar-core variants are `Op2 { op: Op2 }`, `Not {}`, `LoadConstant { id: LiteralId }`, `LoadVar { off: VariableOffset }`, `LoadGlobalVar { off: VariableOffset }`, `SetCond {}`, `If {}`, `AssignCurr { off }`, `AssignNext { off }`, `Ret`. (`LiteralId`/`VariableOffset` are `u16`.)
+  - `Op2` enum: `bytecode.rs:526` — `Add, Sub, Exp, Mul, Div, Mod, Gt, Gte, Lt, Lte, Eq, And, Or`. **There is no `Neq`** (the AST `Neq` lowers to `Eq` then `Not`). The VM's `eval_op2` is `vm.rs:94-111`.
+  - `is_truthy(n) = !crate::float::approx_eq(n, 0.0)` — `vm.rs:89`.
+  - The Euler loop and the `save_advance!` macro — `vm.rs:631-711` (Euler arm `vm.rs:698-711`; `save_advance!` `vm.rs:675-695`).
+  - Reserved global slots `TIME_OFF=0`, `DT_OFF=1`, `INITIAL_TIME_OFF=2`, `FINAL_TIME_OFF=3`, `IMPLICIT_VAR_COUNT=4` — `vm.rs:83-87`.
+- **`CompiledSimulation` shape** (`vm.rs:132-140`), all fields `pub(crate)` (the in-crate `wasmgen` module reads them directly):
+  - `modules: HashMap<ModuleKey, CompiledModule>`, `specs: Specs`, `root: ModuleKey`, `offsets: HashMap<Ident<Canonical>, usize>` (the global var-name→slot map — this becomes `WasmLayout.var_offsets`), plus a private `cached_constant_info` (ignore until Phase 7).
+  - `ModuleKey = (Ident<Canonical>, BTreeSet<Ident<Canonical>>)` (`vm.rs:24`).
+  - `CompiledModule` (`bytecode.rs:4616`): `ident`, `n_slots: usize`, `context: Arc<ByteCodeContext>`, `compiled_initials: Arc<Vec<CompiledInitial>>`, `compiled_flows: Arc<ByteCode>`, `compiled_stocks: Arc<ByteCode>`.
+  - `ByteCode { literals: Vec<f64>, code: Vec<Opcode> }` (`bytecode.rs:1702`). **`literals` live inside each `ByteCode`**, not on `CompiledModule`. `CompiledInitial { ident, offsets: Vec<usize>, bytecode: ByteCode }` (`bytecode.rs:4603`) — initials are a **vector of per-variable programs**, each its own `ByteCode`.
+  - `Specs` (`results.rs:22`): `start`, `stop`, `dt`, `save_step`, `method: Method`, `n_chunks: usize`. `Method` is `Euler | RungeKutta2 | RungeKutta4`.
+- **The opcode programs are un-fused.** `fuse_three_address` runs inside `Vm::new` (`vm.rs:397`), *after* `CompiledSimulation` is produced, on the VM's private execution copy. A `CompiledSimulation` consumer only ever sees the plain opcode set above — never `BinVarVar`, `AssignConstCurr`, etc. The emitter does not need to handle the fused/superinstruction opcodes; if one is ever encountered, return `WasmGenError::Unsupported`.
+- **DLR-FT oracle pattern** (used by every wasm-executing test), confirmed verbatim at `wasmgen/module.rs:392-422`:
+  ```rust
+  use checked::Store;
+  use wasm::validate;
+  let info = validate(&wasm_bytes).expect("module must validate");
+  let mut store = Store::new(());
+  let inst = store.module_instantiate(&info, Vec::new(), None).expect("instantiate").module_addr;
+  let run = store.instance_export(inst, "run").unwrap().as_func().unwrap();
+  store.invoke_simple_typed::<(), ()>(run, ()).expect("run wasm");
+  let mem = store.instance_export(inst, "memory").unwrap().as_mem().unwrap();
+  let data: Vec<f64> = store.mem_access_mut_slice(mem, |bytes| { /* read f64 LE at byte offsets */ });
+  ```
+- **Visibility latitude (per the repo owner):** widen any engine item to `pub(crate)` — or `pub` where the `tests/` parity harness (a crate-external target) needs it — wherever it produces a cleaner backend. The repo has no external API consumers; breaking changes are fine if tests pass. Do not contort the design to avoid touching visibility. (`compile_project_incremental`, `db::sync_from_datamodel_incremental`, `SimlinDb`, `Results`, and the new `compile_simulation`/`WasmArtifact`/`WasmLayout` must be reachable from `tests/`; make them `pub`.)
+- **TDD, 95%+ coverage, inline `#[cfg(test)] mod tests`.** Each unit test that executes wasm builds a tiny module, runs it under the DLR-FT interpreter, and asserts on memory/return values. Keep each test < 2s (the suite runs under a 3-minute wall-clock cap; `docs/dev/rust.md:13-17`). Run the engine tests with `cargo test -p simlin-engine --features file_io` (the corpus tests are gated on `file_io`; bare `cargo test`/`cargo test --workspace` also activate it via workspace feature unification).
+- **Addressing scheme (uniform across all phases, module-ready).** The per-program wasm functions take a single `i32` parameter `module_off` (slot base of this module instance within a chunk; `0` for the root in Phase 1). A module-relative slot `off` resolves to byte address `chunk_base + (module_off + off) * 8`, emitted as: push the dynamic part `local.get module_off; i32.const 8; i32.mul`, then `f64.load`/`f64.store` with `memarg.offset = chunk_base + off*8` (a compile-time constant) and `memarg.align = 3`. An **absolute global** slot (`LoadGlobalVar`, slots 0..4) ignores `module_off`: `i32.const 0; f64.load memarg{offset: chunk_base + off*8}`. Using `module_off` from Phase 1 (always 0 for the root) avoids a Phase 7 rewrite. `chunk_base` is `curr_base` for `LoadVar`/`LoadGlobalVar`/`AssignCurr`, `next_base` for `AssignNext`.
+
+---
+
+<!-- START_SUBCOMPONENT_A (tasks 1-4) -->
+
+<!-- START_TASK_1 -->
+### Task 1: Scalar-core opcode emitter (`wasmgen/lower.rs`)
+
+**Verifies:** wasm-backend.AC2.1 (consumes bytecode opcodes, not `Expr`); wasm-backend.AC1.4 (unsupported opcodes return a clean `WasmGenError::Unsupported`); wasm-backend.AC1.5 (raw `Op2::Div` by zero).
+
+**Files:**
+- Create: `src/simlin-engine/src/wasmgen/lower.rs`
+- Modify: `src/simlin-engine/src/wasmgen/mod.rs` (add `mod lower;`)
+- Test: inline `#[cfg(test)] mod tests` in `wasmgen/lower.rs`
+
+**Implementation:**
+Create the per-opcode emitter that walks a `&crate::bytecode::ByteCode` and appends wasm instructions to a `wasm_encoder::Function`, mirroring `eval_bytecode` (`vm.rs:1257+`). Reuse the POC's `EmitCtx`/`memarg`/`f64_const` helpers (currently in `wasmgen/expr.rs`) but generalize `EmitCtx` to carry `module_off` handling per the addressing scheme above.
+
+Define:
+```rust
+pub(crate) struct EmitCtx {
+    pub curr_base: u32,   // byte offset of slot 0 of the curr chunk
+    pub next_base: u32,   // byte offset of slot 0 of the next chunk
+    pub dt: f64,
+    pub start_time: f64,
+    pub final_time: f64,
+    pub module_off_local: u32, // wasm local index holding this instance's module_off (i32)
+}
+```
+
+`pub(crate) fn emit_bytecode(bc: &ByteCode, ctx: &EmitCtx, f: &mut Function) -> Result<(), WasmGenError>`:
+walk `bc.code` in order; for each `Opcode` emit wasm. A scratch f64 local (reserved by the caller; pass its index in `EmitCtx` or as an arg) is needed for `AssignCurr`/`AssignNext` (the value is already on the wasm stack and the store address must be pushed under it).
+
+Per-opcode lowering (Phase 1 supported set; everything else → `WasmGenError::Unsupported(format!(...))`):
+
+| Opcode | wasm emitted |
+|---|---|
+| `LoadConstant { id }` | `f64.const bc.literals[id as usize]` |
+| `LoadVar { off }` | address(`curr_base`, `off`, dynamic `module_off`); `f64.load` |
+| `LoadGlobalVar { off }` | `i32.const 0; f64.load memarg{curr_base + off*8}` (absolute, no `module_off`) |
+| `Op2 { op }` | operands already on stack. `Add/Sub/Mul/Div` → `f64.add/sub/mul/div`. `Gt/Gte/Lt/Lte` → `f64.gt/ge/lt/le` then convert the i32 0/1 to f64 (`f64.convert_i32_u`) so booleans stay f64 1.0/0.0 like the VM. `Eq/And/Or/Mod/Exp` → `Unsupported` (Phase 2). |
+| `Not {}` | operand on stack; truthiness-negate. Phase 1 uses simple `value == 0.0` (`f64.const 0.0; f64.eq; f64.convert_i32_u`), matching the POC; Phase 2 routes through the `approx_eq` helper. |
+| `SetCond {}` | pop the f64 condition; reduce to i32 truthiness (Phase 1: `f64.const 0.0; f64.ne` → i32) and `local.set` into a reserved i32 "condition" local. |
+| `If {}` | the two arm values (`t` then `f`) are already on the wasm stack from preceding opcodes; emit `local.get <cond_local>; select`. wasm `select` pops `[t, f, cond_i32]` and yields `t` if `cond != 0` else `f` — exactly the VM's `If` (`push(if condition { t } else { f })`). |
+| `AssignCurr { off }` | pop value into the scratch f64 local; emit address(`curr_base`, `off`, `module_off`); `local.get scratch`; `f64.store`. |
+| `AssignNext { off }` | same as `AssignCurr` but `next_base`. |
+| `Ret` | emit nothing (the wasm function's `End` is emitted by the caller). |
+
+**Critical correctness notes** (all confirmed against the VM):
+- `SetCond` is a *separate opcode* that sets a condition register read by `If`; they are always emitted adjacently by codegen but the emitter must reserve a dedicated i32 local for the condition. Nesting: an inner `If` can occur between an outer `SetCond` and its `If`, so use a **stack of condition locals** (push on `SetCond`, pop on `If`) rather than a single local, to be safe — confirm against `compiler/codegen.rs:1153-1159` that emission is well-nested; if codegen guarantees `SetCond` immediately precedes its `If` with no intervening `SetCond`, a single local suffices. Default to the local-stack to be robust.
+- `Op2` operand order: the VM pops `r` then `l` and computes `l op r`; wasm leaves them in push order `[l, r]` on the stack, so `f64.sub`/`f64.div` (non-commutative) are already correct.
+- Comparisons must yield f64 `1.0`/`0.0` (not raw i32), because downstream opcodes consume them as f64.
+
+**Testing:**
+Hand-build small `ByteCode` values (`ByteCode { literals, code }` — fields are `pub(crate)`, reachable in-crate) wrapping each opcode/sequence, wrap in a one-function test module that exports `eval`/`mem` (mirror the harness in the current `wasmgen/expr.rs:300-396`), execute under the DLR-FT interpreter, and assert. Cover:
+- wasm-backend.AC2.1: each scalar-core opcode (`LoadConstant`, `LoadVar`, `LoadGlobalVar`, every supported `Op2`, `Not`, `SetCond`+`If` true/false, `AssignCurr`, `AssignNext`) lowers and produces the value/store the VM's `eval_op2`/handler produces.
+- `If` selecting the correct arm for truthy and zero conditions; nested `If`.
+- wasm-backend.AC1.5: raw `Op2::Div` by zero matches the VM (`x/0` → ±Inf, `0/0` → NaN — IEEE-identical, since wasm `f64.div` matches Rust `f64`).
+- wasm-backend.AC1.4: unsupported opcodes (`Op2::Eq`, `Op2::Mod`, `Apply`, `Lookup`, an array opcode) return `WasmGenError::Unsupported` (a clean error, never a panic).
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io wasmgen::lower`
+Expected: all new tests pass.
+
+**Commit:** `engine: wasmgen scalar-core opcode emitter over bytecode`
+<!-- END_TASK_1 -->
+
+<!-- START_TASK_2 -->
+### Task 2: `compile_simulation` — whole-model assembly (root, Euler)
+
+**Verifies:** wasm-backend.AC2.1, wasm-backend.AC4.1, wasm-backend.AC7.4 (Euler portion).
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/module.rs` (add the new `compile_simulation` path + `WasmArtifact`/`WasmLayout`; the old `compile_module(&Module, &Specs)` is removed in Task 3)
+- Modify: `src/simlin-engine/src/wasmgen/mod.rs` (export the new types/fn)
+- Test: inline `#[cfg(test)] mod tests` in `wasmgen/module.rs`
+
+**Implementation:**
+Add the public contract types and entry point. Place the types in `mod.rs` (or `module.rs` and re-export); make them `pub`:
+```rust
+pub struct WasmArtifact {
+    pub wasm: Vec<u8>,
+    pub layout: WasmLayout,
+}
+
+pub struct WasmLayout {
+    pub n_slots: usize,
+    pub n_chunks: usize,
+    pub results_offset: usize,             // byte offset of the results region
+    pub var_offsets: Vec<(String, usize)>, // canonical variable name -> slot offset
+}
+
+pub fn compile_simulation(sim: &CompiledSimulation) -> Result<WasmArtifact, WasmGenError>;
+```
+
+`compile_simulation` (Phase 1 supports the root module only, Euler only):
+1. Look up the root `CompiledModule` via `sim.modules.get(&sim.root)`. Return `Unsupported` if `sim.specs.method != Method::Euler`. Return `Unsupported` if the root has any nested modules (`root.context.modules` non-empty) — modules land in Phase 7.
+2. Compute layout: `n_slots = root.n_slots`, `n_chunks = sim.specs.n_chunks`, `stride = n_slots*8`, `curr_base = 0`, `next_base = stride`, `results_base = 2*stride`, `pages = ceil((results_base + n_chunks*stride)/65536)`. (Mirror the POC's `compile_module`, `module.rs:72-85`.) `save_every = max(1, round(save_step/dt))`.
+3. Emit three wasm functions over the shared linear memory, each `(module_off: i32) -> ()`:
+   - **initials**: for each `CompiledInitial` in `root.compiled_initials`, `emit_bytecode(&ci.bytecode, ...)` in order.
+   - **flows**: `emit_bytecode(&root.compiled_flows, ...)`.
+   - **stocks**: `emit_bytecode(&root.compiled_stocks, ...)`.
+   Each function reserves the scratch f64 local + condition i32 local(s) the emitter needs.
+4. Emit the **`run`** function (`() -> ()`): seed `curr[TIME_OFF]=start`, `curr[DT_OFF]=dt`, `curr[INITIAL_TIME_OFF]=start`, `curr[FINAL_TIME_OFF]=stop`; `call initials(0)`; then the Euler loop mirroring `vm.rs:698-711` + `save_advance!` (`vm.rs:675-695`): each step call `flows(0)` then `stocks(0)`, force-save the t=start sample then every `save_every` steps, write the full `curr` row (all `n_slots`) into `results[saved]`, advance stocks `next→curr` and `time += dt`, stop after `n_chunks` saves or when `time > stop`. The POC's `emit_run` (`module.rs:172-286`) is a correct reference for this control-flow shape — adapt it to call the three opcode-emitted functions instead of inlining `Expr` lowering, and to derive the stock copy-back offsets from the `AssignNext` opcodes in `root.compiled_stocks` (collect their `off`, analogous to the POC's `collect_assign_next_offsets`, `module.rs:139-147`).
+5. Assemble the module (Type/Function/Memory/Global/Export/Code sections per the POC's `assemble`, `module.rs:293-338`): export `memory`, `run`, and three immutable i32 globals `n_slots`/`n_chunks`/`results_offset` (= `results_base`). With multiple functions, emit a type section entry for `(i32)->()` and `()->()`, a function section indexing them, and export `run` by its function index.
+6. Build `WasmLayout`: `var_offsets = sim.offsets.iter().map(|(k,v)| (k.as_str().to_string(), *v)).collect()`; `n_slots`, `n_chunks`, `results_offset = results_base`.
+
+**Testing:**
+- wasm-backend.AC2.1 + AC7.4(Euler): build a `CompiledSimulation` for a small scalar Euler model via `compile_project_incremental` (mirror `wasmgen/module.rs:367-373`) — e.g. the `default_projects/population/model.xmile` already used by the POC test, and 1-2 hand-built scalar models via `TestProject` (`src/simlin-engine/src/test_common.rs`). Run the blob under DLR-FT, read the step-major slab, and assert every shared variable's full series matches `Vm::new(sim).run_to_end().into_results()` (reuse the comparison shape from `module.rs:425-457`). Assert `step_count == n_chunks` and the saved cadence matches.
+- wasm-backend.AC4.1: a dedicated test reads the three exported i32 globals from the instantiated module (via the `checked` crate's `instance_export(inst, "n_slots").as_global()` accessor) and verifies they equal the `WasmLayout` values; then uses `results_offset`/`n_slots`/`n_chunks` (read from the module, no external metadata) to stride to one variable's series and confirm it matches the VM.
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io wasmgen::module`
+Expected: all new tests pass.
+
+**Commit:** `engine: wasmgen compile_simulation (root, Euler) over CompiledSimulation`
+<!-- END_TASK_2 -->
+
+<!-- START_TASK_3 -->
+### Task 3: Reroute the datamodel entry point; remove the `Expr`-based path
+
+**Verifies:** wasm-backend.AC2.1.
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/module.rs` (replace `compile_datamodel_to_wasm` body; remove `compile_module(&Module, &Specs)` and the `collect_assign_next_offsets(&[Expr])`/`store_curr_const`/`emit_run` helpers that consumed `Expr`/`compiler::Module`)
+- Delete: `src/simlin-engine/src/wasmgen/expr.rs`
+- Modify: `src/simlin-engine/src/wasmgen/mod.rs` (remove `mod expr;`, update `pub use`)
+
+**Implementation:**
+Rewrite `compile_datamodel_to_wasm(datamodel, model_name) -> Result<Vec<u8>, WasmGenError>` to go through the salsa pipeline and the new entry point (this is what makes AC2.1 true end-to-end and removes the only production use of `compiler::Module`):
+```rust
+pub fn compile_datamodel_to_wasm(datamodel: &crate::datamodel::Project, model_name: &str)
+    -> Result<Vec<u8>, WasmGenError>
+{
+    let mut db = crate::db::SimlinDb::default();
+    let sync = crate::db::sync_from_datamodel_incremental(&mut db, datamodel, None);
+    let sim = crate::db::compile_project_incremental(&db, sync.project, model_name)
+        .map_err(|e| WasmGenError::Unsupported(format!("wasmgen: incremental compile failed: {e:?}")))?;
+    Ok(compile_simulation(&sim)?.wasm)
+}
+```
+(The `WasmLayout` is dropped here; Phase 7 changes the FFI to surface it. Keep this function's signature stable so `libsimlin` and the `wasm-backend-poc.mjs` exploratory script keep building.)
+
+Delete `wasmgen/expr.rs` entirely (its `Expr`-tree lowering is replaced by `lower.rs`'s opcode emitter). Move the still-needed shared helpers (`memarg`, `f64_const`) into `lower.rs` if not already there. Replace the old `population_wasm_matches_vm` test so it builds the wasm via `compile_simulation(&compiled)` (the same `compiled` it already produces for the VM golden at `module.rs:369-373`) rather than `compile_module(&module, &specs)`; drop the monolithic `compiler::Module::new` usage from the test.
+
+**Testing:**
+- The rerouted `population_wasm_matches_vm` (now compiling via `compile_simulation`) passes.
+- Add a test that `compile_datamodel_to_wasm` returns a non-empty blob for the population model and that the blob validates under `wasm::validate`.
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io wasmgen`
+Expected: all wasmgen tests pass; `wasmgen/expr.rs` no longer exists; no references to `crate::compiler::Module` remain in `wasmgen/`.
+
+**Commit:** `engine: route wasmgen through compile_simulation; drop Expr path`
+<!-- END_TASK_3 -->
+
+<!-- START_TASK_4 -->
+### Task 4: Revert the monolithic-compiler `#[cfg(test)]` un-gating
+
+**Verifies:** wasm-backend.AC2.2.
+
+**Files:**
+- Modify: `src/simlin-engine/src/compiler/mod.rs`
+
+**Implementation:**
+The POC removed `#[cfg(test)]` from the monolithic builder so the `Expr`-based wasmgen could use it in production. Now that wasmgen consumes `CompiledSimulation`, re-gate it (restoring `main`'s state). Re-add `#[cfg(test)]` to:
+- the four imports the POC un-gated at `compiler/mod.rs:16-29` (`use crate::common::{Error, ErrorCode, ErrorKind};`, `use crate::model::ModelStage1;`, `use crate::project::Project;`, `use crate::vm::IMPLICIT_VAR_COUNT;` — confirm exact set against `git diff main -- src/simlin-engine/src/compiler/mod.rs`),
+- `calc_module_model_map` (`mod.rs:2660`, currently `pub(crate) fn`),
+- `build_metadata` (`mod.rs:2694`, currently `pub(crate) fn`),
+- `calc_n_slots` (`mod.rs:2830`, currently bare-private `fn`),
+- the `impl Module { fn new }` block (`mod.rs:2849`, `pub(crate) fn new`).
+
+Use `git diff main -- src/simlin-engine/src/compiler/mod.rs` to see precisely what the POC changed and invert exactly that diff (do **not** touch the separate pre-existing `#[cfg(test)] impl Module` test-helper block at `mod.rs:3046`, nor the non-test `impl Module { pub fn compile() }` at `mod.rs:2839`).
+
+**Testing:**
+This is a visibility/gating revert verified operationally (no new behavior; **Verifies: AC2.2** via build state). The existing `#[cfg(test)]` users of `Module::new` (and the test suite) continue to compile.
+
+**Verification:**
+Run: `cargo build -p simlin-engine` — builds with the four items test-only again (a non-test build no longer references them).
+Run: `cargo test -p simlin-engine --features file_io` — compiles and passes (test code still reaches the now-`#[cfg(test)]` builder).
+Run: `git diff main -- src/simlin-engine/src/compiler/mod.rs` — shows only the re-gating (the POC's un-gating is fully inverted).
+
+**Commit:** `engine: re-gate monolithic compiler builder to test-only`
+<!-- END_SUBCOMPONENT_A -->
+<!-- END_TASK_4 -->
+
+<!-- START_SUBCOMPONENT_B (tasks 5-6) -->
+
+<!-- START_TASK_5 -->
+### Task 5: `ensure_wasm_matches` parity helper
+
+**Verifies:** wasm-backend.AC1.1, wasm-backend.AC3.1.
+
+**Files:**
+- Modify: `src/simlin-engine/tests/test_helpers.rs` (add the helper + a `WasmRunOutcome` type; add the `checked`/`wasm` imports)
+- (If `compile_simulation`/`WasmArtifact`/`WasmLayout`/`sync_from_datamodel_incremental`/`compile_project_incremental`/`SimlinDb` are not `pub`, widen them to `pub` so this `tests/` target can call them.)
+
+**Implementation:**
+Add a helper that compiles a model to wasm, runs it under the DLR-FT interpreter, builds a `Results` from the step-major slab, and compares it to the model's expected outputs with the **existing** comparator (`ensure_results_excluding`, `test_helpers.rs:62`) — the same check the VM passes. There is no separate wasm-vs-VM threshold (per the design's validation bar); "wasm-vs-VM parity" is achieved because both clear the same comparator against the same expected outputs.
+
+```rust
+pub enum WasmRunOutcome { Ran, Skipped(String) }   // Skipped carries the Unsupported message
+
+pub fn ensure_wasm_matches(
+    datamodel: &simlin_engine::datamodel::Project,
+    model_name: &str,
+    expected: &simlin_engine::Results,
+    excluded: &[&str],
+) -> WasmRunOutcome
+```
+Steps:
+1. Build `CompiledSimulation` exactly as the VM corpus path does (`simulate.rs:105-111` `compile_vm`): `SimlinDb::default()` → `sync_from_datamodel_incremental` → `compile_project_incremental(&db, sync.project, model_name)`. (If the incremental compile itself errors, that is a VM-side issue already covered elsewhere — return `Skipped` with the message rather than failing here.)
+2. `let artifact = match simlin_engine::wasmgen::compile_simulation(&sim) { Ok(a) => a, Err(WasmGenError::Unsupported(m)) => return WasmRunOutcome::Skipped(m) };`
+3. Instantiate `artifact.wasm` under `checked::Store`, invoke `run`, and read the results region. Read geometry from `artifact.layout` (`n_slots`, `n_chunks`, `results_offset`) — copy `n_chunks * n_slots` f64 from `results_offset`.
+4. Build a `simlin_engine::Results`: `offsets` from `artifact.layout.var_offsets` (map each `String` back to `Ident<Canonical>` via the canonicalizing constructor), `data` = the slab (boxed), `step_size = n_slots`, `step_count = n_chunks`, `specs = sim.specs.clone()`, `is_vensim = false`.
+5. `ensure_results_excluding(expected, &wasm_results, excluded);` (panics on mismatch — a supported model producing wrong wasm fails loudly). Return `WasmRunOutcome::Ran`.
+
+**Testing:**
+This helper is exercised by Task 6's corpus wiring and by a focused unit test here: call `ensure_wasm_matches` on one tiny scalar model (build its `expected` from the VM) and assert it returns `Ran`; call it on a model using an unsupported construct (e.g. a builtin/`Apply`) and assert it returns `Skipped`. (AC1.1: a supported model clears `ensure_results`; AC3.1: an unsupported model is skipped, not failed.)
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io --test simulate ensure_wasm_matches`
+Expected: helper unit tests pass.
+
+**Commit:** `engine: add ensure_wasm_matches parity helper`
+<!-- END_TASK_5 -->
+
+<!-- START_TASK_6 -->
+### Task 6: Wire the corpus through both backends + the rising floor gate
+
+**Verifies:** wasm-backend.AC1.1, wasm-backend.AC3.1, wasm-backend.AC4.1.
+
+**Files:**
+- Modify: `src/simlin-engine/tests/simulate.rs`
+
+**Implementation:**
+1. **Inline hook:** in `simulate_path_with_excluding` (`simulate.rs:843-915`), after the existing VM `ensure_results_excluding` comparisons pass, call `ensure_wasm_matches(&datamodel, "main", &expected, excluded)` once per model. A `Ran` outcome means the wasm output already cleared `ensure_results` inside the helper (a supported-but-wrong model panics there); a `Skipped` outcome is recorded, not failed. Do the same in the `.mdl` path (`simulate_mdl_path*`). Do **not** add the hook to `run_clearn_vs_vdf`/`simulates_clearn` or other `#[ignore]` heavy-model paths — those get `#[ignore]`d wasm twins in Phase 8 (the DLR-FT interpreter is slow; keep the default suite under the 3-minute cap).
+2. **Floor gate:** add `const WASM_SUPPORTED_FLOOR: usize = <observed>;` and a `#[test] fn wasm_parity_floor()` that iterates the small/medium corpus list (`TEST_MODELS`, `simulate.rs:22-101`, skipping any entry that is itself `#[ignore]`-class/heavy), runs each through `ensure_wasm_matches` (building `expected` from the VM via the existing parse+`compile_vm`+run path), counts `Ran`, and asserts `ran >= WASM_SUPPORTED_FLOOR`. Set `WASM_SUPPORTED_FLOOR` to the count Phase 1 actually achieves (run the test once, observe, pin it). Document with a comment that each subsequent phase raises this floor and that dropping below it is a regression (AC3.1 / AC3.3). Keep the gate's total runtime within budget — if iterating all of `TEST_MODELS` under the interpreter is too slow, restrict the gate to a representative scalar subset and note it; the per-model inline hook still covers the rest functionally.
+
+**Testing:**
+The gate test *is* the test. Also confirm (manually, noted in the commit) that at least one scalar model reports `Ran` and that introducing a deliberate `Unsupported` (temporarily) lowers the count — i.e. the floor would catch a regression.
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io --test simulate wasm_parity_floor`
+Expected: passes with `ran >= WASM_SUPPORTED_FLOOR`.
+Run: `cargo test -p simlin-engine --features file_io --test simulate`
+Expected: the full corpus passes (VM unchanged; supported models also clear wasm; unsupported models skip).
+
+**Commit:** `engine: run corpus through wasm backend with rising floor gate`
+<!-- END_TASK_6 -->
+<!-- END_SUBCOMPONENT_B -->
+
+---
+
+## Phase 1 Done When
+- Scalar Euler corpus models match the VM through wasm (clearing the existing `ensure_results` comparator); unsupported models skip cleanly via `WasmGenError::Unsupported`.
+- The floor gate (`wasm_parity_floor`) is active and pinned.
+- The monolithic builder is re-gated to `#[cfg(test)]`; `cargo build -p simlin-engine` and `cargo test -p simlin-engine --features file_io` both pass.
+- The blob is self-describing (exports `n_slots`/`n_chunks`/`results_offset`, step-major results) and a test reads geometry from the module to stride results (AC4.1).
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_02.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_02.md
new file mode 100644
index 000000000..36f50fa19
--- /dev/null
+++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_02.md
@@ -0,0 +1,185 @@
+# WebAssembly Simulation Backend — Phase 2: Full scalar builtins + numeric parity
+
+**Goal:** Bring every scalar `BuiltinId` and `Op2` to VM parity: open-code the transcendentals wasm lacks, route equality/truthiness through a wasm `approx_eq` helper that matches `crate::float::approx_eq` exactly, and lower `Mod`/`Exp` and the composed builtins (`Step`/`Pulse`/`Ramp`/`Sshape`/`Sign`/`Quantum`/`SafeDiv`) to faithful f64 sequences.
+
+**Architecture:** Builds on Phase 1's opcode emitter (`wasmgen/lower.rs`). Math wasm provides natively (`f64.abs`/`sqrt`/`floor`/`min`/`max`/arithmetic/compares) maps to the instruction directly; the transcendentals (`sin`/`cos`/`tan`/`asin`/`acos`/`atan`/`exp`/`ln`/`log10`/`pow`) are emitted once each as self-contained wasm helper functions (range reduction + polynomial) and called by name — the blob needs no math imports. Equality and truthiness route through a single emitted `approx_eq` helper so the backend takes the same branch the VM takes.
+
+**Tech Stack:** Rust; `wasm-encoder` (multi-function modules, `call`); the DLR-FT interpreter oracle; `crate::float::approx_eq` (`float_cmp` 0.10) as the equality reference; the VM's `apply()` (`vm.rs:2938-3012`) and `eval_op2` (`vm.rs:94-111`) as the builtin/operator spec.
+
+**Scope:** Phase 2 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`.
+
+**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`).
+
+---
+
+## Acceptance Criteria Coverage
+
+### wasm-backend.AC1
+- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` against the model's expected outputs at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.)
+- **wasm-backend.AC1.5 Edge:** Empty-view reducers, out-of-bounds subscripts, and division-by-zero produce the same NaN / finite-`:NA:` / Inf values the VM produces. *(Phase 2 covers the finite-`:NA:`-sentinel-vs-genuine-NaN distinction via the `approx_eq` helper; the empty-reducer/OOB portions complete in Phase 5.)*
+
+### wasm-backend.AC7
+- **wasm-backend.AC7.1 Success:** Math wasm provides natively (`sqrt`, `abs`, `floor`/`ceil`/`trunc`/`nearest`, `min`/`max`, arithmetic) uses wasm instructions; the transcendentals wasm lacks (`sin`/`cos`/`tan`/`asin`/`acos`/`atan`/`exp`/`ln`/`log10`/`pow`) and the allocation `erfc` are open-coded as self-contained wasm helper functions (range reduction + polynomial). Each open-coded helper has a unit test comparing its output to Rust `f64` over a sampled range; results need not be bit-identical to the VM's libm — only close enough that the existing tests pass. *(The allocation `erfc`/`normal_cdf` helpers land in Phase 6; Phase 2 covers the scalar transcendentals.)*
+- **wasm-backend.AC7.2 Success:** Equality and truthiness (`Eq`/`Neq`/`And`/`Or`/`If` condition) use ULP-based `approx_eq` matching the VM.
+- **wasm-backend.AC7.3 Edge:** `Mod` matches the VM's `rem_euclid` semantics (computed via wasm `floor`). `Max`/`Min` use the wasm `f64.max`/`f64.min` instructions; if a corpus test surfaces a NaN/±0 difference from the VM's compare-based form, fall back to explicit compare-and-select for that case.
+
+---
+
+## Notes for the implementer (read first)
+
+- **Confirmed enums.** `Op2` (`bytecode.rs:527`): `Add, Sub, Exp, Mul, Div, Mod, Gt, Gte, Lt, Lte, Eq, And, Or` (no `Neq`). `BuiltinId` (`bytecode.rs:500`): `Abs, Arccos, Arcsin, Arctan, Cos, Exp, Inf, Int, Ln, Log10, Max, Min, Pi, Pulse, Quantum, Ramp, SafeDiv, Sign, Sin, Sshape, Sqrt, Step, Tan`. **There is no `Mean` and no `IsModuleInput` `BuiltinId`** — scalar `MEAN(a,b,…)` is lowered by codegen to `(0+a+b+…)/N` using `Op2::Add`/`Op2::Div` (already handled by Phase 1/2), single-arg `MEAN(array)` becomes `ArrayMean` (Phase 5), and `IsModuleInput` is resolved to a `LoadConstant 1.0/0.0` at codegen. So the backend never sees a `Mean`/`IsModuleInput` opcode.
+- **`Apply` always pops exactly 3 operands** (codegen pads: 1-arg builtins with two `LoadConstant 0.0`; 2-arg with one; `Ramp` pads its end-time with `LoadGlobalVar{FINAL_TIME_OFF}`). So lower `Apply{func}` by popping the 3 stack values into three scratch f64 locals `a`, `b`, `c` (top is `c`), reading `time = curr[TIME_OFF]`/`dt = curr[DT_OFF]` from memory when the builtin needs them, computing per `apply()` (`vm.rs:2938-3012`), and pushing the result.
+- **`apply()` exact sequences** (mirror verbatim, `vm.rs:2938-3012`): `Abs=a.abs()`, `Sqrt=a.sqrt()`, `Int=a.floor()` (**floor, not trunc**), `Min={if a<b {a} else {b}}`, `Max={if a>b {a} else {b}}`, `Sign={if a>0 {1} else if a<0 {-1} else {0}}`, `Quantum={if b==0.0 {a} else {(a/b).trunc()*b}}`, `SafeDiv={if b != 0.0 {a/b} else {c}}` (**exact `!= 0.0`, not approx**), `Sshape=b + (c-b)/(1.0 + (-4.0*(2.0*a-1.0)).exp())`, `Exp=a.exp()`, `Ln=a.ln()`, `Log10=a.log10()`, `Sin/Cos/Tan/Arcsin/Arccos/Arctan` = the libm calls, `Inf=f64::INFINITY`, `Pi=PI`, `Step=step(time,dt,a,b)`, `Pulse=pulse(time,dt,a,b,c)`, `Ramp=ramp(time,a,b,Some(c))`. Helper bodies: `step` (`vm.rs:3027`): `if time + dt/2.0 > step_time {height} else {0.0}`; `ramp` (`vm.rs:3014`): `if time > start {if end.is_some() && time>=end {slope*(end-start)} else {slope*(time-start)}} else {0.0}`; `pulse` (`vm.rs:3036`): a `while` loop — emit it as a wasm helper function with a loop.
+- **`eval_op2`** (`vm.rs:94-111`): `Exp=l.powf(r)`, `Mod=l.rem_euclid(r)`, `Eq=approx_eq(l,r) as f64`, `And=(is_truthy(l)&&is_truthy(r)) as f64`, `Or=(is_truthy(l)||is_truthy(r)) as f64`. The rest (`Add/Sub/Mul/Div/Gt/Gte/Lt/Lte`) are Phase 1.
+- **`approx_eq` is `float_cmp::approx_eq!(f64, a, b)`** with `float-cmp` 0.10.0 defaults `epsilon = f64::EPSILON`, `ulps = 4`. Exact algorithm (must be reproduced bit-faithfully in wasm; confirmed by reading the crate):
+  - `a == b` → true (handles ±inf and exact equality), OR
+  - `(a-b).abs() <= f64::EPSILON` → true, OR
+  - `|ulps_diff(a,b)| <= 4` → true,
+  where `ulps_diff(a,b) = ordered(a).wrapping_sub(ordered(b))` as `i64` (then `saturating_abs`), and `ordered(f) = { let bits = f.to_bits() as i64; if (bits as u64) & (1<<63) != 0 { !bits ... } else { bits ^ (1<<63) } }` — i.e. map the sign-magnitude bit pattern to a monotonic ordered integer. Consequence: **`approx_eq(NaN, NaN) == true`** (identical bits → 0 ulps), and the finite `:NA:` sentinel (`crate::float::NA = -2^109`) compares unequal to ordinary values (its exponent is far from theirs). `is_truthy(n) = !approx_eq(n, 0.0)` (`vm.rs:89`).
+- **`pub(crate)`/`pub` latitude** (per the repo owner): widen visibility freely. Reuse the Rust `crate::float::approx_eq` in unit tests as the oracle for the wasm helper.
+- **TDD, inline `#[cfg(test)] mod tests`, < 2s per test.** Run: `cargo test -p simlin-engine --features file_io wasmgen`.
+
+---
+
+<!-- START_SUBCOMPONENT_A (tasks 1) -->
+<!-- START_TASK_1 -->
+### Task 1: `approx_eq` wasm helper + equality/truthiness routing
+
+**Verifies:** wasm-backend.AC7.2, wasm-backend.AC1.5 (the finite `:NA:` sentinel vs genuine NaN — `approx_eq` keeps them distinct).
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (the emitter) and the module-assembly code so the helper function is emitted once and callable.
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+1. Emit one wasm helper function `approx_eq(a: f64, b: f64) -> i32` (returns 1/0) reproducing the algorithm above using `i64.reinterpret_f64`, `i64` arithmetic (`wrapping_sub` is plain `i64.sub`; replicate `saturating_abs` and the `ordered` bit map), `f64.eq`, `f64.sub`/`f64.abs`, and `f64.const f64::EPSILON`. Reserve a function index for it (it joins the module's function table; later phases reuse it). Provide a small `pub(crate)` helper in the emitter that pushes two f64 operands and emits `call approx_eq`.
+2. Replace Phase 1's placeholder truthiness everywhere it matters:
+   - `Not {}`: `call approx_eq(value, 0.0)` → i32 `is_false`; logical-not (`i32.eqz`) → `is_truthy`; convert to f64 1.0/0.0. (i.e. `Not` pushes `(!is_truthy) as f64` = `is_false as f64`; mirror `vm.rs` `Not` = `(!is_truthy(pop)) as f64`.)
+   - `SetCond {}`: `is_truthy(pop) = approx_eq(pop, 0.0) == 0` → store the i32 into the condition local.
+   - `Op2::Eq`: `call approx_eq(l, r)` → i32 → `f64.convert_i32_u` (f64 1.0/0.0).
+   - `Op2::And`: `is_truthy(l) & is_truthy(r)` → f64; `Op2::Or`: `is_truthy(l) | is_truthy(r)` → f64. (Both operands are on the stack; compute `is_truthy` of each via `approx_eq(·,0.0); i32.eqz`, combine with `i32.and`/`i32.or`, convert to f64.)
+   - `If {}` condition: unchanged structurally (reads the condition local set by `SetCond`), but the local now holds the `approx_eq`-based truthiness.
+   `Neq` is not an `Op2` (codegen lowers it to `Eq`+`Not`), so routing `Eq` through `approx_eq` automatically makes `Neq` correct.
+
+**Testing:**
+- A unit test that emits a tiny module exporting `eq(a,b)->i32` wired to the `approx_eq` helper, runs it under DLR-FT for a curated + randomized sample of f64 pairs, and asserts the wasm result equals `crate::float::approx_eq(a,b)` for every pair. Sample must include: exact equal, far apart, 1–4 ULP apart, `f64::EPSILON`-apart around 1.0, around-zero (subnormals), `(NaN,NaN)`, `(NaN,1.0)`, `(NA, NA)`, `(NA, 0.0)`, `(+0.0,-0.0)`, `(±inf, ±inf)`.
+- Tests that `Op2::Eq`, `Op2::And`, `Op2::Or`, `Not`, and `SetCond`+`If` now match the VM's `eval_op2`/`is_truthy` for near-zero / ULP-adjacent operands where raw `==`/`!=0.0` would diverge.
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io wasmgen::lower`
+Expected: the `approx_eq`-parity tests pass.
+
+**Commit:** `engine: wasmgen approx_eq helper + equality/truthiness routing`
+<!-- END_TASK_1 -->
+<!-- END_SUBCOMPONENT_A -->
+
+<!-- START_SUBCOMPONENT_B (tasks 2-4) -->
+<!-- START_TASK_2 -->
+### Task 2: Open-coded transcendental helpers
+
+**Verifies:** wasm-backend.AC7.1.
+
+**Files:**
+- Create: `src/simlin-engine/src/wasmgen/math.rs` (the transcendental helper emitters) — or add to `lower.rs`; prefer a dedicated module for clarity.
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+Emit one self-contained wasm helper function per transcendental, each `(f64) -> f64` (or `(f64,f64)->f64` for `pow`), using range reduction + a polynomial/rational approximation. The blob imports no host math. There is no external library to integrate — this is standard numerical method work, validated against Rust `f64`. Recommended kernels (refine only if a corpus model needs more accuracy — the bar is the `simulate.rs` tolerances, abs `2e-3` / rel `5e-6` / VDF 1%):
+- `exp(x)`: reduce `x = k·ln2 + r`, `|r| <= ln2/2`; `exp(x) = 2^k · exp(r)` (poly in `r`); assemble `2^k` by composing the exponent bits (`i64`→`f64` via `f64.reinterpret_i64`). Handle overflow→`+inf`, underflow→`0`, `NaN`→`NaN`.
+- `ln(x)`: split `x = m · 2^e` with `m ∈ [1,2)` (decompose the f64 bits); `ln(x) = e·ln2 + ln(m)` (poly/`atanh` series in `(m-1)/(m+1)`). `x<0`→`NaN`, `x==0`→`-inf`.
+- `sin(x)`/`cos(x)`: reduce modulo `π/2` (Cody–Waite or a simple `k = round(x/(π/2))` with extended-precision subtraction), choose the kernel poly by `k mod 4`.
+- `atan(x)`: reduce using `atan(x) = π/2 - atan(1/x)` for `|x|>1` and a small-argument poly; sign symmetry.
+- Composed: `tan = sin/cos`; `pow(x,y) = exp(y·ln x)` (matches `powf` for `x>0`; **negative-base integer powers diverge** — note this as a known limitation, refine only if a corpus model uses it); `log10(x) = ln(x)·(1/ln10)`; `asin(x) = atan(x / sqrt(1-x²))` (with domain clamping at `|x|=1`); `acos(x) = π/2 - asin(x)`.
+
+Wire each `BuiltinId` transcendental in the `Apply` lowering (Task 4) to `call` the matching helper. Emit each helper at most once per module (lazily, recording its function index).
+
+**Testing:**
+Per AC7.1, **each helper gets a unit test comparing the emitted wasm output to Rust `f64` over a sampled range**: emit a module exporting the helper, run it under DLR-FT for a dense sample across the function's domain (and edge cases: 0, ±large, near asymptotes, the `asin`/`acos` endpoints, negative args for `ln`/`sqrt`/even roots), and assert `|wasm(x) - rust_f64(x)| <= tol` with a tol comfortably inside the `simulate.rs` tolerances (e.g. rel `1e-9`..`1e-6` depending on the function; document the chosen tol per helper and why it suffices). Include NaN/inf propagation assertions.
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io wasmgen::math`
+Expected: every transcendental helper's accuracy test passes.
+
+**Commit:** `engine: open-coded wasm transcendental helpers (exp/ln/sin/cos/atan + composed)`
+<!-- END_TASK_2 -->
+
+<!-- START_TASK_3 -->
+### Task 3: `Op2::Exp` and `Op2::Mod`
+
+**Verifies:** wasm-backend.AC7.3 (Mod), wasm-backend.AC1.1.
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (extend the `Op2` arm).
+
+**Implementation:**
+- `Op2::Exp`: operands `[l, r]` on stack → `call pow` (the Task 2 helper). Matches `l.powf(r)` for positive base.
+- `Op2::Mod`: compute `rem_euclid(l, r)` faithfully (do **not** use a plain truncated remainder). `r0 = l - r * (l / r).trunc()` (the `%` result, via `f64.div`, `f64.trunc`, `f64.mul`, `f64.sub`); then `if r0 < 0.0 { r0 + r.abs() } else { r0 }` (via `f64.lt`, `f64.abs`, `f64.add`, `select`). This reproduces Rust's `f64::rem_euclid` exactly (a result in `[0, |r|)`). (The design's "via floor" phrasing is approximate; the trunc-then-adjust form matches `rem_euclid` for negative divisors too.)
+
+**Testing:**
+- `Op2::Exp`: assert wasm matches `l.powf(r)` (via the VM) for a sample of positive bases and assorted exponents (integer, fractional, negative).
+- `Op2::Mod`: assert wasm matches `l.rem_euclid(r)` for the four sign combinations of `(l, r)` and non-integer operands; assert the result is always in `[0, |r|)`.
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io wasmgen::lower`
+Expected: Exp/Mod parity tests pass.
+
+**Commit:** `engine: wasmgen Op2 Exp (pow) and Mod (rem_euclid)`
+<!-- END_TASK_3 -->
+
+<!-- START_TASK_4 -->
+### Task 4: `Apply` lowering for the full `BuiltinId` set
+
+**Verifies:** wasm-backend.AC1.1, wasm-backend.AC7.1, wasm-backend.AC7.3 (Min/Max).
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (add the `Apply { func }` arm).
+
+**Implementation:**
+Add the `Apply { func }` arm: pop the 3 operands into scratch f64 locals `a`/`b`/`c` (top is `c`), then emit per `func`, reading `time`/`dt` from `curr[TIME_OFF]`/`curr[DT_OFF]` where needed (mirror `apply()` exactly):
+- Native f64 instr: `Abs`→`f64.abs(a)`, `Sqrt`→`f64.sqrt(a)`, `Int`→`f64.floor(a)`, `Max`→`f64.max(a,b)`, `Min`→`f64.min(a,b)`.
+  - **AC7.3 Min/Max note:** `f64.min`/`f64.max` differ from the VM's compare form (`if a>b {a} else {b}`) on NaN and ±0. Use the wasm instructions first; if a corpus test surfaces a NaN/±0 divergence, switch *that* op to the compare-and-select form `(a>b)?a:b` / `(a<b)?a:b` matching `apply()`.
+- Compare/arith composed: `Sign` (`a>0`→1, `a<0`→-1, else 0 via compares+selects), `Quantum` (`b==0.0`→`a` else `(a/b).trunc()*b` — exact `==`), `SafeDiv` (`b != 0.0`→`a/b` else `c` — exact `!=`), `Sshape` (`b + (c-b)/(1.0 + exp(-4.0*(2.0*a-1.0)))`, calling the `exp` helper).
+- Transcendental: `Exp/Ln/Log10/Sin/Cos/Tan/Arcsin/Arccos/Arctan` → `call` the Task 2 helpers on `a`.
+- Time-driven helpers: `Step` (`time + dt/2 > b ? a : 0`), `Ramp` (the `ramp(time, a, b, Some(c))` branch logic), `Pulse` (emit/`call` a `pulse(time, dt, volume, first, interval)` wasm helper containing the VM's `while` loop, `vm.rs:3036-3053`).
+- Constants: `Inf`→`f64.const INFINITY`, `Pi`→`f64.const PI`. (Codegen usually emits these as `LoadConstant`, but handle the `Apply` form too.)
+
+**Testing:**
+- Per-builtin unit tests: emit each `Apply{func}` over hand-built operand sequences, run under DLR-FT, assert equality with the VM's `apply(func, time, dt, a, b, c)` over representative inputs (including the edge values: `Int` of negatives (floor vs trunc), `Quantum` with `b==0`, `SafeDiv` with `b==0` and `b`=subnormal, `Sign(0)`, `Step`/`Ramp` across their breakpoints, `Pulse` across multiple intervals, `Sshape` across `[0,1]`).
+- AC7.1: the transcendental `Apply` arms produce values within the documented tolerance of Rust `f64`.
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io wasmgen::lower`
+Expected: all builtin parity tests pass.
+
+**Commit:** `engine: wasmgen Apply lowering for full scalar builtin set`
+<!-- END_TASK_4 -->
+<!-- END_SUBCOMPONENT_B -->
+
+<!-- START_TASK_5 -->
+### Task 5: Raise the floor; scalar-only corpus parity
+
+**Verifies:** wasm-backend.AC1.1.
+
+**Files:**
+- Modify: `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`).
+
+**Implementation:**
+With all scalar builtins/operators supported, more corpus models now run through wasm. Re-run the `wasm_parity_floor` gate, observe the new `Ran` count, and raise `WASM_SUPPORTED_FLOOR` to it. Any model that is purely scalar (no arrays, lookups, modules, RK, PREVIOUS/INIT) should now `Ran` and clear `ensure_results`. Models still using unsupported constructs (graphical functions, arrays, modules, RK2/RK4, PREVIOUS/INIT) remain `Skipped` until their phases land.
+
+**Testing:**
+The raised floor gate is the test. Confirm (note in the commit) that scalar models which were `Skipped` in Phase 1 due to `Eq`/builtins now `Ran`.
+
+**Verification:**
+Run: `cargo test -p simlin-engine --features file_io --test simulate`
+Expected: full corpus passes; `wasm_parity_floor` passes at the raised floor.
+
+**Commit:** `engine: raise wasm parity floor after full scalar builtins`
+<!-- END_TASK_5 -->
+
+---
+
+## Phase 2 Done When
+- All scalar-only corpus models match the VM through wasm (clearing `ensure_results`).
+- Unit tests cover each builtin, each transcendental helper (vs Rust `f64`), and the `approx_eq`/NaN/`:NA:` edge cases.
+- `Mod`=`rem_euclid`, `Exp`=`pow`, equality/truthiness via `approx_eq`; `Min`/`Max` via `f64.min`/`f64.max` (compare-fallback noted).
+- The floor gate is raised to the new supported count.
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_03.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_03.md
new file mode 100644
index 000000000..cb05d7334
--- /dev/null
+++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_03.md
@@ -0,0 +1,119 @@
+# WebAssembly Simulation Backend — Phase 3: Graphical functions (lookups)
+
+**Goal:** Bring the scalar `Lookup` opcode (Interpolate / Forward / Backward modes) to VM parity by laying the graphical-function tables into the blob's linear memory and emitting a shared lookup helper that mirrors the VM's three lookup functions exactly.
+
+**Architecture:** The `ByteCodeContext.graphical_functions` (a `Vec<Vec<(f64,f64)>>`) is serialized into a read-only region of the module's linear memory via an active wasm data segment, alongside a per-table directory (byte offset + point count). Three wasm helper functions — `lookup_interp`, `lookup_forward`, `lookup_backward` — reproduce `vm.rs`'s `lookup`/`lookup_forward`/`lookup_backward` (`vm.rs:3055-3186`) over a `(data_offset, count, index)` interface. The `Lookup { base_gf, table_count, mode }` opcode lowers to a runtime element-offset bounds check + a directory lookup + a `call` to the mode's helper. The interpolate kernel reuses Phase 2's `approx_eq` helper for the at-knot exact-hit test.
+
+**Tech Stack:** `wasm-encoder` `DataSection` (active data); the Phase 2 `approx_eq` helper; the VM lookup functions as spec.
+
+**Scope:** Phase 3 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`.
+
+**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`).
+
+---
+
+## Acceptance Criteria Coverage
+
+### wasm-backend.AC1
+- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` against the model's expected outputs at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.)
+
+### wasm-backend.AC7
+- **wasm-backend.AC7.1 Success:** Math wasm provides natively uses wasm instructions; the transcendentals … are open-coded as self-contained wasm helper functions … Each open-coded helper has a unit test comparing its output to Rust `f64` over a sampled range. *(For Phase 3 the relevant helpers are the lookup kernels; tested against the VM's `lookup`/`lookup_forward`/`lookup_backward`.)*
+
+---
+
+## Notes for the implementer (read first)
+
+- **Opcode** (`bytecode.rs:626-638`): `Lookup { base_gf: GraphicalFunctionId, table_count: u16, mode: LookupMode }`. `GraphicalFunctionId = u8` (`bytecode.rs:21`, so ≤256 tables/module). `LookupMode` (`bytecode.rs:45-55`): `Interpolate = 0`, `Forward = 1`, `Backward = 2`. Stack effect `(2,1)`.
+- **Stack discipline** (`vm.rs:1710-1731`): the opcode pops `lookup_index` first, then `element_offset` (so the producing opcodes pushed `element_offset` then `lookup_index`). Bounds check: `if element_offset < 0.0 || element_offset >= table_count as f64 { push NaN } else { gf_idx = base_gf + element_offset; dispatch mode }`. For the common scalar case codegen emits `LoadConstant 0.0` for `element_offset` (so it is 0), but **the lowering must handle a runtime element_offset** (arrayed scalar-`Lookup` selects a per-element table).
+- **Tables** (`bytecode.rs:1588`): `graphical_functions: Vec<Vec<(f64,f64)>>`; the table used is `graphical_functions[base_gf + element_offset]`, a list of `(x,y)` knots in x-ascending order.
+- **The three VM lookup functions are NOT one function — they differ in three ways** (confirmed; this is the key parity risk):
+  - `lookup` (Interpolate, `vm.rs:3055-3102`): empty→NaN; NaN index→NaN; `index < x[0]` (**strict**)→`y[0]`; `index > x[n-1]` (**strict**)→`y[n-1]`; lower-bound binary search (`while low<high { mid; if x[mid] < index {low=mid+1} else {high=mid} }`); at `i=low`: **if `approx_eq(x[i], index)`** → `y[i]`, else linear interp `slope=(y[i]-y[i-1])/(x[i]-x[i-1]); (index-x[i-1])*slope + y[i-1]`.
+  - `lookup_forward` (`vm.rs:3104-3142`): empty/NaN→NaN; `index <= x[0]` (**inclusive**)→`y[0]`; `index >= x[n-1]`→`y[n-1]`; **same lower-bound** search; return `y[low]`. **No approx_eq, no interpolation.**
+  - `lookup_backward` (`vm.rs:3144-3186`): empty/NaN→NaN; `index <= x[0]`→`y[0]`; `index >= x[n-1]`→`y[n-1]`; **upper-bound** search (`if x[mid] <= index {low=mid+1} else {high=mid}`); return `y[low-1]` (last knot with `x <= index`; for duplicate x, the LAST). **No approx_eq, no interpolation.**
+- The `context.graphical_functions[gf_idx]` access is a safe bounds-checked index in the VM; the element_offset/table_count check guarantees it's in range.
+- **Memory-layout convention (extended each phase).** Phase 1 used `[curr][next][results]`. Phase 3 appends two regions after the results region: a **GF directory** (per global table index: byte offset of its data + point count) and the **GF data** (all tables' `(x,y)` pairs as f64). Compute these region bases in `compile_simulation`, grow `pages` accordingly, and initialize them with an active `DataSection`. `results_offset` (exported) is unchanged. (Phases 4/5 append RK-scratch / temp regions similarly.)
+- `pub(crate)`/`pub` latitude per the repo owner. TDD, inline `#[cfg(test)] mod tests`, `cargo test -p simlin-engine --features file_io wasmgen`.
+
+---
+
+<!-- START_SUBCOMPONENT_A (tasks 1-3) -->
+<!-- START_TASK_1 -->
+### Task 1: Emit GF tables + directory into linear memory
+
+**Verifies:** wasm-backend.AC1.1 (prerequisite for lookups).
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/module.rs` (layout + `DataSection` emission), `src/simlin-engine/src/wasmgen/lower.rs` (carry the GF region bases in `EmitCtx`).
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+In `compile_simulation`, after computing the results region, lay out:
+- **GF data region:** concatenate every table in `root.context.graphical_functions` in order; each table's knots as consecutive f64 LE pairs `x0,y0,x1,y1,…`. Record each table's byte offset and point count.
+- **GF directory region:** an array indexed by global table index `t` (0..`graphical_functions.len()`), each entry `(data_byte_offset: i32, n_points: i32)` — so the runtime can map `base_gf + element_offset` → its table. Store as two i32 per entry (or i32 pairs).
+Emit both regions with an active `DataSection` (a data segment whose `ConstExpr` offset is the region base) so they're initialized at instantiation. Grow `pages` to cover them. Thread the directory base + data base into `EmitCtx`.
+
+(Modules in Phase 7 each have their own `ByteCodeContext.graphical_functions`; for Phase 3 only the root's tables exist. Phase 7 generalizes the directory to cover all instances' tables.)
+
+**Testing:**
+- A test that builds a model with one graphical function, compiles it, and verifies (by reading the blob's GF data region from memory after instantiation) that the table's `(x,y)` pairs are present at the directory-indicated offset with the right count.
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen::module`
+
+**Commit:** `engine: emit graphical-function tables + directory into wasm memory`
+<!-- END_TASK_1 -->
+
+<!-- START_TASK_2 -->
+### Task 2: The three lookup helper functions
+
+**Verifies:** wasm-backend.AC1.1, wasm-backend.AC7.1.
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (or a `wasmgen/lookup.rs`) — emit `lookup_interp`, `lookup_forward`, `lookup_backward`.
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+Emit three wasm helper functions, each `(data_off: i32, count: i32, index: f64) -> f64`, reading `x = f64.load[data_off + 16*k]`, `y = f64.load[data_off + 16*k + 8]` for knot `k`. Reproduce the VM functions exactly:
+- `lookup_interp`: the empty/NaN guards, **strict** edge clamps, lower-bound binary search, then at `i=low` `call approx_eq(x[i], index)` (Phase 2 helper) → if true return `y[i]`, else the linear-interp formula.
+- `lookup_forward`: NaN/empty guards, **inclusive** edge clamps, lower-bound search, return `y[low]`.
+- `lookup_backward`: NaN/empty guards, inclusive edge clamps, **upper-bound** search, return `y[low-1]`.
+Implement the binary search with i32 locals (`low`, `high`, `mid`) and `f64.load` of `x[mid]`. (`count == 0` → return NaN; `index` NaN via `f64.ne(index,index)` → NaN.)
+
+**Testing:**
+- Emit each helper over hand-placed tables in memory and assert, under DLR-FT, that it matches the VM's `lookup`/`lookup_forward`/`lookup_backward` for: below-range, above-range, exact-knot hits, between-knots, a single-point table, duplicate-x tables (Backward's last-duplicate rule), and a NaN index. Compare directly against calling the VM functions (expose them `pub(crate)` if needed).
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasm lookup_interp/forward/backward helpers matching the VM`
+<!-- END_TASK_2 -->
+
+<!-- START_TASK_3 -->
+### Task 3: `Lookup` opcode lowering + corpus parity
+
+**Verifies:** wasm-backend.AC1.1.
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (add the `Lookup` arm).
+- Modify: `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`).
+
+**Implementation:**
+Add the `Lookup { base_gf, table_count, mode }` arm. Stack has `[element_offset, index]` (top = index). Emit: pop `index` and `element_offset` into f64 locals; bounds-check `element_offset < 0.0 || element_offset >= table_count as f64` → push NaN; else compute `table_idx = base_gf + (element_offset as i32)`, load `(data_off, count)` from the GF directory at `directory_base + table_idx*8`, and `call` the mode-specific helper (`mode` is compile-time, so emit a static `call` to `lookup_interp`/`lookup_forward`/`lookup_backward`). Push the result. Match the VM's `as usize`/`as f64` cast chain for the bounds compare.
+
+Then raise the floor: corpus models using graphical functions now run through wasm. Re-observe and raise `WASM_SUPPORTED_FLOOR`.
+
+**Testing:**
+- Unit: a model with a `LOOKUP`/graphical-function variable in Interpolate, Forward, and Backward modes; assert wasm matches the VM across the table's domain (below/above/at-knot/between) and for an out-of-range `element_offset` (→NaN).
+- Corpus: at least one `simulate.rs` model that uses a graphical function now `Ran` and clears `ensure_results`.
+
+**Verification:** `cargo test -p simlin-engine --features file_io --test simulate`
+
+**Commit:** `engine: wasmgen Lookup opcode lowering + GF corpus parity`
+<!-- END_TASK_3 -->
+<!-- END_SUBCOMPONENT_A -->
+
+---
+
+## Phase 3 Done When
+- Corpus models using graphical functions match the VM through wasm.
+- Unit tests cover interpolate / forward / backward, edge clamping, exact-knot hits, duplicate-x (Backward), and out-of-range element_offset → NaN.
+- The floor gate is raised.
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_04.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_04.md
new file mode 100644
index 000000000..f43de711e
--- /dev/null
+++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_04.md
@@ -0,0 +1,118 @@
+# WebAssembly Simulation Backend — Phase 4: RK2/RK4 integration + PREVIOUS/INIT
+
+**Goal:** Generate the RK2 (Heun) and RK4 multi-stage integration loops, and serve `PREVIOUS`/`INIT` via `prev_values`/`initial_values` snapshot regions captured at the same loop points the VM uses.
+
+**Architecture:** `compile_simulation` selects the run-loop shape from `sim.specs.method` (Euler from Phase 1; RK2/RK4 added here). The RK loops mirror `vm.rs:712-838`: per-stock scratch (`saved`/`accum` in a linear-memory region), trial-point mutation of `curr`, time juggling across stages, a final flows-only re-evaluation with restored state, then the `prev_values` snapshot. `LoadPrev`/`LoadInitial` read the two snapshot regions; the `use_prev_fallback` gate is a mutable wasm global (not a time comparison). Because the emitter knows which program it is lowering, `LoadInitial`'s "during Initials read `curr`, else read `initial_values`" branch is resolved at compile time.
+
+**Tech Stack:** `wasm-encoder` (loops/blocks, mutable global, multi-region memory); the VM integration loops + `run_initials` + `LoadPrev`/`LoadInitial` arms as spec.
+
+**Scope:** Phase 4 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`.
+
+**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`).
+
+---
+
+## Acceptance Criteria Coverage
+
+### wasm-backend.AC1
+- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.)
+
+### wasm-backend.AC7
+- **wasm-backend.AC7.4 Success:** Euler, RK2, and RK4 each match the VM's saved samples (cadence and values); `PREVIOUS`/`INIT` match via the snapshot regions. *(Phase 1 established Euler; Phase 4 completes RK2/RK4 and PREVIOUS/INIT.)*
+
+---
+
+## Notes for the implementer (read first)
+
+- **Reserved globals**: `TIME_OFF=0`, `DT_OFF=1`, `INITIAL_TIME_OFF=2`, `FINAL_TIME_OFF=3` (`vm.rs:83-86`). `LoadGlobalVar` reads these absolutely (no `module_off`).
+- **Stock offsets**: the set of stock data-buffer offsets is the `AssignNext { off }` targets in `root.compiled_stocks` (Phase 1 already collects these for the Euler copy-back). They are module-relative `off` (root `module_off=0`, so absolute here). The VM's `stock_offsets` (`vm.rs:265`) are absolute and include submodule stocks via `EvalModule` recursion — Phase 4 is root-only; Phase 7 generalizes.
+- **RK4 loop** (`vm.rs:712-787`), reproduce per timestep:
+  - `saved_time = curr[TIME_OFF]`.
+  - Stage 1: `eval_step` (flows then stocks). For each stock `off`: `s1 = next[off]-curr[off]; saved[i]=curr[off]; accum[i]=s1; curr[off]=saved[i]+s1*0.5`. Then `curr[TIME_OFF]=saved_time+dt*0.5`.
+  - Stage 2: `eval_step`. `s2=next[off]-curr[off]; accum[i]+=2*s2; curr[off]=saved[i]+s2*0.5`.
+  - Stage 3: `eval_step`. `s3=next[off]-curr[off]; accum[i]+=2*s3; curr[off]=saved[i]+s3`. Then `curr[TIME_OFF]=saved_time+dt`.
+  - Stage 4: `eval_step`. `s4=next[off]-curr[off]; accum[i]+=s4; next[off]=saved[i]+accum[i]/6.0; curr[off]=saved[i]`.
+  - `curr[TIME_OFF]=saved_time; next[TIME_OFF]=saved_time+dt`.
+  - **Final flows-only re-eval** with restored `curr` (`eval(StepPart::Flows)`), so `curr`'s aux/flow slots hold time-`t` values (stages 2-4 clobbered them). **Load-bearing** for both saved output and PREVIOUS.
+  - `prev_values := curr`; `use_prev_fallback := 0`; `save_advance!`.
+- **RK2 (Heun) loop** (`vm.rs:788-838`): Stage 1 `eval_step`, `s1=next-curr; saved=curr; accum=s1; curr=saved+s1`, `curr[TIME]=saved_time+dt`. Stage 2 `eval_step`, `s2=next-curr; accum+=s2; next=saved+accum/2.0; curr=saved`. `curr[TIME]=saved_time; next[TIME]=saved_time+dt`. Final flows re-eval; `prev_values:=curr`; `use_prev_fallback:=0`; `save_advance!`.
+- **`eval_step` = flows() then stocks()**; the stocks program writes `next[off]` via `AssignNext`. So per stage: `call flows(0); call stocks(0)`; then read `next[off]`/`curr[off]`. The final re-eval calls **only** `flows(0)`.
+- **`run_initials`** (`vm.rs:1066-1135`): seed `curr[TIME/DT/INITIAL_TIME/FINAL_TIME]`, set `use_prev_fallback=1`, run initials once, then **capture `initial_values := curr` (whole `n_slots` chunk)** exactly once. (`prev_values` is not written during initials.)
+- **`prev_values`/`initial_values`** are each `n_slots` wide (`vm.rs:617-618`). Address with `module_off + off` (root: `module_off=0`).
+- **`LoadPrev { off }`** (`vm.rs:1320-1328`): pops a fallback; pushes `if use_prev_fallback { fallback } else { prev_values[module_off+off] }`. **Gate on the flag, never a `TIME==INITIAL_TIME` check** (RK moves TIME to trial points).
+- **`LoadInitial { off }`** (`vm.rs:1332-1340`): `if part==Initials { curr[module_off+off] } else { initial_values[module_off+off] }`. Since the emitter knows the program (`StepPart`), pick the branch at compile time: in the initials function emit a `curr` read, in flows/stocks emit an `initial_values` read.
+- **Memory layout additions:** `prev_values` (n_slots), `initial_values` (n_slots), and (RK only) `rk_scratch` = `saved`(n_stocks)+`accum`(n_stocks). Append after the Phase-3 GF region; grow `pages`. Add a mutable i32 global `use_prev_fallback` (init 1).
+- `pub(crate)`/`pub` latitude per the repo owner. TDD, inline `#[cfg(test)] mod tests`.
+
+---
+
+<!-- START_SUBCOMPONENT_A (tasks 1-3) -->
+<!-- START_TASK_1 -->
+### Task 1: PREVIOUS/INIT snapshot regions + LoadPrev/LoadInitial
+
+**Verifies:** wasm-backend.AC7.4 (PREVIOUS/INIT), wasm-backend.AC1.1.
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/module.rs` (layout: prev/initial regions + `use_prev_fallback` global; `run_initials` capture; Euler-loop `prev_values` snapshot), `src/simlin-engine/src/wasmgen/lower.rs` (`LoadPrev`/`LoadInitial` arms).
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+1. Reserve `initial_values` + `prev_values` regions (each `n_slots*8` bytes) and a mutable i32 global `use_prev_fallback` (init 1). Thread their bases into `EmitCtx` + the `StepPart` being emitted.
+2. In the `run`/initials sequence: after seeding globals and calling the initials function, copy the `curr` chunk into `initial_values` (an unrolled per-slot copy or a small copy loop). Leave `use_prev_fallback=1`.
+3. In the Euler loop (Phase 1's loop), after `flows`+`stocks` and before advancing time, copy `curr → prev_values` and set `use_prev_fallback=0` (mirroring `vm.rs:705-707`).
+4. `LoadPrev { off }`: pop fallback into a scratch local; `global.get use_prev_fallback`; `if` → push fallback, `else` → push `prev_values[module_off+off]` (use `select` after loading both, or an `if/else` producing f64).
+5. `LoadInitial { off }`: in the **initials** program emit `curr[module_off+off]`; in **flows/stocks** programs emit `initial_values[module_off+off]`.
+
+**Testing:**
+- Euler models using `PREVIOUS(x)` and `INIT(x)` (build via `TestProject`/XMILE), assert wasm matches the VM series. Include: `PREVIOUS` at t0 (returns the fallback), `PREVIOUS` after the first step, `INIT(x)` referenced from a flow (reads `initial_values`), and `INIT(x)` referenced from another initial equation (reads `curr` during Initials).
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen PREVIOUS/INIT snapshot regions + LoadPrev/LoadInitial`
+<!-- END_TASK_1 -->
+
+<!-- START_TASK_2 -->
+### Task 2: RK2 + RK4 run-loop generation
+
+**Verifies:** wasm-backend.AC7.4 (RK2/RK4), wasm-backend.AC1.1.
+
+**Files:**
+- Modify: `src/simlin-engine/src/wasmgen/module.rs` (method dispatch in `compile_simulation`; emit RK2/RK4 loops).
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+Remove Phase 1's `Method::Euler`-only guard; dispatch on `sim.specs.method`. Emit the RK4 and RK2 loops per the Notes above, unrolling the per-stock stage math over the compile-time-known stock offsets, using the `rk_scratch` region for `saved[i]`/`accum[i]`. Each stage does `call flows(0); call stocks(0)` then the per-stock arithmetic; the end-of-step does a **flows-only** `call flows(0)` with restored `curr`, then the `prev_values` snapshot (Task 1), then `save_advance!`. Mind the time juggling (`curr[TIME_OFF]` set to `saved_time + dt*0.5`, `+dt`, restored to `saved_time`; `next[TIME_OFF]=saved_time+dt`).
+
+**Testing:**
+- RK2 and RK4 scalar models (e.g. a logistic-growth or SIR model run under each method): assert wasm matches the VM's saved samples (cadence and values). Include a model with `PREVIOUS`/`INIT` under RK to confirm the snapshot timing (prev captured after the final flows re-eval).
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen RK2/RK4 integration loops`
+<!-- END_TASK_2 -->
+
+<!-- START_TASK_3 -->
+### Task 3: Raise floor; RK + PREVIOUS/INIT corpus parity
+
+**Verifies:** wasm-backend.AC1.1, wasm-backend.AC7.4.
+
+**Files:**
+- Modify: `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`).
+
+**Implementation:**
+Corpus models using RK2/RK4 and/or PREVIOUS/INIT now run through wasm. Re-observe the `Ran` count and raise `WASM_SUPPORTED_FLOOR`.
+
+**Testing:** the raised floor gate; note in the commit which RK/PREVIOUS models flipped from `Skipped` to `Ran`.
+
+**Verification:** `cargo test -p simlin-engine --features file_io --test simulate`
+
+**Commit:** `engine: raise wasm parity floor after RK + PREVIOUS/INIT`
+<!-- END_TASK_3 -->
+<!-- END_SUBCOMPONENT_A -->
+
+---
+
+## Phase 4 Done When
+- RK2/RK4 models and PREVIOUS/INIT models match the VM through wasm.
+- Unit tests cover each integration method and the snapshot timing (Euler post-step; RK after the end-of-step flows re-eval; initial_values once after initials; the `use_prev_fallback` gate).
+- The floor gate is raised.
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_05.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_05.md
new file mode 100644
index 000000000..3cfb22f17
--- /dev/null
+++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_05.md
@@ -0,0 +1,175 @@
+# WebAssembly Simulation Backend — Phase 5: Arrays — subscripts, iteration, reducers
+
+**Goal:** Lower the core array machinery — the view-stack opcodes, the `BeginIter…NextIterOrJump…EndIter` iteration loop, the `Array{Sum,Max,Min,Mean,Stddev,Size}` reducers, the temp-array region, and dynamic subscripting — to wasm, matching the VM element-for-element including out-of-bounds→NaN and empty-view semantics.
+
+**Architecture:** The VM resolves array access through a runtime `view_stack` of `RuntimeView`s. Because every view's geometry (base offset, dims, strides, offset, sparsity, is_temp) is known at compile time, the wasm emitter maintains a **compile-time view-descriptor stack** instead: `Push*View`/`ViewSubscript*`/`ViewRange*`/`ViewWildcard`/`ViewTranspose`/`PopView`/`DupView` push/transform/pop descriptors; `BeginIter…EndIter` becomes a wasm bounded loop with a loop-index local and compile-time stride arithmetic (or a precomputed flat-offset table for non-contiguous views); reducers loop over the top descriptor's elements; dynamic subscripts (`ViewSubscriptDynamic`/`ViewRangeDynamic`, legacy `PushSubscriptIndex`/`LoadSubscript`) carry a runtime offset + validity flag so OOB yields NaN exactly as the VM does. **Apply-to-all (A2A) variables are unrolled to scalar bytecode by the compiler — they need no array opcodes — so this phase targets array-producing builtins, reducer arguments, and explicit subscripting.**
+
+**Tech Stack:** `wasm-encoder` (loops/blocks, data segments for precomputed offset tables); `StaticArrayView`/`RuntimeView`/`DimensionInfo`/`SubdimensionRelation` (`bytecode.rs`); the VM array dispatch arms + `reduce_view` + `flat_offset` + `match_dimensions_two_pass` as spec.
+
+**Scope:** Phase 5 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`.
+
+**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`).
+
+---
+
+## Acceptance Criteria Coverage
+
+### wasm-backend.AC1
+- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` at those tests' existing tolerances.
+- **wasm-backend.AC1.2 Success:** Arrayed/subscripted models (apply-to-all, subscripts, vector operations) match the VM element-for-element. *(Phase 5 covers A2A/subscript/reducer; vector ops complete it in Phase 6.)*
+- **wasm-backend.AC1.5 Edge:** Empty-view reducers, out-of-bounds subscripts, and division-by-zero produce the same NaN / finite-`:NA:` / Inf values the VM produces. *(Phase 5 covers the empty-view-reducer (NaN-vs-0.0, and invalid-view→NaN for all reducers) and out-of-bounds-subscript portions.)*
+
+### wasm-backend.AC7
+- **wasm-backend.AC7.3 Edge:** `Mod` matches `rem_euclid`; `Max`/`Min` use `f64.max`/`f64.min` with compare-fallback. *(Reaffirmed for the array reducers `ArrayMax`/`ArrayMin`, whose empty-view→NaN semantics differ from the binary builtins.)*
+
+---
+
+## Notes for the implementer (read first)
+
+- **CRITICAL — the design's "opcode" names are `Expr` IR, not bytecode.** `Subscript`, `StaticSubscript`, `TempArray`, `TempArrayElement`, `AssignTemp` are `compiler::Expr` nodes (`compiler/expr.rs:62-88`) that codegen lowers to the view/iter opcodes below — they NEVER appear in `ByteCode.code`. Lower the actual opcodes.
+- **A2A is unrolled at compile time** (`compiler/mod.rs:1912-1990`): `c[D] = a[D]*b[D]` compiles to one independent scalar `LoadVar…AssignCurr(off+i)` per element — no array opcodes. So most arrayed models already pass via Phases 1-2. The array opcodes appear for: array-producing builtins (`AssignTemp` → `BeginIter` loop), reducer arguments that are elementwise array expressions, and reducers (`PushStaticView → Array<Reduce> → PopView`).
+- **The actual array opcodes** (`bytecode.rs`), with operands and stack effects (`bytecode.rs:1220-1365`):
+  - View construction (stack `(0,0)` unless noted): `PushVarView { base_off: u16, dim_list_id: u16 }` (full var array; dims from `ctx.dim_lists[dim_list_id]` → `(n_dims,[DimId;4])`, sizes from `ctx.dimensions[DimId].size`); `PushTempView { temp_id: u8, dim_list_id: u16 }` (is_temp); `PushStaticView { view_id: u16 }` (**the workhorse**: `ctx.static_views[view_id]` baked at compile time); `PushVarViewDirect { base_off, dim_list_id }` (raw sizes, dynamic subscript).
+  - View transform (mutate top descriptor): `ViewSubscriptConst { dim_idx: u8, index: u16 }` (drop a dim, 0-based); `ViewSubscriptDynamic { dim_idx }` (stack `(1,0)`: pop 1-based index, **OOB → view invalid**); `ViewRange { dim_idx, start, end }` ([start:end)); `ViewRangeDynamic { dim_idx }` (stack `(2,0)`: pop end then start, clamp); `ViewStarRange { dim_idx, subdim_relation_id }` (sparse via `ctx.subdim_relations[id]`); `ViewWildcard { dim_idx }` (**no-op**); `ViewTranspose {}` (reverse dims/strides/dim_ids); `PopView {}`; `DupView {}`.
+  - Temp element: `LoadTempConst { temp_id, index }` (stack `(0,1)`: push `temp_storage[temp_offsets[temp_id]+index]`); `LoadTempDynamic { temp_id }` (stack `(1,1)`: pop index).
+  - Iteration: `BeginIter { write_temp_id: u8, has_write_temp: bool }` (captures `view_stack.last()` as the iter view); `LoadIterElement {}` (`(0,1)`, element at `current` from the captured view); `LoadIterTempElement { temp_id }`; `LoadIterViewTop {}` (`(0,1)`, from `view_stack.last()` at `current`, broadcasting); `LoadIterViewAt { offset: u8 }` (`(0,1)`, from `view_stack[len-offset]`, broadcasting; **this is what `StaticSubscript`/`TempArray` lower to inside a loop**, codegen.rs:523-571); `StoreIterElement {}` (`(1,0)`, write to `temp_storage[temp_offsets[write_temp_id]+current]`); `NextIterOrJump { jump_back: i16 }` (`current+=1`; if `<size`, `pc+=jump_back`); `EndIter {}`.
+  - Reducers (operate on top view, **do not pop it**, stack `(0,1)`): `ArraySum` (empty→**0.0**); `ArrayMax`/`ArrayMin`/`ArrayMean`/`ArrayStddev` (empty→**NaN**; Stddev = population variance, divisor N, then sqrt); `ArraySize` (push `view.size()`).
+  - Legacy dynamic scalar subscript: `PushSubscriptIndex { bounds: u16 }` (`(1,0)`: pop 1-based index, append `(index-1,bounds)`; OOB → invalid); `LoadSubscript { off: u16 }` (`(0,1)`: fold accumulated indices to a flat offset, push `curr[module_off+off+flat]`; invalid → NaN). VM arms `vm.rs:1341-1366`.
+  - Broadcast iteration (also Phase 5): `BeginBroadcastIter { n_sources, dest_temp_id }`, `LoadBroadcastElement { source_idx }`, `StoreBroadcastElement {}`, `NextBroadcastOrJump { jump_back }`, `EndBroadcastIter {}`.
+- **`StaticArrayView`** (`bytecode.rs:1522-1541`): `{ base_off: u32, is_temp: bool, dims: SmallVec<[u16;4]>, strides: SmallVec<[i32;4]>, offset: u32, sparse: SmallVec<[RuntimeSparseMapping;2]>, dim_ids: SmallVec<[DimId;4]> }`. **Dense element address** for indices `[i_0..i_{n-1}]`: `base_address + offset + Σ i_k*strides[k]`, where `base_address` = `curr[base_off..]` if `!is_temp` else `temp_storage[temp_offsets[base_off]..]`. `size() = Π dims`. Sparse: a sparse dim's real index is `parent_offsets[idx]` (precomputable at compile time). See `RuntimeView::flat_offset` (`bytecode.rs:283-323`), `offset_for_iter_index` (`bytecode.rs:433-456`).
+- **`BeginIter` precompute** (`vm.rs:1876-1912`): if the iter view is `sparse.is_empty() && is_contiguous()`, per-iteration offset is `view.offset + current`; else the VM precomputes a `flat_offsets` table by walking multi-dim indices. The wasm emitter does the same at compile time: contiguous → `base+offset+i`; non-contiguous/sparse → bake a precomputed offset table (data segment) and read `offsets[i]`, **or** fully unroll for small arrays.
+- **`reduce_view`** (`vm.rs:2802-2840`): `if !view.is_valid { return NaN }`; else fold over `size()` elements (via `flat_offset` + the is_temp dual addressing). **Asymmetry to match exactly:** an *invalid* view (OOB subscript) → NaN for **all** reducers including `ArraySum`; an *empty-but-valid* view → 0.0 for `ArraySum`, NaN for Max/Min/Mean/Stddev, `0` size for `ArraySize`. OOB-subscript→NaN is pinned by `array_tests.rs:1298-1340, 2449-2575`.
+- **`temp_storage`**: a flat region of `temp_total_size` f64 (`vm.rs:584-586`); element `index` of temp `temp_id` lives at `temp_storage[temp_offsets[temp_id] + index]`. `temp_offsets`/`temp_total_size` are `ByteCodeContext` fields (compile-time).
+- **Broadcasting** in `LoadIterViewTop`/`LoadIterViewAt` (`vm.rs:1946-2182`) uses `match_dimensions_two_pass` (`dimensions.rs:729`) when the source view's dims/dim_ids differ from the iter view's; a smaller source or invalid view → NaN. Mirror this exactly.
+- **Memory layout addition:** the `temp_storage` region (`temp_total_size*8` bytes) + any precomputed iter-offset tables (data segments). Append after the Phase-4 regions; grow `pages`.
+- `pub(crate)`/`pub` latitude per the repo owner. TDD, inline `#[cfg(test)] mod tests`; `cargo test -p simlin-engine --features file_io wasmgen`.
+
+---
+
+<!-- START_SUBCOMPONENT_A (tasks 1-2) -->
+<!-- START_TASK_1 -->
+### Task 1: Compile-time view-descriptor stack + static view opcodes + temp region
+
+**Verifies:** wasm-backend.AC1.2 (prerequisite).
+
+**Files:**
+- Create: `src/simlin-engine/src/wasmgen/views.rs` (the compile-time `ViewDesc` model + address-computation helpers) — or add to `lower.rs`.
+- Modify: `wasmgen/module.rs` (temp region in the layout), `wasmgen/lower.rs` (view-stack opcode arms).
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+1. Add the `temp_storage` region to the memory layout (base + `temp_total_size*8`); thread its base into `EmitCtx`.
+2. Define a compile-time `ViewDesc` mirroring the static parts of `RuntimeView`: `{ base_off, is_temp, dims, strides, offset, sparse, dim_ids, runtime_off_local: Option<u32>, valid_local: Option<u32> }`. The last two are wasm locals introduced only by dynamic subscripts (Task 4); static views leave them `None`. Maintain a `Vec<ViewDesc>` in the emitter as the compile-time view stack.
+3. Lower the static view opcodes: `PushStaticView{view_id}` (clone `ctx.static_views[view_id]` into a `ViewDesc`), `PushVarView`/`PushTempView`/`PushVarViewDirect` (build from `dim_list_id`/`base_off`), `ViewSubscriptConst`/`ViewRange`/`ViewStarRange`/`ViewWildcard`(no-op)/`ViewTranspose` (static transforms of the top `ViewDesc` mirroring `RuntimeView::apply_*`), `PopView`/`DupView`. Provide a `view_element_addr(desc, flat_index)` emitter that produces the byte address for a flat element index (contiguous fast path `base+offset+i`; strided/sparse via precomputed table or arithmetic).
+4. Lower `LoadTempConst{temp_id,index}` (push `f64.load[temp_offsets[temp_id]*8 + index*8]`) and `LoadTempDynamic{temp_id}` (pop index → compute address → load).
+
+**Testing:**
+- Unit-test the `ViewDesc` transforms by compiling tiny models whose bytecode contains each view op (a reducer over a subscripted/transposed/sparse view) and asserting the emitted reads hit the addresses the VM's `flat_offset` computes (compare a reducer's result to the VM). Test `LoadTempConst`/`LoadTempDynamic` reads.
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen compile-time view-descriptor stack + static view ops`
+<!-- END_TASK_1 -->
+
+<!-- START_TASK_2 -->
+### Task 2: Array reducers
+
+**Verifies:** wasm-backend.AC1.2, wasm-backend.AC7.3, wasm-backend.AC1.5 (empty-view reducers: `ArraySum`→0.0, Max/Min/Mean/Stddev→NaN; invalid view→NaN for all).
+
+**Files:**
+- Modify: `wasmgen/lower.rs` (reducer arms).
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+Lower `ArraySum`/`ArrayMax`/`ArrayMin`/`ArrayMean`/`ArrayStddev`/`ArraySize` over the top `ViewDesc` (do not pop it). Emit a bounded loop (or unrolled sum for small static sizes) over the view's `size()` elements, reading each via `view_element_addr`. Match `reduce_view` (`vm.rs:2802-2840`) and the per-reducer arms (`vm.rs:2216-2309`) exactly:
+- Invalid view (the `valid_local`, when present, is 0) → push NaN for **all** reducers.
+- `ArraySum`: fold with init `0.0` (empty valid view → 0.0).
+- `ArrayMax`/`ArrayMin`: if `size()==0` → NaN, else fold with `NEG_INFINITY`/`INFINITY` and the VM's compare form (use compare-and-select to match the VM's `if a>b`/`if a<b`, not `f64.max`/`f64.min`, since these are the *reduce* path; AC7.3 fallback applies if a NaN difference surfaces).
+- `ArrayMean`: `size()==0` → NaN, else `sum/size`.
+- `ArrayStddev`: `size()==0` → NaN, else two-pass population variance (divisor `size`), then `sqrt`.
+- `ArraySize`: push `size() as f64` (always defined; 0 for empty).
+
+**Testing:**
+- Reducer parity tests vs the VM: non-empty arrays for each reducer; an empty-but-valid view (`ArraySum`→0, others→NaN, `ArraySize`→0); an invalid (OOB-subscripted) view (all→NaN). Stddev population-variance value check.
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen array reducers (Sum/Max/Min/Mean/Stddev/Size)`
+<!-- END_TASK_2 -->
+<!-- END_SUBCOMPONENT_A -->
+
+<!-- START_SUBCOMPONENT_B (tasks 3-4) -->
+<!-- START_TASK_3 -->
+### Task 3: Iteration loops (BeginIter…EndIter) + broadcast
+
+**Verifies:** wasm-backend.AC1.2.
+
+**Files:**
+- Modify: `wasmgen/lower.rs` (iteration arms).
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+Lower the iteration opcodes to a wasm bounded loop. On `BeginIter{write_temp_id,has_write_temp}`: capture the top `ViewDesc` as the iter view, compute `size()` at compile time, and open a wasm `block`/`loop` with an i32 iteration-index local (`current`) initialized to 0; record the iter context (the captured view, the write temp, the loop label depth) on an emitter-side iter stack. Within the body:
+- `LoadIterElement` → read the captured iter view at `current` (contiguous: `base+offset+current`; else precomputed offsets[current]).
+- `LoadIterTempElement{temp_id}` → `temp_storage[temp_offsets[temp_id]+current]`.
+- `LoadIterViewTop`/`LoadIterViewAt{offset}` → read `view_stack[len-1]` / `view_stack[len-offset]` at `current`, reproducing the VM's dim-matching/broadcast (`match_dimensions_two_pass`, `dimensions.rs:729`) and the "smaller source / invalid view → NaN" rules (`vm.rs:1946-2182`). When the source view's dims/dim_ids equal the iter view's, it's the simple `offset_for_iter_index(current)` read.
+- `StoreIterElement` → pop value, store to `temp_storage[temp_offsets[write_temp_id]+current]`.
+On `NextIterOrJump{jump_back}`: `current+=1`; `br_if loop` when `current<size`. `EndIter`: close the loop/block, pop the iter context.
+Also lower the `BeginBroadcastIter`/`LoadBroadcastElement`/`StoreBroadcastElement`/`NextBroadcastOrJump`/`EndBroadcastIter` family the same way, mirroring their VM arms.
+
+(Note: `jump_back` is a bytecode PC delta; the wasm structured loop does not need it — the emitter detects the loop body span between `BeginIter` and `NextIterOrJump` and emits a structured `loop`. Confirm the codegen always emits well-nested `BeginIter…NextIterOrJump…EndIter` so structured lowering is valid; the example at codegen.rs:1183-1378 shows the canonical shape.)
+
+**Testing:**
+- `SUM(a[*]*b[*])`-style models (elementwise product hoisted into an `AssignTemp` `BeginIter` loop then reduced): assert wasm matches the VM element-for-element. A broadcast case (source dims ≠ iter dims). A case where the source is smaller than the iter view (→NaN elements).
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen BeginIter/broadcast iteration loops`
+<!-- END_TASK_3 -->
+
+<!-- START_TASK_4 -->
+### Task 4: Dynamic subscripts + OOB→NaN
+
+**Verifies:** wasm-backend.AC1.2, wasm-backend.AC1.5 (out-of-bounds subscripts → NaN, matching the VM).
+
+**Files:**
+- Modify: `wasmgen/lower.rs` (dynamic-subscript arms; extend `ViewDesc` with runtime offset/validity).
+- Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+- `ViewSubscriptDynamic{dim_idx}`: pop the 1-based runtime index; bounds-check against `dims[dim_idx]`; on OOB set the descriptor's `valid_local` (a wasm i32 local) to 0; otherwise fold `(index-1)*strides[dim_idx]` into the descriptor's `runtime_off_local`. Subsequent reads add `runtime_off_local` to the element address and, if `valid_local==0`, yield NaN. `ViewRangeDynamic{dim_idx}`: pop end then start, clamp to `[0,dims)` (empty range → 0-size dim, stays valid) per `apply_range_checked`.
+- Legacy `PushSubscriptIndex{bounds}` / `LoadSubscript{off}` (`vm.rs:1341-1366`): maintain an emitter-side accumulator of `(runtime_index, bounds)` + a validity local; `PushSubscriptIndex` pops a 1-based index, range-checks against `bounds` (OOB → invalid), and accumulates; `LoadSubscript` folds the accumulated indices into a flat offset, and pushes `curr[module_off+off+flat]` unless invalid → NaN.
+
+**Testing:**
+- Models with a runtime/dynamic subscript `arr[i]` (i from an expression) in-range and out-of-range (→NaN); a dynamic range; assert wasm matches the VM (including the OOB→NaN cases pinned by `array_tests.rs`).
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen dynamic subscripts with OOB->NaN`
+<!-- END_TASK_4 -->
+<!-- END_SUBCOMPONENT_B -->
+
+<!-- START_TASK_5 -->
+### Task 5: Raise floor; arrayed corpus parity
+
+**Verifies:** wasm-backend.AC1.1, wasm-backend.AC1.2.
+
+**Files:**
+- Modify: `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`).
+
+**Implementation:**
+Arrayed (A2A/subscript/reducer) corpus models now run through wasm. Re-observe the `Ran` count and raise `WASM_SUPPORTED_FLOOR`. (Models using vector ops/allocation remain `Skipped` until Phase 6; module-bearing models until Phase 7.)
+
+**Testing:** the raised floor gate; note which arrayed models flipped to `Ran`.
+
+**Verification:** `cargo test -p simlin-engine --features file_io --test simulate`
+
+**Commit:** `engine: raise wasm parity floor after array core`
+<!-- END_TASK_5 -->
+
+---
+
+## Phase 5 Done When
+- Arrayed (A2A/subscript/reducer) corpus models match the VM element-for-element.
+- Unit tests cover subscript OOB→NaN, broadcast, each reducer (incl. empty-valid vs invalid-view asymmetry), and the iteration loop.
+- The floor gate is raised.
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_06.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_06.md
new file mode 100644
index 000000000..c49fcc850
--- /dev/null
+++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_06.md
@@ -0,0 +1,148 @@
+# WebAssembly Simulation Backend — Phase 6: Arrays — vector operations and allocation
+
+**Goal:** Lower the helper-heavy array builtins — `VectorSelect`, `VectorElmMap`, `VectorSortOrder`, `Rank`, `LookupArray`, and the `AllocateAvailable`/`AllocateByPriority` market-clearing allocators — to wasm helpers that match the VM (and its sibling modules `vm_vector_elm_map.rs`/`vm_vector_sort_order.rs`/`alloc.rs`) element-for-element.
+
+**Architecture:** Each opcode reads its inputs from the compile-time view stack (Phase 5) and the operand stack and writes its result array to its `write_temp_id` region of `temp_storage` (except `VectorSelect`, which reduces to one scalar). Each is emitted as a self-contained wasm helper mirroring the VM. Sorting (`VectorSortOrder`/`Rank`) uses a stable comparison sort (NaN-as-Equal to preserve stability). Allocation reuses Phase 2's `exp` helper for the open-coded `erfc`/`normal_cdf` and runs the VM's bisection over the per-requester allocation curves.
+
+**Tech Stack:** Phase 5 view/temp infrastructure; Phase 3 `lookup_*` helpers (for `LookupArray`); Phase 2 `approx_eq`/`is_truthy`/`exp`; the VM dispatch arms + sibling modules + `alloc.rs` as spec.
+
+**Scope:** Phase 6 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`.
+
+**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`).
+
+---
+
+## Acceptance Criteria Coverage
+
+### wasm-backend.AC1
+- **wasm-backend.AC1.2 Success:** Arrayed/subscripted models (apply-to-all, subscripts, vector operations) match the VM element-for-element.
+
+### wasm-backend.AC7
+- **wasm-backend.AC7.1 Success:** … the allocation `erfc` [is] open-coded as [a] self-contained wasm helper function (range reduction + polynomial). Each open-coded helper has a unit test comparing its output to Rust `f64` over a sampled range.
+
+---
+
+## Notes for the implementer (read first)
+
+- **Opcodes** (`bytecode.rs`), inputs from the view stack (top = last) and operand stack; outputs to `temp_storage[temp_offsets[write_temp_id]+i]`:
+  - `VectorSelect {}` (`vm.rs:2444-2502`): pop `action` (`.round() as i32`), pop `max_value`; views `expr_view=top`, `sel_view=top-1`. `size = min(sel.size, expr.size)`, independent index odometers. For each i: if `is_truthy(sel_val)` collect `expr_val`. Empty selection → `max_value`. Else by `action`: `1`=min, `2`=mean(sum/len), `3`=max, `4`=product, `_`=sum. Push the single scalar. (Invalid view → push one NaN.)
+  - `VectorElmMap { write_temp_id, full_source_len }` (`vm_vector_elm_map.rs:33-116`): `source_view=top-1`, `offset_view=top`. For each i in `offset_view.size()`: `base_i` = 0 if source is the full contiguous array else projected from carried axes; `flat_i = base_i + round(offset_val)`; result = `NaN` if `offset_val.is_nan()` or `flat_i<0 || flat_i>=full_source_len`, else `source[flat_i]` over full row-major storage. **No modulo.** Write `temp[i]`.
+  - `VectorSortOrder { write_temp_id }` (`vm_vector_sort_order.rs:49-101`): `input_view=top`; pop `direction` (`.round() as i32`). Innermost dim is the sorted axis (`inner = dims[n_dims-1]`, or whole view if scalar). Per row of `inner` elements: build `(value, local_idx 0..inner)`, **stable** sort (asc if `direction==1` else desc), write `temp[row_base + rank] = local_idx as f64` (**0-based in-row source index** at the sorted position).
+  - `Rank { write_temp_id }` (`vm.rs:2540-2584`): `input_view=top`; pop `direction`. Over the **whole view** collect `(value, orig_idx 0..size)` (orig_idx = sequential iteration index), **stable** sort, write `temp[orig_idx] = (rank_0based + 1) as f64` (**1-based**, indexed by original position).
+  - `LookupArray { base_gf, table_count, mode, write_temp_id }` (`vm.rs:2586-2629`): pop `index`; `input_view=top`. For each i in `view.size()`: `elem_off = view.flat_offset(indices)`; if `elem_off >= table_count` → NaN, else dispatch `mode` on `graphical_functions[base_gf+elem_off]` at `index` (reuse Phase 3 `lookup_interp/forward/backward`); write `temp[i]` (sequential index). 
+  - `AllocateAvailable { write_temp_id }` (`vm.rs:2631-2721`): pop `avail`; `profile_view=top`, `requests_view=top-1`. Collect `requests` (n), `pp_values`; `pp_cols = if !pp_values.is_empty() && n>0 && pp_size%n==0 { pp_size/n } else { 4 }`; build per-requester `profiles[(ptype,ppriority,pwidth,pextra)]` reading `pp_values[i*pp_cols + {0,1,2,3}]` with defaults `(0.0, 0.0, 1.0, 0.0)` when out of range; `allocate_available(&requests,&profiles,avail)` → write temp.
+  - `AllocateByPriority { write_temp_id }` (`vm.rs:2723-2794`): pop `supply` then `width`; `priority_view=top`, `requests_view=top-1`. Build rectangular `profiles[(1.0, priorities[i] or 0.0, width, 0.0)]`; `allocate_available(&requests,&profiles,supply)` → write temp.
+- **Invalid input view → `fill_temp_nan`** (`vm.rs:2866-2881`): fill the whole destination temp region with NaN (VectorSelect instead pushes one NaN). The NaN here is IEEE NaN, never `crate::float::NA`.
+- **`alloc.rs` (verbatim, port bit-faithfully):**
+  - `erfc_approx(z)` (`alloc.rs:8-21`): for `z<0` return `2.0 - erfc_approx(-z)`; else `t=1/(1+0.3275911*z)`; `(((((1.061405429*t + -1.453152027)*t) + 1.421413741)*t + -0.284496736)*t + 0.254829592) * t * (-z*z).exp()`. (Abramowitz-Stegun 26.2.17; uses Phase 2 `exp`.)
+  - `normal_cdf(x)` (`alloc.rs:25-30`): `if x.is_nan() {NaN} else 0.5 * erfc_approx(-x / SQRT_2)`.
+  - `alloc_curve(p, request, ptype, ppriority, pwidth, pextra)` (`alloc.rs:40-129`): `if request<=0 {0.0}`; `fraction` by `ptype % 10`: 0 fixed (`p<=ppriority?1:0`), 1 rectangular, 2 triangular, 3 normal (`normal_cdf((ppriority-p)/pwidth)`), 4 exponential, 5 CES, `_` fixed (exact formulas in the investigator report / `alloc.rs:48-126`). Then `alloc = request*fraction; if ptype>=10 { alloc.floor() } else alloc`.
+  - `allocate_available(requests, profiles, avail)` (`alloc.rs:136-199`): `n=len`; if 0 → empty. `total_demand = Σ requests where r>0`; if `avail>=total_demand` → `requests.map(|r| r.max(0))`; if `avail<=0` → zeros. Else compute search range `[p_min,p_max]` from profiles (per-type `spread`), then **bisection up to 100 iterations**: `mid=(lo+hi)/2; total=Σ alloc_curve(mid, ...); if total<avail {hi=mid} else {lo=mid}; break when |hi-lo| < 1e-14*(1+|hi|)`. Return `alloc_curve(p_star=(lo+hi)/2, ...)` per requester.
+- Shared primitives (Phase 5): `increment_indices`, `flat_offset`, `read_view_element`, `temp_offsets`; `is_truthy`/`approx_eq` (Phase 2). **Sorting:** emit a **stable** comparison sort (e.g. insertion sort over `(value, idx)` pairs in a scratch region) treating NaN comparisons as `Equal` to preserve input order (matching the VM's `partial_cmp(..).unwrap_or(Equal)` on a stable `sort_by`).
+- **Memory layout additions:** scratch regions for sorting (`(value,idx)` pairs, sized to the largest view) and for allocation (`requests`, `profiles`). Append after Phase-5 regions; grow `pages`.
+- `pub(crate)`/`pub` latitude per the repo owner. TDD, inline `#[cfg(test)] mod tests`; `cargo test -p simlin-engine --features file_io wasmgen`.
+
+---
+
+<!-- START_SUBCOMPONENT_A (tasks 1-3) -->
+<!-- START_TASK_1 -->
+### Task 1: VectorSelect + VectorElmMap
+
+**Verifies:** wasm-backend.AC1.2.
+
+**Files:** Modify `wasmgen/lower.rs` (+ a `wasmgen/vector.rs` helper module if preferred). Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+- `VectorSelect`: pop `action`/`max_value`; iterate `min(sel.size, expr.size)` with two index odometers; accumulate selected `expr` values where `is_truthy(sel)` (the Phase 2 helper); emit the empty→`max_value` and the action-dispatch (min/mean/max/product/sum) reductions; push one scalar. Invalid view → push NaN.
+- `VectorElmMap`: emit the per-element `source[base_i + round(offset[i])]` computation with the `full_source_len` bound (OOB/NaN→NaN, no modulo), reproducing `vm_vector_elm_map.rs` (including the `source_is_full_array` base_i=0 fast path vs the carried-axis projection). Write the result temp; `fill_temp_nan` on invalid input.
+
+**Testing:** parity vs the VM for VectorSelect (each action, empty selection→max_value, NaN-in-mask) and VectorElmMap (in-range, OOB→NaN, NaN offset→NaN, sliced source base_i).
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen VectorSelect + VectorElmMap`
+<!-- END_TASK_1 -->
+
+<!-- START_TASK_2 -->
+### Task 2: VectorSortOrder + Rank (stable sort)
+
+**Verifies:** wasm-backend.AC1.2.
+
+**Files:** Modify `wasmgen/lower.rs`/`wasmgen/vector.rs`; add a sort scratch region. Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+Emit a stable sort helper over `(value, idx)` pairs in a scratch region, NaN-as-Equal. `VectorSortOrder`: per innermost-dim row, sort the row's `(value, local_idx)` pairs (asc/desc by `direction`), write `temp[row_base+rank] = local_idx` (0-based). `Rank`: over the whole view, sort `(value, orig_idx)`, write `temp[orig_idx] = rank+1` (1-based). Match `vm_vector_sort_order.rs` and `vm.rs:2540-2584` exactly, including the `direction` semantics and the indexing (sorted-position vs original-position).
+
+**Testing:** parity vs the VM for ascending/descending; tie stability (equal values keep input order); multi-row VectorSortOrder; whole-view Rank; a NaN element (compares Equal → stable).
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen VectorSortOrder + Rank with stable sort`
+<!-- END_TASK_2 -->
+
+<!-- START_TASK_3 -->
+### Task 3: LookupArray (per-element arrayed GF)
+
+**Verifies:** wasm-backend.AC1.2.
+
+**Files:** Modify `wasmgen/lower.rs`. Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+Lower `LookupArray { base_gf, table_count, mode, write_temp_id }`: pop the shared `index`; for each element i of the input view, compute `elem_off = flat_offset(indices)`; if `elem_off >= table_count` → NaN, else look up the GF directory at `base_gf+elem_off` and `call` the Phase 3 `lookup_interp/forward/backward` (per `mode`) at `index`; write `temp[i]` (sequential index). `fill_temp_nan` on invalid view.
+
+**Testing:** parity vs the VM for an arrayed graphical function across its domain, including an out-of-range element_offset element (→NaN) and all three modes.
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen LookupArray (per-element arrayed GF)`
+<!-- END_TASK_3 -->
+<!-- END_SUBCOMPONENT_A -->
+
+<!-- START_SUBCOMPONENT_B (tasks 4) -->
+<!-- START_TASK_4 -->
+### Task 4: Allocation — erfc/normal_cdf/alloc_curve/allocate_available + the two opcodes
+
+**Verifies:** wasm-backend.AC1.2, wasm-backend.AC7.1.
+
+**Files:** Create `src/simlin-engine/src/wasmgen/alloc.rs` (the allocation helper emitters); modify `wasmgen/lower.rs` (the two opcode arms); add allocation scratch regions. Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+Emit wasm helpers mirroring `alloc.rs` verbatim:
+- `erfc_approx(z)` (using the Phase 2 `exp` helper) and `normal_cdf(x)` with the exact constants/Horner order above.
+- `alloc_curve(p, request, ptype, ppriority, pwidth, pextra)` with all six `ptype % 10` branches and the `ptype >= 10` floor flag.
+- `allocate_available(requests_ptr, n, profiles_ptr, avail, out_ptr)` (operating over scratch memory arrays): the `total_demand` short-circuits, the search-range computation, the 100-iteration bisection with the `1e-14*(1+|hi|)` relative convergence break, and the final per-requester `alloc_curve(p_star, ...)`.
+Lower `AllocateAvailable`/`AllocateByPriority`: collect `requests`/`profiles` from the views into scratch arrays (with the `pp_cols`/default logic for AllocateAvailable, the rectangular-profile synthesis for AllocateByPriority), pop the scalars, call `allocate_available`, write results to the `write_temp_id` region. `fill_temp_nan` on invalid input views.
+
+**Testing:**
+- AC7.1: unit-test the emitted `erfc_approx`/`normal_cdf` against Rust `alloc::erfc_approx`/`normal_cdf` over a sampled range (expose them `pub(crate)` if needed); document the tolerance.
+- `alloc_curve` parity for each of the 6 profile types + the `>=10` floor.
+- `AllocateAvailable`/`AllocateByPriority` end-to-end parity vs the VM: `avail >= total_demand` (full grant), `avail <= 0` (zeros), and the partial-allocation bisection case across profile types.
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen allocation (erfc/normal_cdf/alloc_curve/allocate_available)`
+<!-- END_TASK_4 -->
+<!-- END_SUBCOMPONENT_B -->
+
+<!-- START_TASK_5 -->
+### Task 5: Raise floor; vector-op/allocation corpus parity
+
+**Verifies:** wasm-backend.AC1.2.
+
+**Files:** Modify `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`).
+
+**Implementation:** Corpus models using vector ops/allocation now run through wasm. Re-observe the `Ran` count and raise `WASM_SUPPORTED_FLOOR`. (Module-bearing models remain `Skipped` until Phase 7.)
+
+**Testing:** the raised floor gate.
+
+**Verification:** `cargo test -p simlin-engine --features file_io --test simulate`
+
+**Commit:** `engine: raise wasm parity floor after vector ops + allocation`
+<!-- END_TASK_5 -->
+
+---
+
+## Phase 6 Done When
+- Corpus models using vector ops/allocation match the VM element-for-element.
+- Unit tests cover each op including the allocation bisection and the `erfc`/`normal_cdf` accuracy vs Rust `f64`.
+- The floor gate is raised.
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_07.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_07.md
new file mode 100644
index 000000000..cd120dd25
--- /dev/null
+++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_07.md
@@ -0,0 +1,152 @@
+# WebAssembly Simulation Backend — Phase 7: Modules + host interface (FFI, layout, override/reset)
+
+**Goal:** Run submodels in the blob (`EvalModule`/`LoadModuleInput`), give the blob `set_value`/`reset` override semantics matching the VM, and surface the blob plus its name→offset layout through a libsimlin FFI so a host can drive the model and read one variable's series by name.
+
+**Architecture:** Each unique module instance `(model, input_set)` in `CompiledSimulation.modules` becomes its own set of three wasm functions (initials/flows/stocks), each taking a runtime `module_off: i32` (a shared `CompiledModule` may run at several base offsets) plus its `n_inputs` f64 inputs as parameters. `EvalModule { id }` resolves the declaration to a child `ModuleKey`, computes `child_module_off = module_off + decl.off`, and emits a `call` to the child's function for the current phase, passing the popped inputs as args; `LoadModuleInput { input }` reads the corresponding input parameter. Overridable constants are sourced from a mutable constants region (initialized to defaults) so an exported `set_value(offset, val)` + `reset()` reproduce the VM's "override a constant, reset, re-run from t0." The `WasmLayout` (already built in Phase 1) is serialized and returned alongside the blob through `simlin_model_compile_to_wasm`, and a host reads one variable's `n_chunks`-long series by striding the results region.
+
+**Tech Stack:** `wasm-encoder` (multi-function modules, `call`, mutable globals/regions, exported functions); the VM `EvalModule`/`LoadModuleInput`/`set_value`/`reset` as spec; libsimlin's malloc-return convention.
+
+**Scope:** Phase 7 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`.
+
+**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`).
+
+---
+
+## Acceptance Criteria Coverage
+
+### wasm-backend.AC1
+- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` at those tests' existing tolerances.
+
+### wasm-backend.AC4
+- **wasm-backend.AC4.1 Success:** The blob exports `n_slots`/`n_chunks`/`results_offset` and writes step-major snapshots; a host locates and strides the results with no external metadata.
+- **wasm-backend.AC4.2 Success:** Reading one variable's series via the name→offset layout copies only that variable's `n_chunks` values (never the whole `n_chunks × n_slots` slab) and equals the VM's series for that variable.
+
+### wasm-backend.AC5
+- **wasm-backend.AC5.1 Success:** Overriding a constant via `set_value`, then `reset`, then `run`, yields the same series the VM produces under the same override (matching `simlin_sim_set_value`/`reset` semantics).
+- **wasm-backend.AC5.2 Success:** `reset` with no override restores the compiled-default results.
+
+### wasm-backend.AC6
+- **wasm-backend.AC6.1 Success:** `simlin_model_compile_to_wasm` returns a valid wasm blob plus the name→offset layout via the malloc-return convention; both buffers are freeable with `simlin_free`; it works before any `SimlinSim` exists.
+- **wasm-backend.AC6.2 Failure:** A model that cannot be compiled to wasm surfaces a `SimlinError` rather than panicking across the FFI boundary.
+
+---
+
+## Notes for the implementer (read first)
+
+- **Opcodes** (`bytecode.rs`): `LoadModuleInput { input: ModuleInputOffset(u16) }` (`vm.rs:1376-1378`: push `module_inputs[input]`); `EvalModule { id: ModuleId(u16), n_inputs: u8 }` (`vm.rs:1379-1443`). **There is no `ModuleInput` opcode** (`Expr::ModuleInput` lowers to `LoadModuleInput`). `EvalModule` stack effect `(n_inputs, 0)`; `LoadModuleInput` `(0,1)`.
+- **`ModuleDeclaration`** (`bytecode.rs:1505-1514`), the element type of `ByteCodeContext.modules`: `{ model_name: Ident<Canonical>, input_set: BTreeSet<Ident<Canonical>>, off: usize }`.
+- **`EvalModule` VM dispatch** (`vm.rs:1379-1443`): pop `n_inputs` values into `module_inputs` **in reverse** (`for j in (0..n_inputs).rev() { module_inputs[j] = pop() }`); `child_module_off = module_off + context.modules[id].off`; resolve the child via `make_module_key(&decl.model_name, &decl.input_set)` (`vm.rs:27-32`) → the child `CompiledModule`; recurse phase-aware (Initials→child initials, Flows/Stocks→child `eval` with `part`). **The wasm backend does not need `CompiledSlicedSimulation`/`child_targets`** — resolve `EvalModule` to the child's wasm function index directly from `CompiledSimulation.modules` keyed by `make_module_key`.
+- **Single slab**: the root `n_slots` includes all nested module slots; a child reads/writes at `module_off + off` (`LoadVar`/`AssignCurr`/`AssignNext`), while `LoadGlobalVar` is absolute (TIME/DT/INITIAL_TIME/FINAL_TIME). This is the addressing the emitter has used since Phase 1 (`module_off` is a function parameter).
+- **Inputs as wasm params (clean approach):** each instance's three functions have signature `(module_off: i32, in_0: f64, …, in_{k-1}: f64) -> ()` where `k = n_inputs` for that `(model, input_set)`. `LoadModuleInput { input }` → `local.get(input + 1)` (param 0 is `module_off`). `EvalModule { id, n_inputs }`: pop the `n_inputs` operands into scratch locals (reverse, matching the VM), then push `child_module_off` (= `local.get(module_off) + decl.off`) followed by the input locals in order, and `call` the child's function for the current `StepPart`. (The root's functions are `(i32)->()`, 0 inputs.) This avoids any module-inputs memory scratch.
+- **Phase-aware child function resolution:** build a map `(ModuleKey, StepPart) → wasm function index` during assembly; an `EvalModule` site in the initials/flows/stocks program calls the child's initials/flows/stocks function respectively (the `StepPart` is compile-time per program). The module instantiation graph is acyclic, so the wasm call graph is well-founded.
+- **Per-instance side tables:** generalize Phase 3's GF directory and Phase 5's temp region to **per-instance** `ByteCodeContext`s — each instance has its own `graphical_functions`/`static_views`/`temp_offsets`/`temp_total_size`. The temp regions can be disjoint per instance (sum the sizes) or shared with care; disjoint is simplest. Generalize Phase 4's stock-offset collection to recurse through `EvalModule` declarations adding `decl.off` cumulatively (mirroring `collect_stock_offsets`, `vm.rs:512-543`) so the RK stage math covers nested stocks.
+- **`set_value`/`reset`** (`vm.rs:976-1062`): `set_value(off, v)` is valid only when `is_constant_offset(off)` (`vm.rs:167`) — an offset with an `AssignConstCurr` in the **flows** phase (`cached_constant_info`, `collect_constant_info` `vm.rs:426-507`). The VM mutates the bytecode literal(s) at those locations (so flows re-assigns the override each step) and the override **persists across `reset`** (which only re-runs initials). `clear_values` restores defaults. The libsimlin wrappers `simlin_sim_set_value`/`simlin_sim_reset`/`simlin_sim_clear_values` (`simulation.rs:303-556`) record overrides in `SimState.overrides` and re-apply on reset.
+- **`Results` has no `get_series`**; by-name retrieval strides the slab: `Vm::get_series(ident)` (`vm.rs:1140-1160`) does `off = offsets[ident]; for c in 0..n_steps { data[c*n_slots + off] }`. The host mirrors this over the blob's results region using `WasmLayout.var_offsets` — copying only `n_chunks` values.
+- **libsimlin** (`src/libsimlin/`): `write_bytes_to_ffi_output` (`model.rs:65-86`), `simlin_malloc`/`simlin_free` (`memory.rs:30-71`), the `out_error: *mut *mut SimlinError` + `clear_out_error`/`store_error`/`store_anyhow_error` convention (`lib.rs:384-421`), `require_model` (`lib.rs:512`). The current POC `simlin_model_compile_to_wasm` (`model.rs:101-149`) returns only the blob; this phase changes it to also return the serialized layout.
+- **Memory-layout addition:** a constants override region (a mutable region holding, per overridable offset, its current value, initialized to the compiled default). Append to the layout; grow `pages`.
+- `pub(crate)`/`pub` latitude per the repo owner. TDD; corpus tests gated on `file_io`.
+
+---
+
+<!-- START_SUBCOMPONENT_A (tasks 1) -->
+<!-- START_TASK_1 -->
+### Task 1: Per-instance module functions + EvalModule/LoadModuleInput
+
+**Verifies:** wasm-backend.AC1.1.
+
+**Files:** Modify `wasmgen/module.rs` (emit one function-triple per instance; the `(ModuleKey,StepPart)→fn index` map; per-instance GF directory + temp regions; recursive stock-offset collection), `wasmgen/lower.rs` (`EvalModule`/`LoadModuleInput` arms). Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+1. Enumerate `sim.modules` (every `(model, input_set)` instance). For each, emit initials/flows/stocks functions with signature `(module_off: i32, in_0..in_{k-1}: f64) -> ()` (k = the instance's module-input count). Record `(ModuleKey, StepPart) → fn_index`.
+2. `LoadModuleInput { input }` → `local.get(input + 1)`.
+3. `EvalModule { id, n_inputs }`: pop the `n_inputs` operands into scratch f64 locals (reverse); resolve `decl = current_instance.context.modules[id]`, `child_key = make_module_key(&decl.model_name, &decl.input_set)`; push `child_module_off = (local.get module_off) + (decl.off as i32)`; push the input locals in order; `call (child_key, current_part)`.
+4. The root `run` calls the root's initials/flows/stocks with `module_off=0` and no inputs. Generalize GF directory + temp regions to per-instance, and the RK stock-offset list to recurse through `EvalModule` (adding `decl.off`).
+
+**Testing:** module-bearing models (a model instantiating a submodel; SMOOTH/DELAY stdlib macros expand to implicit module stocks — exercise one) and the same `(model,input_set)` instantiated at two offsets: assert wasm matches the VM. Confirm `LoadModuleInput` reads the right input.
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen per-instance module functions (EvalModule/LoadModuleInput)`
+<!-- END_TASK_1 -->
+<!-- END_SUBCOMPONENT_A -->
+
+<!-- START_SUBCOMPONENT_B (tasks 2-4) -->
+<!-- START_TASK_2 -->
+### Task 2: `set_value` / `reset` override mechanism
+
+**Verifies:** wasm-backend.AC5.1, wasm-backend.AC5.2.
+
+**Files:** Modify `wasmgen/module.rs` (constants region + `set_value`/`reset` exports), `wasmgen/lower.rs` (source overridable constants from the region). Test: inline `#[cfg(test)] mod tests`.
+
+**Implementation:**
+1. Identify the **full set** of overridable offsets: `CompiledSimulation::is_constant_offset(off)` (`vm.rs:167`, `pub fn`) only answers one offset at a time, and the set itself lives in the private `cached_constant_info` map — so expose its keys (widen `cached_constant_info`'s visibility, or add a `pub(crate) fn constant_offsets(&self) -> impl Iterator<Item = usize>` accessor on `CompiledSimulation`) and initialize the constants region from that key set. Add a constants override region holding each overridable offset's current value, initialized (data segment or init code) to the compiled-default literal.
+2. Redirect the value source for the overridable constant-assignment pattern (`LoadConstant{id}; AssignCurr{off}` where `off` is a constant offset, the un-fused form of `AssignConstCurr`): instead of `f64.const literal`, emit `f64.load const_region[off]`. This makes the override take effect every flows step, exactly like the VM mutating the literal.
+3. Export `set_value(offset: i32, val: f64) -> i32` (return 0 ok / nonzero if `offset` is not overridable — validate against the overridable set) writing `const_region[offset]=val`; and `reset()` resetting run state (chunk/step counters, `use_prev_fallback=1`, `did_initials`-equivalent) **without** clearing the constants region (overrides persist across reset, matching the VM). Optionally `clear_values()` to restore defaults. The next `run` re-runs initials and the loop, picking up the override.
+
+**Testing:**
+- AC5.1: `set_value(off_of_a_constant, v); reset(); run();` and compare the full series to the VM run with `vm.set_value(ident, v)` under the same override.
+- AC5.2: `reset(); run()` with no override reproduces the compiled-default series.
+- `set_value` on a non-constant offset returns the error code (no write).
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen`
+
+**Commit:** `engine: wasmgen blob set_value/reset override semantics`
+<!-- END_TASK_2 -->
+
+<!-- START_TASK_3 -->
+### Task 3: libsimlin FFI returning blob + layout; by-name series retrieval
+
+**Verifies:** wasm-backend.AC4.1, wasm-backend.AC4.2, wasm-backend.AC6.1, wasm-backend.AC6.2.
+
+**Files:** Modify `src/libsimlin/src/model.rs` (`simlin_model_compile_to_wasm` to also return the serialized layout); add a `WasmLayout` serializer (in `wasmgen` or libsimlin). Test: a Rust integration test in `src/libsimlin/` (and/or a `wasmgen` test for the by-name read).
+
+**Implementation:**
+1. Add a `WasmLayout` serializer: a length-prefixed encoding — `n_slots`, `n_chunks`, `results_offset` (as u64 LE), then `count` (u32), then per entry `name_len` (u32) + UTF-8 name bytes + `offset` (u64). (Avoids a protobuf dependency; matches the libsimlin "Pattern A" malloc-return convention.)
+2. Change `simlin_model_compile_to_wasm` to:
+   ```rust
+   pub unsafe extern "C" fn simlin_model_compile_to_wasm(
+       model: *mut SimlinModel,
+       out_wasm: *mut *mut u8, out_wasm_len: *mut usize,
+       out_layout: *mut *mut u8, out_layout_len: *mut usize,
+       out_error: *mut *mut SimlinError,
+   )
+   ```
+   Build the `CompiledSimulation` from the model's datamodel (sync + `compile_project_incremental`), call `compile_simulation` to get the `WasmArtifact`, then `write_bytes_to_ffi_output` the `artifact.wasm` and the serialized `artifact.layout` into the two buffer pairs. Follow the FFI prologue (`clear_out_error`, null-checks, `require_model`). On any compile/codegen error, `store_error`/`store_anyhow_error` (AC6.2 — never panic across the boundary); the function works before any `SimlinSim` exists (it takes a `SimlinModel`).
+3. A host reads one variable's series by name: locate `off` from the layout, then for `c in 0..n_chunks` read `results[results_offset + (c*n_slots + off)*8]` — copying only `n_chunks` values.
+
+**Testing:**
+- AC6.1: FFI test — compile a model to wasm + layout, assert the wasm validates, the layout deserializes to the expected geometry + name→offset map, and both buffers free with `simlin_free`. Works with only a `SimlinModel` (no `SimlinSim`).
+- AC6.2: a model that fails codegen (an unsupported construct, if any remain) surfaces a `SimlinError` (the out_error is set), no panic.
+- AC4.2: a `wasmgen`/libsimlin test that reads one variable's `n_chunks`-long series via the layout (striding the slab) and asserts it equals the VM's `get_series` for that variable, and that it copied only `n_chunks` values (not the whole slab).
+- AC4.1 (reaffirm): geometry read from the exported globals matches the layout.
+
+**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` and `cargo test -p libsimlin`
+
+**Commit:** `libsimlin: simlin_model_compile_to_wasm returns blob + WasmLayout`
+<!-- END_TASK_3 -->
+
+<!-- START_TASK_4 -->
+### Task 4: Raise floor; module + systems-format + metasd corpus parity
+
+**Verifies:** wasm-backend.AC1.1.
+
+**Files:** Modify `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`); add the wasm hook to `src/simlin-engine/tests/simulate_systems.rs`.
+
+**Implementation:**
+Module-bearing models (including SMOOTH/DELAY stdlib expansions) now run through wasm. Add the `ensure_wasm_matches` hook to `simulate_systems.rs` (systems-format models become stdlib-module instances, so they exercise modules). Re-observe the `Ran` counts and raise `WASM_SUPPORTED_FLOOR` (and add a systems floor if appropriate). Heavy/`#[ignore]` models still defer their wasm twins to Phase 8.
+
+**Testing:** the raised floor gates (simulate + simulate_systems); note which module/systems models flipped to `Ran`.
+
+**Verification:** `cargo test -p simlin-engine --features file_io --test simulate` and `--test simulate_systems`
+
+**Commit:** `engine: raise wasm parity floor after modules + systems format`
+<!-- END_TASK_4 -->
+<!-- END_SUBCOMPONENT_B -->
+
+---
+
+## Phase 7 Done When
+- Module-bearing, systems-format, and metasd-simulation models match the VM through wasm.
+- Override-then-reset-then-run matches the VM under the same override; reset with no override restores defaults.
+- A by-name series read copies only `n_chunks` values and equals the VM's series; the FFI returns blob + layout (both `simlin_free`-able) and surfaces errors without panicking.
+- The floor gate(s) are raised.
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_08.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_08.md
new file mode 100644
index 000000000..739db2cbd
--- /dev/null
+++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_08.md
@@ -0,0 +1,100 @@
+# WebAssembly Simulation Backend — Phase 8: Full-corpus parity + C-LEARN
+
+**Goal:** Close the gate — make any `WasmGenError::Unsupported` for a VM-simulated core model a hard failure (no skips remain for core simulation), add the `#[ignore]`d C-LEARN wasm twin against `Ref.vdf`, and document the backend and its coverage.
+
+**Architecture:** The parity harness flips from "skip-not-fail" to "fail" for core-simulation models: every XMILE/MDL/systems model the VM simulates in the default suite must also run through the wasm backend and clear the same comparator. The heavy `#[ignore]`d models (C-LEARN, WORLD3, COVID/metasd) get `#[ignore]`d wasm twins so they don't blow the 3-minute default-suite cap under the (interpreted, non-JIT) DLR-FT oracle.
+
+**Tech Stack:** the `tests/simulate.rs` corpus harness, `run_clearn_vs_vdf()`, `ensure_vdf_results` + `EXPECTED_VDF_RESIDUAL`; docs.
+
+**Scope:** Phase 8 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`.
+
+**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`).
+
+---
+
+## Acceptance Criteria Coverage
+
+### wasm-backend.AC1
+- **wasm-backend.AC1.3 Success:** C-LEARN runs through the wasm backend and matches `Ref.vdf` / the VM under the existing VDF tolerance and the `EXPECTED_VDF_RESIDUAL` carve-out.
+- **wasm-backend.AC1.4 Failure:** A model using a not-yet-supported construct returns `WasmGenError::Unsupported` — a clean error, never a panic or a silently wrong result. *(Phase 8 is the end-state expression of this AC: the flipped gate turns any `Unsupported` for a VM-simulated core model into a hard failure — never a silent wrong result.)*
+
+### wasm-backend.AC3
+- **wasm-backend.AC3.2 Success:** End state — no core-simulation model is skipped: every XMILE, MDL, and systems-format model in the corpus runs through both backends.
+- **wasm-backend.AC3.3 Failure:** A regression that makes a previously-supported model unsupported (dropping below the floor, or any `Unsupported` at the end-state gate) fails the test suite.
+
+---
+
+## Notes for the implementer (read first)
+
+- **The end-state gate applies to models the VM actually simulates in the default suite.** Models the VM itself does not simulate (the unsupported-feature `#[ignore]`s: DELAY FIXED `simulate.rs:1534-1552`, GET DATA `simulate.rs:1595-1609`) stay VM-only and are out of scope — the wasm hook runs *after* the VM run, so a model the VM `#[ignore]`s never reaches it. LTM (`simulate_ltm.rs`) stays VM-only (out of scope).
+- **C-LEARN harness** (confirmed): `run_clearn_vs_vdf() -> (Results, Results)` at `simulate.rs:1865-1893` (VM results + parsed `Ref.vdf`); `ensure_vdf_results`/`ensure_vdf_results_excluding` at `simulate.rs:309/349` (1% `VDF_RTOL` + matched-floor); `EXPECTED_VDF_RESIDUAL` at `simulate.rs:1746`; `simulates_clearn` at `simulate.rs:1849` (`#[ignore]`, `// Run with: cargo test --release -- --ignored simulates_clearn`). The wasm twin compares the **wasm** output to `Ref.vdf` with the **same** `ensure_vdf_results_excluding(&vdf, &wasm_results, EXPECTED_VDF_RESIDUAL)` check.
+- **Test-suite time budget** (`docs/dev/rust.md:13-17`): default suite under a 3-minute wall-clock cap; the DLR-FT interpreter is not a JIT, so heavy models run slowly under it. Keep the heavy models' wasm twins `#[ignore]`d (run via `cargo test --release -- --ignored <name>`), exactly like their VM counterparts.
+- **Building C-LEARN's `CompiledSimulation` for the wasm twin:** reuse the C-LEARN compile path from `run_clearn_vs_vdf` (open the `.mdl`, sync, `compile_project_incremental`), then `compile_simulation` → run the blob under DLR-FT → build `Results` from the slab (`is_vensim` consistent with the VDF comparison) → `ensure_vdf_results_excluding`.
+- `pub(crate)`/`pub` latitude per the repo owner. Engine tests gated on `file_io`.
+
+---
+
+<!-- START_SUBCOMPONENT_A (tasks 1-3) -->
+<!-- START_TASK_1 -->
+### Task 1: Flip the harness — Unsupported is a hard failure; close the floor
+
+**Verifies:** wasm-backend.AC3.2, wasm-backend.AC3.3.
+
+**Files:** Modify `src/simlin-engine/tests/test_helpers.rs` (or `simulate.rs`) and `src/simlin-engine/tests/simulate.rs`, `src/simlin-engine/tests/simulate_systems.rs`.
+
+**Implementation:**
+1. Change the inline wasm hook in `simulate_path_with_excluding` (and the `.mdl` + systems paths) so a `WasmRunOutcome::Skipped(msg)` for a model the VM simulated is now a **hard failure** (`panic!`) for core-simulation models, not a silent skip. (Equivalently, `ensure_wasm_matches` returns `()` and panics on `Unsupported`.)
+2. Replace the monotonic floor with the end-state assertion: the `wasm_parity_floor`/equivalent gate now requires that **every** VM-simulated core-simulation model in the default suite runs through wasm (zero `Unsupported`). Remove the skip-counting branch. Keep the gate's runtime within the cap (it only covers the small/medium default corpus; heavy models are `#[ignore]`d twins, Task 2).
+3. If Task 1 surfaces any remaining `Unsupported` for a VM-simulated core model, close that lowering gap (a small addition to the relevant phase's emitter) — the design's end state is full core-simulation coverage. (A genuinely VM-unsupported feature stays out of scope and must not reach the hook.)
+
+**Testing:** the flipped gate is the test: it fails if any VM-simulated core model is `Unsupported` (AC3.3) and passes only at full coverage (AC3.2). Confirm a deliberately-introduced `Unsupported` (temporarily) fails the suite.
+
+**Verification:** `cargo test -p simlin-engine --features file_io --test simulate` and `--test simulate_systems`
+
+**Commit:** `engine: close the wasm parity gate (Unsupported is a hard failure)`
+<!-- END_TASK_1 -->
+
+<!-- START_TASK_2 -->
+### Task 2: C-LEARN (and heavy-model) wasm twins
+
+**Verifies:** wasm-backend.AC1.3.
+
+**Files:** Modify `src/simlin-engine/tests/simulate.rs`.
+
+**Implementation:**
+Add `#[test] #[ignore] fn simulates_clearn_wasm()` (with the `// Run with: cargo test --release -- --ignored simulates_clearn_wasm` comment) that: builds C-LEARN's `CompiledSimulation` (reusing the compile path inside `run_clearn_vs_vdf`), compiles it via `compile_simulation`, runs the blob under DLR-FT, builds a `Results` from the slab, and asserts `ensure_vdf_results_excluding(&vdf_results, &wasm_results, EXPECTED_VDF_RESIDUAL)` — the same check `simulates_clearn` uses. Add similarly-`#[ignore]`d wasm twins for the other heavy models that have VM equivalents (WORLD3 `simulates_wrld3_03`, the COVID/metasd SSTATS model) if they exercise wasm-supported features, mirroring their existing VM tests' comparators.
+
+**Testing:** `simulates_clearn_wasm` (run on demand): C-LEARN's wasm output matches `Ref.vdf` under the existing tolerance + residual carve-out.
+
+**Verification:** `cargo test -p simlin-engine --release --features file_io -- --ignored simulates_clearn_wasm`
+Expected: passes (matches `Ref.vdf` within the VDF tolerance and `EXPECTED_VDF_RESIDUAL`).
+
+**Commit:** `engine: C-LEARN wasm parity twin against Ref.vdf`
+<!-- END_TASK_2 -->
+
+<!-- START_TASK_3 -->
+### Task 3: Documentation
+
+**Verifies:** (none — documentation; supports AC3.2 reporting.)
+
+**Files:** Modify `src/simlin-engine/CLAUDE.md`; update `docs/` (and `docs/README.md` if adding a doc file, per `docs/CLAUDE.md`).
+
+**Implementation:**
+- Add a `wasmgen` entry to `src/simlin-engine/CLAUDE.md`'s module map: the backend lowers `CompiledSimulation` bytecode to a self-contained wasm module (alternative execution path to the VM, validated against the VM via the DLR-FT interpreter), its file layout (`mod.rs`/`module.rs`/`lower.rs`/`math.rs`/`views.rs`/`vector.rs`/`alloc.rs` as built), the `compile_simulation`/`WasmArtifact`/`WasmLayout` contract, and the supported-feature coverage (full core simulation: scalar + arrays + lookups + Euler/RK2/RK4 + modules; LTM out of scope).
+- Document how to run the wasm parity tests (default suite runs small/medium corpus through wasm; heavy twins via `cargo test --release -- --ignored <name>`), and that the bytecode VM remains the correctness oracle.
+- Note the `libsimlin` `simlin_model_compile_to_wasm` entry (blob + `WasmLayout`).
+
+**Testing:** n/a (docs). Verify links/freshness; keep the `**Last updated:**` date current in `simlin-engine/CLAUDE.md`.
+
+**Verification:** `pnpm lint` / a docs build if applicable; manual review.
+
+**Commit:** `doc: document the wasm simulation backend and its coverage`
+<!-- END_TASK_3 -->
+<!-- END_SUBCOMPONENT_A -->
+
+---
+
+## Phase 8 Done When
+- Every core-simulation corpus model (XMILE, MDL, systems) runs through both VM and wasm with no skips; an `Unsupported` for a VM-simulated core model fails the suite.
+- C-LEARN matches `Ref.vdf` through wasm under the existing tolerance + `EXPECTED_VDF_RESIDUAL` (`#[ignore]`d twin).
+- The backend and its coverage are documented in `simlin-engine/CLAUDE.md` and `docs/`.
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/test-requirements.md b/docs/implementation-plans/2026-05-20-wasm-backend/test-requirements.md
new file mode 100644
index 000000000..912694f13
--- /dev/null
+++ b/docs/implementation-plans/2026-05-20-wasm-backend/test-requirements.md
@@ -0,0 +1,135 @@
+# WebAssembly Simulation Backend — Test Requirements
+
+This document maps every acceptance criterion from the design plan
+([`docs/design-plans/2026-05-20-wasm-backend.md`](../../design-plans/2026-05-20-wasm-backend.md),
+the authoritative AC list) to its verification. There are 8 AC groups and 22
+individual cases (AC1.1–1.5, AC2.1–2.2, AC3.1–3.3, AC4.1–4.2, AC5.1–5.2,
+AC6.1–6.2, AC7.1–7.4, AC8.1–8.2). The phase mappings come from the
+`**Verifies:**` lines in [`phase_01.md`](phase_01.md) … [`phase_08.md`](phase_08.md).
+
+## Verification conventions
+
+This backend is engine-internal and is validated against the bytecode VM as the
+correctness oracle, so verification is almost entirely automated. Two test
+surfaces recur throughout:
+
+- **Unit (inline `#[cfg(test)] mod tests`)** in the relevant
+  `src/simlin-engine/src/wasmgen/*.rs` file. Each unit test hand-builds a tiny
+  `ByteCode`/`CompiledSimulation`, emits a wasm module with `wasm-encoder`,
+  validates it (`wasm::validate`), instantiates it under the DLR-FT
+  `wasm-interpreter` via the `checked` crate's `Store`, invokes the export, and
+  asserts on linear memory / return values against the VM's matching handler
+  (the executable spec). Files: `wasmgen/lower.rs`, `wasmgen/module.rs`,
+  `wasmgen/math.rs`, `wasmgen/lookup.rs` (if split out; otherwise in
+  `lower.rs`), `wasmgen/views.rs`, `wasmgen/vector.rs`, `wasmgen/alloc.rs`.
+- **Integration / corpus** in `src/simlin-engine/tests/simulate.rs` (and
+  `src/simlin-engine/tests/simulate_systems.rs`). The `ensure_wasm_matches`
+  hook runs each supported corpus model through the wasm backend after the VM
+  run and feeds its results through the model's existing comparator; the
+  `wasm_parity_floor` gate enforces a monotonically rising count of
+  wasm-supported models; the `#[ignore]`d `simulates_clearn_wasm` twin checks
+  C-LEARN against `Ref.vdf`.
+
+**The correctness bar is the existing comparators, not a separate
+wasm-vs-VM threshold.** A model's wasm output must clear the same
+`ensure_results` (abs `2e-3` / Vensim-relative `5e-6`) or `ensure_vdf_results`
+(1% `VDF_RTOL` + the `EXPECTED_VDF_RESIDUAL` carve-out) check the VM clears,
+against the same expected outputs. "wasm-vs-VM parity" is achieved because both
+backends clear the same comparator against the same expected outputs — there is
+no tighter backend-equivalence tolerance (design "Validation bar"; reflected in
+AC1.1, AC1.3, and AC7.4 below).
+
+---
+
+## AC1: The wasm backend reproduces the VM's simulation results
+
+| AC | Literal text | Verification |
+|---|---|---|
+| **AC1.1** (Success) | A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` against the model's expected outputs at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.) | **Automated — integration.** The `ensure_wasm_matches` hook in `src/simlin-engine/tests/simulate.rs` (`simulate_path_with_excluding` + the `.mdl` path) runs each supported model through the backend and asserts via the existing `ensure_results_excluding` comparator (the same check the VM passes; no separate threshold). The supported set widens each phase: scalar/Euler (Phase 1), full scalar builtins (Phase 2), graphical functions (Phase 3), RK + PREVIOUS/INIT (Phase 4), arrays (Phase 5), vector ops/allocation (Phase 6), modules + systems format (Phase 7). The per-phase floor raise in `wasm_parity_floor` records the widening. Per-opcode correctness is also covered by the unit tests under each AC below. |
+| **AC1.2** (Success) | Arrayed/subscripted models (apply-to-all, subscripts, vector operations) match the VM element-for-element. | **Automated — unit + integration.** Unit: reducer/iteration/subscript parity vs the VM in `wasmgen/views.rs` and `wasmgen/lower.rs` (Phase 5: subscript OOB→NaN, broadcast, each reducer, iteration loops) and the vector-op/allocation parity tests in `wasmgen/vector.rs` and `wasmgen/alloc.rs` (Phase 6: VectorSelect/ElmMap/SortOrder/Rank/LookupArray/Allocate). Integration: arrayed corpus models clear `ensure_results` via `ensure_wasm_matches` and raise the floor (Phase 5 Task 5, Phase 6 Task 5). A2A variables are unrolled to scalar bytecode by the compiler, so they are additionally covered by the Phase 1/2 scalar path. |
+| **AC1.3** (Success) | C-LEARN runs through the wasm backend and matches `Ref.vdf` / the VM under the existing VDF tolerance and the `EXPECTED_VDF_RESIDUAL` carve-out. | **Automated — integration (`#[ignore]`d).** Phase 8 Task 2 adds `#[test] #[ignore] fn simulates_clearn_wasm()` in `src/simlin-engine/tests/simulate.rs`, reusing `run_clearn_vs_vdf()`'s compile path, running the blob under DLR-FT, and asserting `ensure_vdf_results_excluding(&vdf, &wasm_results, EXPECTED_VDF_RESIDUAL)` — the same check `simulates_clearn` uses. `#[ignore]`d for runtime (interpreter is not a JIT); run via `cargo test --release --features file_io -- --ignored simulates_clearn_wasm`. |
+| **AC1.4** (Failure) | A model using a not-yet-supported construct returns `WasmGenError::Unsupported` — a clean error, never a panic or a silently wrong result. | **Automated — unit + integration (negative path).** Unit: Phase 1 Task 1 asserts unsupported opcodes (`Op2::Eq`/`Op2::Mod`/`Apply`/`Lookup`/an array opcode at that point) return `WasmGenError::Unsupported` rather than panicking, in `wasmgen/lower.rs`. Integration end state: Phase 8 Task 1 flips the `simulate.rs` hook so any `Unsupported` for a VM-simulated core model is a hard failure (never a silent wrong result); a deliberately-introduced `Unsupported` must fail the suite. |
+| **AC1.5** (Edge) | Empty-view reducers, out-of-bounds subscripts, and division-by-zero produce the same NaN / finite-`:NA:` / Inf values the VM produces. | **Automated — unit (edge path), split across phases.** Phase 1 Task 1: raw `Op2::Div` by zero (`x/0`→±Inf, `0/0`→NaN, IEEE-identical to the VM) in `wasmgen/lower.rs`. Phase 2 Task 1: the finite `:NA:` sentinel (`crate::float::NA`) vs genuine IEEE NaN, kept distinct by the `approx_eq` helper (curated sample incl. `(NA,NA)`/`(NA,0.0)`/`(NaN,NaN)`) in `wasmgen/lower.rs`. Phase 5 Task 2 + Task 4: empty-but-valid reducers (`ArraySum`→0.0; Max/Min/Mean/Stddev→NaN) and invalid-view→NaN for all reducers; out-of-bounds subscripts→NaN (pinned against `array_tests.rs` cases) in `wasmgen/lower.rs`/`wasmgen/views.rs`. |
+
+## AC2: The backend consumes the salsa compiled bytecode
+
+| AC | Literal text | Verification |
+|---|---|---|
+| **AC2.1** (Success) | The wasm module is produced from `compile_project_incremental(...) -> CompiledSimulation`, not from the `Expr` IR or the monolithic `compiler::Module`. | **Automated — unit (Phase 1).** Task 1: each scalar-core opcode lowers from `ByteCode.code` (bytecode, not `Expr`), unit-tested in `wasmgen/lower.rs`. Task 2: `compile_simulation(&CompiledSimulation)` builds the module from a `CompiledSimulation` produced by `compile_project_incremental`, unit-tested in `wasmgen/module.rs` against `Vm::new(sim).run_to_end()`. Task 3: `compile_datamodel_to_wasm` is rerouted through the salsa pipeline, and `wasmgen/expr.rs` (the `Expr`-tree path) is deleted — verified by `cargo test -p simlin-engine --features file_io wasmgen` plus the structural check that no `crate::compiler::Module` references remain in `wasmgen/`. |
+| **AC2.2** (Success) | The POC's `#[cfg(test)]` un-gating of the monolithic builder is reverted; the crate builds with `Module::new`/`build_metadata`/`calc_n_slots`/`calc_module_model_map` test-only again. | **Automated — build state (Phase 1 Task 4).** Operational verification (a visibility/gating revert, no new behavior): `cargo build -p simlin-engine` builds with the four items `#[cfg(test)]`-gated again; `cargo test -p simlin-engine --features file_io` still compiles and passes (test code reaches the now-test-only builder); `git diff main -- src/simlin-engine/src/compiler/mod.rs` shows only the re-gating. |
+
+## AC3: simulate.rs runs the corpus through both backends
+
+| AC | Literal text | Verification |
+|---|---|---|
+| **AC3.1** (Success) | During rollout, each corpus model runs through the VM and (when supported) the wasm backend, comparing wasm-vs-VM; unsupported models are skipped (not failed) and counted against a monotonically rising floor. | **Automated — integration (Phase 1 Tasks 5–6).** `ensure_wasm_matches` returns `WasmRunOutcome::Ran | Skipped(msg)` (`src/simlin-engine/tests/test_helpers.rs`); the inline hook in `simulate.rs` records `Skipped` rather than failing; `const WASM_SUPPORTED_FLOOR` + `#[test] fn wasm_parity_floor()` count `Ran` models and assert `ran >= WASM_SUPPORTED_FLOOR`. The floor is raised in every subsequent functionality phase (Phases 2–7 each have a "raise the floor" task). |
+| **AC3.2** (Success) | End state — no core-simulation model is skipped: every XMILE, MDL, and systems-format model in the corpus runs through both backends. | **Automated — integration (Phase 8 Task 1).** The harness flips: the skip-counting branch is removed and the gate asserts every VM-simulated core-simulation model in the default suite runs through wasm with zero `Unsupported`, across `src/simlin-engine/tests/simulate.rs` (XMILE + `.mdl`) and `src/simlin-engine/tests/simulate_systems.rs` (systems format). |
+| **AC3.3** (Failure) | A regression that makes a previously-supported model unsupported (dropping below the floor, or any `Unsupported` at the end-state gate) fails the test suite. | **Automated — integration (Phase 8 Task 1); the gate itself is the test.** During rollout, dropping below `WASM_SUPPORTED_FLOOR` fails `wasm_parity_floor` (Phase 1). At the end state, any `Unsupported` for a VM-simulated core model is a hard `panic!` in the hook + the closed gate. Confirmed by temporarily introducing an `Unsupported` and observing the suite fail. |
+
+## AC4: Self-describing results + efficient by-name retrieval
+
+| AC | Literal text | Verification |
+|---|---|---|
+| **AC4.1** (Success) | The blob exports `n_slots`/`n_chunks`/`results_offset` and writes step-major snapshots; a host locates and strides the results with no external metadata. | **Automated — unit (Phase 1 Task 2; reaffirmed Phase 7 Task 3).** A dedicated test in `wasmgen/module.rs` reads the three exported i32 globals from the instantiated module (`instance_export(inst, "n_slots").as_global()`, etc.), asserts they equal the `WasmLayout` values, then uses only the module-exported geometry to stride to one variable's series and confirms it matches the VM. Phase 7 Task 3 reaffirms geometry-from-globals matches the layout alongside the FFI test. |
+| **AC4.2** (Success) | Reading one variable's series via the name→offset layout copies only that variable's `n_chunks` values (never the whole `n_chunks × n_slots` slab) and equals the VM's series for that variable. | **Automated — unit / integration (Phase 7 Task 3).** A `wasmgen`/libsimlin test reads one variable's `n_chunks`-long series via `WasmLayout.var_offsets` (striding `results[results_offset + (c*n_slots + off)*8]`), asserts it equals the VM's `get_series` for that variable, and asserts only `n_chunks` values were copied (not the whole slab). |
+
+## AC5: Override + reset
+
+| AC | Literal text | Verification |
+|---|---|---|
+| **AC5.1** (Success) | Overriding a constant via `set_value`, then `reset`, then `run`, yields the same series the VM produces under the same override (matching `simlin_sim_set_value`/`reset` semantics). | **Automated — unit (Phase 7 Task 2).** A test in `wasmgen/module.rs` calls `set_value(off_of_a_constant, v); reset(); run();` on the blob and compares the full series to a VM run with `vm.set_value(ident, v)` under the same override. A `set_value` on a non-constant offset is asserted to return the error code with no write. |
+| **AC5.2** (Success) | `reset` with no override restores the compiled-default results. | **Automated — unit (Phase 7 Task 2).** A test in `wasmgen/module.rs` calls `reset(); run()` with no override and asserts the blob reproduces the compiled-default series. |
+
+## AC6: libsimlin FFI
+
+| AC | Literal text | Verification |
+|---|---|---|
+| **AC6.1** (Success) | `simlin_model_compile_to_wasm` returns a valid wasm blob plus the name→offset layout via the malloc-return convention; both buffers are freeable with `simlin_free`; it works before any `SimlinSim` exists. | **Automated — integration FFI (Phase 7 Task 3).** A Rust integration test in `src/libsimlin/` compiles a model to wasm + serialized layout, asserts the wasm validates, the layout deserializes to the expected geometry + name→offset map, both buffers free with `simlin_free`, and the call works from only a `SimlinModel` (no `SimlinSim`). |
+| **AC6.2** (Failure) | A model that cannot be compiled to wasm surfaces a `SimlinError` rather than panicking across the FFI boundary. | **Automated — integration FFI (negative path, Phase 7 Task 3).** A `src/libsimlin/` test feeds a model that fails codegen and asserts the `out_error` (`*mut *mut SimlinError`) is set via `store_error`/`store_anyhow_error` with no panic across the boundary. |
+
+## AC7: Numeric-parity specifics
+
+| AC | Literal text | Verification |
+|---|---|---|
+| **AC7.1** (Success) | Math wasm provides natively (`sqrt`, `abs`, `floor`/`ceil`/`trunc`/`nearest`, `min`/`max`, arithmetic) uses wasm instructions; the transcendentals wasm lacks (`sin`/`cos`/`tan`/`asin`/`acos`/`atan`/`exp`/`ln`/`log10`/`pow`) and the allocation `erfc` are open-coded as self-contained wasm helper functions (range reduction + polynomial). Each open-coded helper has a unit test comparing its output to Rust `f64` over a sampled range; results need not be bit-identical to the VM's libm — only close enough that the existing tests pass. | **Automated — unit, split across phases.** Phase 2 Task 2: each scalar transcendental helper (`exp`/`ln`/`sin`/`cos`/`atan` kernels + `tan`/`acos`/`log10`/`pow`/`asin` composed) emitted in `wasmgen/math.rs` gets a unit test comparing wasm output to Rust `f64` over a dense sampled domain + NaN/inf edges, with a documented tolerance comfortably inside the `simulate.rs` tolerances. Phase 2 Task 4 confirms native instructions are used for `Abs`/`Sqrt`/`Int`/`Min`/`Max`. Phase 3 Task 2: the lookup kernels tested against the VM's `lookup`/`lookup_forward`/`lookup_backward`. Phase 6 Task 4: `erfc_approx`/`normal_cdf` (in `wasmgen/alloc.rs`) unit-tested against the Rust `alloc::erfc_approx`/`normal_cdf`. |
+| **AC7.2** (Success) | Equality and truthiness (`Eq`/`Neq`/`And`/`Or`/`If` condition) use ULP-based `approx_eq` matching the VM. | **Automated — unit (Phase 2 Task 1).** An `approx_eq(a,b)->i32` wasm helper reproduces `float_cmp::approx_eq!(f64, …)` (epsilon + 4-ulp ordered-integer algorithm) bit-faithfully; a unit test in `wasmgen/lower.rs` runs it under DLR-FT over a curated + randomized sample (exact-equal, far, 1–4 ULP, EPSILON-apart, subnormals, `(NaN,NaN)`, `(NA,NA)`, `(NA,0.0)`, `(±0)`, `(±inf)`) and asserts equality with `crate::float::approx_eq`. Further tests confirm `Op2::Eq`, `Op2::And`, `Op2::Or`, `Not`, and `SetCond`+`If` match the VM for near-zero/ULP-adjacent operands where raw `==`/`!=0.0` would diverge. `Neq` lowers to `Eq`+`Not`, so it is covered transitively. |
+| **AC7.3** (Edge) | `Mod` matches the VM's `rem_euclid` semantics (computed via wasm `floor`). `Max`/`Min` use the wasm `f64.max`/`f64.min` instructions; if a corpus test surfaces a NaN/±0 difference from the VM's compare-based form, fall back to explicit compare-and-select for that case. | **Automated — unit (Phase 2 Tasks 3–4; reaffirmed Phase 5 Task 2).** Phase 2 Task 3: `Op2::Mod` asserted to match `l.rem_euclid(r)` over the four sign combinations + non-integer operands, result always in `[0,|r|)`, in `wasmgen/lower.rs`. Phase 2 Task 4: `Min`/`Max` use `f64.min`/`f64.max`, with the documented compare-and-select fallback if a corpus test surfaces a NaN/±0 divergence. Phase 5 Task 2 reaffirms for the array reducers `ArrayMax`/`ArrayMin` (which use the VM's compare form on the reduce path, since their empty-view→NaN semantics differ from the binary builtins). |
+| **AC7.4** (Success) | Euler, RK2, and RK4 each match the VM's saved samples (cadence and values); `PREVIOUS`/`INIT` match via the snapshot regions. | **Automated — unit + integration (Phase 1 Euler; Phase 4 RK2/RK4 + PREVIOUS/INIT).** Phase 1 Task 2: the Euler `run` loop's cadence and per-step values asserted against `Vm::new(sim).run_to_end()` in `wasmgen/module.rs` (`step_count == n_chunks`, save cadence matches). Phase 4 Task 1: `PREVIOUS`/`INIT` models (incl. `PREVIOUS` at t0/after-first-step and `INIT` from a flow vs from an initial) match the VM via the `prev_values`/`initial_values` snapshot regions. Phase 4 Task 2: RK2 (Heun) and RK4 scalar models match the VM's saved samples (cadence and values), incl. the snapshot timing under RK. Integration: RK + PREVIOUS/INIT corpus models clear `ensure_results` via `ensure_wasm_matches` and raise the floor (Phase 4 Task 3) — checked against expected outputs at the existing tolerances, not a separate threshold. |
+
+## AC8: Engineering quality (cross-cutting)
+
+These two criteria are not satisfied by a single test; they are properties of the
+test *structure* established uniformly across every phase, and they map to the
+unit-test suite as a whole.
+
+| AC | Literal text | Verification |
+|---|---|---|
+| **AC8.1** | New code reaches ≥95% test coverage via unit tests that execute emitted wasm under the DLR-FT interpreter, with each opcode/feature group individually tested. | **Automated — the unit-test suite as a whole (all phases).** Satisfied cross-cuttingly: every functionality task in Phases 1–7 is TDD'd with inline `#[cfg(test)] mod tests` in its `wasmgen/*.rs` file (`lower.rs`, `module.rs`, `math.rs`, `lookup.rs`/`lower.rs`, `views.rs`, `vector.rs`, `alloc.rs`), each test building and executing a wasm module under the DLR-FT interpreter and asserting against the VM. Each opcode/feature group (scalar core, builtins, transcendentals, lookups, RK/PREVIOUS/INIT, view ops, reducers, vector ops, allocation, modules, override/reset) is individually tested. Coverage ≥95% is a `wasmgen`-wide property of this suite, not one named test. |
+| **AC8.2** | Each functionality phase ends with passing tests for the acceptance criteria it claims to cover. | **Automated — per-phase "Done When" gates (all phases).** Each phase file ends with a "Done When" section enumerating the ACs it claims and the passing tests/commands that demonstrate them; the per-phase floor raise and the `cargo test -p simlin-engine --features file_io wasmgen` / `--test simulate` verifications gate each phase. This is a process/structure criterion satisfied by the phase boundaries themselves. |
+
+---
+
+## Human verification: none required, and why
+
+Every one of the 22 acceptance criteria is automatable, and the plan automates
+all of them. This backend has no human-verification surface:
+
+- It is **engine-internal** — there is no UI, rendering, animation, copy, or
+  interactive UX to inspect. (The `@simlin/engine` TypeScript API, browser
+  worker, and live-graph/diagram UX are explicitly out of scope per the design;
+  the in-scope override/reset and by-name retrieval are engine-side mechanisms
+  validated programmatically.)
+- Its correctness oracle is the **bytecode VM**, an in-repo executable
+  specification. Every numeric/behavioral claim is a diff against the VM (or, for
+  C-LEARN, against `Ref.vdf`) under the existing comparators — fully
+  programmatic.
+- Even the criteria that look qualitative reduce to automated checks:
+  "self-describing" (AC4.1) is asserted by reading exported globals with no
+  external metadata; "clean error, never a panic" (AC1.4) and "surfaces a
+  `SimlinError` rather than panicking" (AC6.2) are negative-path tests; the
+  cross-cutting engineering-quality criteria (AC8.1/AC8.2) are satisfied by the
+  per-opcode TDD + DLR-FT unit-test structure and the per-phase "Done When"
+  gates.
+
+The only non-test deliverable is Phase 8 Task 3 (documentation), which carries no
+AC and is verified by review.
diff --git a/docs/test-plans/2026-05-20-wasm-backend.md b/docs/test-plans/2026-05-20-wasm-backend.md
new file mode 100644
index 000000000..e63bdf3bf
--- /dev/null
+++ b/docs/test-plans/2026-05-20-wasm-backend.md
@@ -0,0 +1,66 @@
+# Human Test Plan: WebAssembly Simulation Backend
+
+Companion to [design-plans/2026-05-20-wasm-backend.md](../design-plans/2026-05-20-wasm-backend.md) and [implementation-plans/2026-05-20-wasm-backend/](../implementation-plans/2026-05-20-wasm-backend/).
+
+The wasm backend is engine-internal with the bytecode VM as its automated correctness oracle, so nearly everything is machine-verified: all 22 acceptance criteria map to genuine, non-vacuous automated tests that execute the emitted wasm under the DLR-FT interpreter and compare against the VM (or `crate::float::approx_eq` / `crate::vm::lookup*` / `crate::alloc::*` / `Ref.vdf`). The steps below cover the residual surface automation can't fully stand in for: the heavy `#[ignore]`d parity twins, the FFI driven from a real (non-Rust-test) host, the AC3.3 deliberate-regression confidence check, and an optional line-coverage measurement for AC8.1.
+
+## Prerequisites
+
+- `./scripts/dev-init.sh` has been run (idempotent).
+- The default suites are green (re-run if the tree changed):
+  - `cargo test -p simlin-engine --features file_io --lib wasmgen` (~259 tests)
+  - `cargo test -p simlin-engine --features file_io --test simulate` (incl. `wasm_parity_floor`)
+  - `cargo test -p simlin-engine --features file_io --test simulate_systems` (incl. `wasm_systems_parity_floor`)
+  - `cargo test -p simlin --test wasm` (FFI)
+
+## Phase A: Heavy parity twins (AC1.3; AC1.1/AC7.4 at scale)
+
+These are `#[ignore]`d for runtime (the DLR-FT interpreter is not a JIT) and never run in the default suite, so they are the only automated coverage of C-LEARN-against-`Ref.vdf` and WORLD3-at-scale through wasm. Run in release.
+
+| Step | Action | Expected |
+|------|--------|----------|
+| A1 | `cargo test -p simlin-engine --release --features file_io --test simulate -- --ignored simulates_clearn_wasm` | Passes. C-LEARN compiles to wasm, runs under the interpreter, and clears the 1% VDF gate + `EXPECTED_VDF_RESIDUAL` carve-out -- the same gate the VM clears (~3358 vars matched / 84 excluded across 251 steps). |
+| A2 | `cargo test -p simlin-engine --release --features file_io --test simulate -- --ignored simulates_wrld3_03_wasm` | Passes. WORLD3 wasm output matches the VM element-for-element. |
+
+## Phase B: FFI from a real host (AC6.1, AC6.2, AC4.1, AC4.2)
+
+`src/libsimlin/tests/wasm.rs` drives the FFI in-process; this exercises the same entry point from outside the Rust harness (how TS/WASM, CGo, C/C++ consumers reach it) -- the cross-boundary contract automation can't fully represent.
+
+| Step | Action | Expected |
+|------|--------|----------|
+| B1 | Build the cbindgen header + lib (per [src/libsimlin/CLAUDE.md](../../src/libsimlin/CLAUDE.md)). | `simlin_model_compile_to_wasm` is declared in `simlin.h` with five out-params + `out_error`. |
+| B2 | From a small C/Go driver (or `node` over the WASM build): open a model, `simlin_project_get_model("main")`, then `simlin_model_compile_to_wasm(...)` **without ever calling `simlin_sim_new`**. | Non-NULL `out_wasm`/`out_layout` with non-zero lengths, `out_error == NULL` (AC6.1: works pre-sim). |
+| B3 | Parse the layout per the documented little-endian wire format (`n_slots`/`n_chunks`/`results_offset` as u64; `count` u32; then per entry `name_len` u32 + UTF-8 + `offset` u64). Instantiate the blob, read the exported globals, call `run`, and stride one variable using only the layout. | The strided series matches `simlin_sim_get_series` for that variable; only `n_chunks` values are read per variable (AC4.1/AC4.2). |
+| B4 | `simlin_free(out_wasm); simlin_free(out_layout);` | No crash/leak/double-free. |
+| B5 | Feed an unsupported model (e.g. a true runtime-range subscript `SUM(source[lo:hi])` with variable `lo`/`hi` -> `ViewRangeDynamic`) to `simlin_model_compile_to_wasm`. | `out_error != NULL` with a descriptive message, both buffers NULL, **no panic across the boundary** (AC6.2). |
+| B6 | Pass a NULL `out_layout` pointer. | `out_error` set, no crash. |
+
+## Phase C: AC3.3 deliberate-regression confidence check
+
+The gate is automated; the deliberate break is a manual confidence step. **Do not commit the edit.**
+
+| Step | Action | Expected |
+|------|--------|----------|
+| C1 | Temporarily edit `src/simlin-engine/src/wasmgen/lower.rs` so a common opcode (e.g. the `Op2::Add` arm) returns `WasmGenError::Unsupported(...)`. | -- |
+| C2 | `cargo test -p simlin-engine --features file_io --test simulate` | **Fails**: `wasm_parity_floor` and the per-model `wasm_parity_hook` panic, listing the now-unsupported models (AC3.2/AC3.3). |
+| C3 | `cargo test -p simlin-engine --features file_io --test simulate_systems` | **Fails**: `wasm_systems_parity_floor` panics. |
+| C4 | `git checkout -- src/simlin-engine/src/wasmgen/lower.rs`; re-run C2/C3. | Back to green. |
+
+## Phase D (optional): AC8.1 coverage measurement
+
+| Step | Action | Expected |
+|------|--------|----------|
+| D1 | `cargo llvm-cov -p simlin-engine --features file_io --lib -- wasmgen` (or the repo's configured coverage command); read `src/wasmgen/*` line/region coverage. | `wasmgen/` aggregate >=95%. Pins the AC8.1 number the suite establishes structurally (per-opcode TDD) but does not assert in CI. |
+
+## Traceability
+
+Every acceptance criterion is covered by an automated test (see the test-analysis mapping); the manual steps above add real-host / heavy-model / deliberate-regression confidence on top:
+
+| AC | Manual step(s) | AC | Manual step(s) |
+|----|----------------|----|----------------|
+| AC1.3 | A1 | AC6.1 | B1-B4 |
+| AC1.1/AC7.4 (scale) | A2 | AC6.2 | B5-B6 |
+| AC1.4 | B5 | AC3.3 | C1-C4 |
+| AC4.1/AC4.2 | B3 | AC8.1 | D1 (optional) |
+
+All other ACs (AC1.2, AC1.5, AC2.1, AC2.2, AC3.1, AC3.2, AC5.1, AC5.2, AC7.1, AC7.2, AC7.3, AC8.2) are fully covered by automated tests and need no manual step.
diff --git a/src/engine/wasm-backend-poc.mjs b/src/engine/wasm-backend-poc.mjs
new file mode 100644
index 000000000..3780797bb
--- /dev/null
+++ b/src/engine/wasm-backend-poc.mjs
@@ -0,0 +1,280 @@
+// Throwaway proof-of-concept for the compile-to-WebAssembly backend.
+//
+// Demonstrates the "direct-drive" architecture end to end in Node:
+//   1. load libsimlin.wasm (the engine, compiled to wasm)
+//   2. open default_projects/population/model.xmile and get its model
+//   3. call simlin_model_compile_to_wasm -> a *second* wasm module (the model)
+//   4. JS instantiates that model module directly and drives its `run` export
+//      (libsimlin is not on the per-run hot path)
+//   5. check every VM variable's series shows up as a column of the blob's
+//      results, and compare run-to-run timing of the blob vs the bytecode VM.
+//
+// Run:  node src/engine/wasm-backend-poc.mjs
+//
+// This file is exploratory scaffolding, not part of the @simlin/engine API.
+
+import { readFileSync } from 'node:fs';
+import { fileURLToPath } from 'node:url';
+import { dirname, join } from 'node:path';
+import { performance } from 'node:perf_hooks';
+
+const here = dirname(fileURLToPath(import.meta.url));
+const WASM = join(here, 'core', 'libsimlin.wasm');
+const MODEL = join(here, '..', '..', 'default_projects', 'population', 'model.xmile');
+
+// ── load libsimlin (mirrors src/engine/src/internal/wasm.node.ts) ──────────
+let memory = new WebAssembly.Memory({ initial: 256, maximum: 16384 });
+const lib = await WebAssembly.instantiate(await WebAssembly.compile(readFileSync(WASM)), {
+  env: { memory },
+});
+const E = lib.exports;
+if (E.memory instanceof WebAssembly.Memory) memory = E.memory;
+E.simlin_init?.();
+
+if (typeof E.simlin_model_compile_to_wasm !== 'function') {
+  throw new Error('libsimlin.wasm is stale: missing simlin_model_compile_to_wasm (rebuild it)');
+}
+
+// ── minimal FFI glue (re-derived per call so memory growth is handled) ─────
+const TD = new TextDecoder();
+const TE = new TextEncoder();
+const dv = () => new DataView(memory.buffer);
+const malloc = (n) => {
+  const p = E.simlin_malloc(n);
+  if (!p && n) throw new Error('wasm allocation failed');
+  return p;
+};
+const free = (p) => {
+  if (p) E.simlin_free(p);
+};
+const u32 = (p) => dv().getUint32(p, true);
+const outPtr = () => {
+  const p = malloc(4);
+  dv().setUint32(p, 0, true);
+  return p;
+};
+const writeBytes = (bytes) => {
+  const p = malloc(bytes.length);
+  new Uint8Array(memory.buffer, p, bytes.length).set(bytes);
+  return p;
+};
+const cstr = (s) => writeBytes(TE.encode(s + '\0'));
+const readBytes = (p, n) => new Uint8Array(memory.buffer.slice(p, p + n));
+const readCStr = (p) => {
+  const v = new Uint8Array(memory.buffer);
+  let e = p;
+  while (v[e]) e++;
+  return TD.decode(v.slice(p, e));
+};
+const f64Array = (p, n) => {
+  const d = dv();
+  const out = new Float64Array(n);
+  for (let i = 0; i < n; i++) out[i] = d.getFloat64(p + i * 8, true);
+  return out;
+};
+function checkErr(ep, what) {
+  const err = u32(ep);
+  if (err !== 0) {
+    let msg = '(no message)';
+    const mp = E.simlin_error_get_message(err);
+    if (mp) msg = readCStr(mp);
+    E.simlin_error_free(err);
+    throw new Error(`${what}: ${msg}`);
+  }
+}
+
+// ── open population, get its model, extract the compiled-model wasm ────────
+const xmile = readFileSync(MODEL);
+let dataPtr = writeBytes(xmile);
+let ep = outPtr();
+const project = E.simlin_project_open_xmile(dataPtr, xmile.length, ep);
+checkErr(ep, 'open_xmile');
+free(ep);
+free(dataPtr);
+
+const namePtr = cstr('main');
+ep = outPtr();
+const model = E.simlin_project_get_model(project, namePtr, ep);
+checkErr(ep, 'get_model');
+free(ep);
+free(namePtr);
+
+const outBuf = outPtr();
+const outLen = outPtr();
+const outLayout = outPtr();
+const outLayoutLen = outPtr();
+ep = outPtr();
+// New 6-arg signature: returns the wasm blob AND a serialized WasmLayout
+// (name -> slot offset map + geometry), each via the malloc-return convention.
+E.simlin_model_compile_to_wasm(model, outBuf, outLen, outLayout, outLayoutLen, ep);
+checkErr(ep, 'compile_to_wasm');
+const blobPtr = u32(outBuf);
+const blobLen = u32(outLen);
+const blob = readBytes(blobPtr, blobLen);
+const layoutPtr = u32(outLayout);
+const layoutLen = u32(outLayoutLen);
+const layoutBytes = readBytes(layoutPtr, layoutLen);
+free(blobPtr);
+free(layoutPtr);
+free(outBuf);
+free(outLen);
+free(outLayout);
+free(outLayoutLen);
+free(ep);
+console.log(`compiled model -> ${blobLen} bytes of WebAssembly + ${layoutLen}-byte layout`);
+
+// Parse the serialized WasmLayout (little-endian): n_slots, n_chunks,
+// results_offset (u64 each), count (u32), then per entry name_len (u32) +
+// UTF-8 name + offset (u64). This is the same name->offset map the engine
+// exposes, so a host can read a variable's series by name with no guessing.
+function parseLayout(bytes) {
+  const d = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+  let p = 0;
+  const u64 = () => {
+    const v = Number(d.getBigUint64(p, true));
+    p += 8;
+    return v;
+  };
+  const u32le = () => {
+    const v = d.getUint32(p, true);
+    p += 4;
+    return v;
+  };
+  const nSlots = u64();
+  const nChunks = u64();
+  const resultsOffset = u64();
+  const count = u32le();
+  const varOffsets = new Map();
+  for (let i = 0; i < count; i++) {
+    const nameLen = u32le();
+    const name = TD.decode(bytes.slice(p, p + nameLen));
+    p += nameLen;
+    varOffsets.set(name, u64());
+  }
+  return { nSlots, nChunks, resultsOffset, varOffsets };
+}
+const layout = parseLayout(layoutBytes);
+console.log(`layout: ${layout.varOffsets.size} named variables`);
+
+// ── direct-drive: JS instantiates the model blob and calls run() ──────────
+const { instance: mi } = await WebAssembly.instantiate(blob, {});
+const ME = mi.exports;
+const nSlots = ME.n_slots.value;
+const nChunks = ME.n_chunks.value;
+const resultsOffset = ME.results_offset.value;
+console.log(`blob self-describes: n_slots=${nSlots}, n_chunks=${nChunks}, results_offset=${resultsOffset}`);
+
+ME.run();
+const blobColumn = (col) => {
+  const d = new DataView(ME.memory.buffer);
+  const s = new Float64Array(nChunks);
+  for (let c = 0; c < nChunks; c++) s[c] = d.getFloat64(resultsOffset + (c * nSlots + col) * 8, true);
+  return s;
+};
+const blobCols = Array.from({ length: nSlots }, (_, c) => blobColumn(c));
+
+// ── VM golden via libsimlin ────────────────────────────────────────────────
+ep = outPtr();
+const sim = E.simlin_sim_new(model, 0, ep);
+checkErr(ep, 'sim_new');
+free(ep);
+ep = outPtr();
+E.simlin_sim_run_to_end(sim, ep);
+checkErr(ep, 'run_to_end');
+free(ep);
+
+const vmSeries = (name) => {
+  const np = cstr(name);
+  const rp = malloc(nChunks * 8);
+  const wp = outPtr();
+  const e = outPtr();
+  E.simlin_sim_get_series(sim, np, rp, nChunks, wp, e);
+  checkErr(e, `get_series(${name})`);
+  const written = u32(wp);
+  const s = f64Array(rp, written);
+  free(np);
+  free(rp);
+  free(wp);
+  free(e);
+  return s;
+};
+
+// ── correctness: match every VM variable's series to a blob column ─────────
+console.log('\ncorrectness (each VM variable matched to a blob column by value):');
+const vars = ['time', 'population', 'births', 'deaths', 'birth_rate', 'average_lifespan'];
+let worst = 0;
+for (const name of vars) {
+  let vm;
+  try {
+    vm = vmSeries(name);
+  } catch (e) {
+    console.log(`  ${name.padEnd(18)} (skipped: ${e.message})`);
+    continue;
+  }
+  let best = Infinity;
+  let bestCol = -1;
+  for (let col = 0; col < nSlots; col++) {
+    let m = 0;
+    for (let c = 0; c < vm.length; c++) m = Math.max(m, Math.abs(vm[c] - blobCols[col][c]));
+    if (m < best) {
+      best = m;
+      bestCol = col;
+    }
+  }
+  worst = Math.max(worst, best);
+  console.log(`  ${name.padEnd(18)} -> blob column ${bestCol}, max|Δ| = ${best.toExponential(2)}`);
+}
+console.log(`worst mismatch across variables: ${worst.toExponential(2)} -> ${worst < 1e-9 ? 'MATCH' : 'FAIL'}`);
+
+// ── by-name reads via the layout (no brute-force column matching) ──────────
+// The layout's name -> offset map lets a host read a variable's series directly,
+// striding the results region by `n_slots`. Verify it agrees with the VM.
+console.log('\nby-name reads via the returned layout:');
+let worstByName = 0;
+for (const name of vars) {
+  let vm;
+  try {
+    vm = vmSeries(name);
+  } catch {
+    continue;
+  }
+  const off = layout.varOffsets.get(name);
+  if (off === undefined) {
+    console.log(`  ${name.padEnd(18)} (not in layout)`);
+    continue;
+  }
+  const series = blobColumn(off);
+  let m = 0;
+  for (let c = 0; c < vm.length; c++) m = Math.max(m, Math.abs(vm[c] - series[c]));
+  worstByName = Math.max(worstByName, m);
+  console.log(`  ${name.padEnd(18)} -> layout offset ${off}, max|Δ| = ${m.toExponential(2)}`);
+}
+console.log(`worst by-name mismatch: ${worstByName.toExponential(2)} -> ${worstByName < 1e-9 ? 'MATCH' : 'FAIL'}`);
+
+const pop = vmSeries('population');
+console.log(`\npopulation: ${pop[0].toFixed(2)} (t=start) ... ${pop[pop.length - 1].toFixed(2)} (t=stop), ${pop.length} steps`);
+
+// ── timing: blob run() vs VM reset+run_to_end (both re-simulate from t0) ───
+console.log('\ntiming (each call re-runs the whole simulation):');
+const NB = 5000;
+let t = performance.now();
+for (let i = 0; i < NB; i++) ME.run();
+const blobMs = (performance.now() - t) / NB;
+
+const NV = 500;
+t = performance.now();
+for (let i = 0; i < NV; i++) {
+  const e1 = outPtr();
+  E.simlin_sim_reset(sim, e1);
+  checkErr(e1, 'reset');
+  free(e1);
+  const e2 = outPtr();
+  E.simlin_sim_run_to_end(sim, e2);
+  checkErr(e2, 'run_to_end');
+  free(e2);
+}
+const vmMs = (performance.now() - t) / NV;
+
+console.log(`  blob run():           ${blobMs.toFixed(5)} ms/run  (${NB} runs)`);
+console.log(`  VM reset+run_to_end:  ${vmMs.toFixed(5)} ms/run  (${NV} runs)`);
+console.log(`  blob is ${(vmMs / blobMs).toFixed(1)}x faster per re-simulation`);
diff --git a/src/libsimlin/CLAUDE.md b/src/libsimlin/CLAUDE.md
index c88158a47..b29edbd4b 100644
--- a/src/libsimlin/CLAUDE.md
+++ b/src/libsimlin/CLAUDE.md
@@ -40,6 +40,7 @@ All public FFI functions are prefixed with `simlin_` and declared `extern "C"`.
 - **`src/model.rs`** - Inspect model structure:
   - `simlin_model_{ref,unref}()`, `simlin_model_get_var_count()`, `simlin_model_get_var_names()`
   - `simlin_model_get_dependencies()`, `simlin_model_get_links()`, `simlin_model_get_equations()`
+  - `simlin_model_compile_to_wasm()` - Compile the model to a self-contained wasm module (engine `wasmgen` backend, an alternative to the VM for fast repeated re-simulation). Returns two malloc'd buffers, each freed with `simlin_free`: the wasm blob and a serialized `WasmLayout` (length-prefixed, little-endian: geometry `n_slots`/`n_chunks`/`results_offset` then a canonical-name -> slot-offset map a host strides the results region with). Works from a `SimlinModel`'s datamodel alone -- no `SimlinSim` required -- and stores a `SimlinError` (never panics) on any compile/codegen failure
 
 ### Serialization
 
@@ -81,6 +82,7 @@ Integration tests live in `tests/` (standard Rust layout), organized by FFI modu
 - **`tests/patch.rs`** - JSON patch application, error collection, unit warnings, XMILE patches
 - **`tests/incremental.rs`** - Incremental compilation path (patch-then-sim, snapshot isolation)
 - **`tests/analysis.rs`** - Causal analysis: incoming links, loop detection, loop scores
+- **`tests/wasm.rs`** - `simlin_model_compile_to_wasm`: validates and executes the returned blob under the DLR-FT interpreter (a libsimlin dev-dependency), parses the returned layout per its documented wire format, and checks the strided series against the VM via `simlin_sim_get_series`; also asserts a graceful `SimlinError` (no panic) for an unsupported model
 - **`tests/rendering.rs`** - SVG and PNG diagram rendering
 - **`tests/diagram.rs`** - Diagram layout sync
 - **`tests/errors.rs`** - Error formatting, error kind mapping, diagnostics
diff --git a/src/libsimlin/Cargo.toml b/src/libsimlin/Cargo.toml
index 1585091dd..51c664d54 100644
--- a/src/libsimlin/Cargo.toml
+++ b/src/libsimlin/Cargo.toml
@@ -31,6 +31,14 @@ anyhow = "1.0"
 mimalloc = { version = "0.1", optional = true }
 
 [dev-dependencies]
+# Pure-Rust no_std wasm interpreter (the same DLR-FT rev simlin-engine pins),
+# used by the `tests/wasm.rs` integration test to validate and execute the blob
+# `simlin_model_compile_to_wasm` returns and check it against the returned
+# layout. Dev-only: dev-dependencies are never built into the cdylib/staticlib
+# or the wasm32 bundle (which uses --no-default-features), so this cannot leak
+# into the shipped library.
+wasm-interpreter = { git = "https://github.com/DLR-FT/wasm-interpreter.git", rev = "64cedbba603edfd64cbb6b5a19f5fa34530bb03a" }
+checked = { git = "https://github.com/DLR-FT/wasm-interpreter.git", rev = "64cedbba603edfd64cbb6b5a19f5fa34530bb03a", features = ["linker", "interop"] }
 
 [package.metadata.wasm-pack.profile.release]
 wasm-opt = false
diff --git a/src/libsimlin/simlin.h b/src/libsimlin/simlin.h
index 6d8f24276..0623ec37e 100644
--- a/src/libsimlin/simlin.h
+++ b/src/libsimlin/simlin.h
@@ -349,6 +349,41 @@ void simlin_free(uint8_t *ptr);
 // - `s` must be a valid pointer returned by simlin API functions that return strings
 void simlin_free_string(char *s);
 
+// Compile the model to a self-contained WebAssembly module plus its layout.
+//
+// The emitted module exports its own linear `memory` and a `run` function
+// that executes the whole simulation in one call, writing step-major result
+// snapshots into a results region of its memory. This is an alternative to
+// the bytecode VM intended for fast, repeated re-simulation (e.g. interactive
+// parameter scrubbing): the host instantiates the module once and calls `run`
+// on every change.
+//
+// Two buffers are returned via the malloc-return convention, each freed
+// separately with `simlin_free`:
+// - `out_wasm`/`out_wasm_len`: the wasm blob.
+// - `out_layout`/`out_layout_len`: a self-describing, length-prefixed layout
+//   buffer (all integers little-endian): `n_slots` (u64), `n_chunks` (u64),
+//   `results_offset` (u64), `count` (u32), then per entry `name_len` (u32) +
+//   UTF-8 name + `offset` (u64). A host strides one variable's `n_chunks`-long
+//   series from the results region using `results_offset`, `n_slots`, and the
+//   variable's `offset` from this map.
+//
+// Works from the model's datamodel alone -- no `SimlinSim` is required. Any
+// compile or codegen failure stores a `SimlinError` (never panics across the
+// boundary) and leaves both output buffers NULL.
+//
+// # Safety
+// - `model` must be a valid pointer to a SimlinModel
+// - `out_wasm`, `out_wasm_len`, `out_layout`, and `out_layout_len` must be
+//   valid, non-null pointers
+// - `out_error` may be null
+void simlin_model_compile_to_wasm(SimlinModel *model,
+                                  uint8_t **out_wasm,
+                                  uintptr_t *out_wasm_len,
+                                  uint8_t **out_layout,
+                                  uintptr_t *out_layout_len,
+                                  SimlinError **out_error);
+
 // Increments the reference count of a model
 //
 // # Safety
diff --git a/src/libsimlin/src/model.rs b/src/libsimlin/src/model.rs
index 8c5472b2a..d830a7b3b 100644
--- a/src/libsimlin/src/model.rs
+++ b/src/libsimlin/src/model.rs
@@ -85,6 +85,118 @@ unsafe fn write_bytes_to_ffi_output(
     true
 }
 
+/// Compile the model to a self-contained WebAssembly module plus its layout.
+///
+/// The emitted module exports its own linear `memory` and a `run` function
+/// that executes the whole simulation in one call, writing step-major result
+/// snapshots into a results region of its memory. This is an alternative to
+/// the bytecode VM intended for fast, repeated re-simulation (e.g. interactive
+/// parameter scrubbing): the host instantiates the module once and calls `run`
+/// on every change.
+///
+/// Two buffers are returned via the malloc-return convention, each freed
+/// separately with `simlin_free`:
+/// - `out_wasm`/`out_wasm_len`: the wasm blob.
+/// - `out_layout`/`out_layout_len`: a self-describing, length-prefixed layout
+///   buffer (all integers little-endian): `n_slots` (u64), `n_chunks` (u64),
+///   `results_offset` (u64), `count` (u32), then per entry `name_len` (u32) +
+///   UTF-8 name + `offset` (u64). A host strides one variable's `n_chunks`-long
+///   series from the results region using `results_offset`, `n_slots`, and the
+///   variable's `offset` from this map.
+///
+/// Works from the model's datamodel alone -- no `SimlinSim` is required. Any
+/// compile or codegen failure stores a `SimlinError` (never panics across the
+/// boundary) and leaves both output buffers NULL.
+///
+/// # Safety
+/// - `model` must be a valid pointer to a SimlinModel
+/// - `out_wasm`, `out_wasm_len`, `out_layout`, and `out_layout_len` must be
+///   valid, non-null pointers
+/// - `out_error` may be null
+#[no_mangle]
+pub unsafe extern "C" fn simlin_model_compile_to_wasm(
+    model: *mut SimlinModel,
+    out_wasm: *mut *mut u8,
+    out_wasm_len: *mut usize,
+    out_layout: *mut *mut u8,
+    out_layout_len: *mut usize,
+    out_error: *mut *mut SimlinError,
+) {
+    clear_out_error(out_error);
+    if out_wasm.is_null()
+        || out_wasm_len.is_null()
+        || out_layout.is_null()
+        || out_layout_len.is_null()
+    {
+        store_error(
+            out_error,
+            SimlinError::new(SimlinErrorCode::Generic)
+                .with_message("output pointers must not be NULL"),
+        );
+        return;
+    }
+    *out_wasm = ptr::null_mut();
+    *out_wasm_len = 0;
+    *out_layout = ptr::null_mut();
+    *out_layout_len = 0;
+
+    let model_ref = match require_model(model) {
+        Ok(m) => m,
+        Err(err) => {
+            store_anyhow_error(out_error, err);
+            return;
+        }
+    };
+
+    // The compiled-model wasm is regenerated from the project's datamodel; it
+    // does not depend on the VM `SimState`, so this works even before a
+    // `SimlinSim` has been created for the model.
+    let project_ref = &*model_ref.project;
+    let datamodel = project_ref.datamodel.lock().unwrap();
+
+    let artifact = match engine::wasmgen::compile_datamodel_to_artifact(
+        &datamodel,
+        model_ref.model_name.as_str(),
+    ) {
+        Ok(artifact) => artifact,
+        Err(err) => {
+            store_error(
+                out_error,
+                SimlinError::new(SimlinErrorCode::Generic)
+                    .with_message(format!("wasm code generation failed: {err}")),
+            );
+            return;
+        }
+    };
+
+    let layout_bytes = artifact.layout.serialize();
+
+    // Write the wasm blob first. On its allocation failure `write_bytes_to_ffi_output`
+    // stores the error and returns false; bail before touching the layout buffer.
+    if !write_bytes_to_ffi_output(
+        &artifact.wasm,
+        out_wasm,
+        out_wasm_len,
+        out_error,
+        "model wasm",
+    ) {
+        return;
+    }
+    // If the layout allocation fails, free the wasm buffer already handed out so
+    // the caller is never left with one buffer set and the other NULL-but-leaked.
+    if !write_bytes_to_ffi_output(
+        &layout_bytes,
+        out_layout,
+        out_layout_len,
+        out_error,
+        "model wasm layout",
+    ) {
+        crate::memory::simlin_free(*out_wasm);
+        *out_wasm = ptr::null_mut();
+        *out_wasm_len = 0;
+    }
+}
+
 /// Find a model by name in a locked datamodel.
 pub(crate) fn find_model_in_datamodel<'a>(
     datamodel: &'a MutexGuard<'_, datamodel::Project>,
diff --git a/src/libsimlin/tests/wasm.rs b/src/libsimlin/tests/wasm.rs
new file mode 100644
index 000000000..21c1c3381
--- /dev/null
+++ b/src/libsimlin/tests/wasm.rs
@@ -0,0 +1,347 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+//! FFI integration tests for `simlin_model_compile_to_wasm`.
+//!
+//! These exercise the host-facing contract: the function returns a valid wasm
+//! blob plus a self-describing, length-prefixed layout buffer (both freeable
+//! with `simlin_free`), works from a `SimlinModel` alone (no `SimlinSim`), and
+//! surfaces a `SimlinError` -- never a panic -- for a model the wasm backend
+//! cannot compile. The blob is validated and executed under the same DLR-FT
+//! interpreter the engine's own wasmgen tests use, and the series a host would
+//! stride from the results region (using only the returned layout) is checked
+//! against the bytecode VM via `simlin_sim_get_series`.
+
+mod common;
+
+use std::ptr;
+
+use checked::Store;
+use common::open_project_from_datamodel;
+use simlin::*;
+use simlin_engine::test_common::TestProject;
+use wasm::validate;
+
+/// A small scalar stock-and-flow model: a constant inflow fills a stock. Used as
+/// the supported-model fixture (it runs through the wasm backend cleanly).
+fn simple_model() -> simlin_engine::datamodel::Project {
+    TestProject::new("ffi_wasm")
+        .with_sim_time(0.0, 10.0, 1.0)
+        .aux("inflow_rate", "2", None)
+        .stock("level", "0", &["inflow"], &[], None)
+        .flow("inflow", "inflow_rate", None)
+        .build_datamodel()
+}
+
+/// The host-side layout parse, mirroring the documented little-endian wire
+/// format (`n_slots`/`n_chunks`/`results_offset` u64, `count` u32, then per entry
+/// `name_len` u32 + UTF-8 name + `offset` u64). Returns the geometry and the
+/// name->offset map.
+struct ParsedLayout {
+    n_slots: usize,
+    n_chunks: usize,
+    results_offset: usize,
+    var_offsets: Vec<(String, usize)>,
+}
+
+fn parse_layout(bytes: &[u8]) -> ParsedLayout {
+    let mut pos = 0usize;
+    let read_u64 = |pos: &mut usize| -> u64 {
+        let v = u64::from_le_bytes(bytes[*pos..*pos + 8].try_into().unwrap());
+        *pos += 8;
+        v
+    };
+    let read_u32 = |pos: &mut usize| -> u32 {
+        let v = u32::from_le_bytes(bytes[*pos..*pos + 4].try_into().unwrap());
+        *pos += 4;
+        v
+    };
+    let n_slots = read_u64(&mut pos) as usize;
+    let n_chunks = read_u64(&mut pos) as usize;
+    let results_offset = read_u64(&mut pos) as usize;
+    let count = read_u32(&mut pos) as usize;
+    let mut var_offsets = Vec::with_capacity(count);
+    for _ in 0..count {
+        let name_len = read_u32(&mut pos) as usize;
+        let name = String::from_utf8(bytes[pos..pos + name_len].to_vec()).unwrap();
+        pos += name_len;
+        let offset = read_u64(&mut pos) as usize;
+        var_offsets.push((name, offset));
+    }
+    assert_eq!(pos, bytes.len(), "layout buffer had trailing bytes");
+    ParsedLayout {
+        n_slots,
+        n_chunks,
+        results_offset,
+        var_offsets,
+    }
+}
+
+/// AC6.1: `simlin_model_compile_to_wasm` returns a valid wasm blob plus the
+/// name->offset layout via the malloc-return convention; both buffers free with
+/// `simlin_free`; it works from a `SimlinModel` with no `SimlinSim`.
+#[test]
+fn compile_to_wasm_returns_blob_and_layout() {
+    let datamodel = simple_model();
+    unsafe {
+        let project = open_project_from_datamodel(&datamodel);
+        let model_name = std::ffi::CString::new("main").unwrap();
+        let mut err: *mut SimlinError = ptr::null_mut();
+        // No SimlinSim is ever created -- the model handle alone must suffice.
+        let model = simlin_project_get_model(project, model_name.as_ptr(), &mut err);
+        assert!(err.is_null(), "get_model should not error");
+        assert!(!model.is_null(), "model handle must be non-null");
+
+        let mut out_wasm: *mut u8 = ptr::null_mut();
+        let mut out_wasm_len: usize = 0;
+        let mut out_layout: *mut u8 = ptr::null_mut();
+        let mut out_layout_len: usize = 0;
+        let mut err: *mut SimlinError = ptr::null_mut();
+        simlin_model_compile_to_wasm(
+            model,
+            &mut out_wasm,
+            &mut out_wasm_len,
+            &mut out_layout,
+            &mut out_layout_len,
+            &mut err,
+        );
+        assert!(
+            err.is_null(),
+            "compile_to_wasm should not error on a supported model"
+        );
+        assert!(
+            !out_wasm.is_null() && out_wasm_len > 0,
+            "wasm blob must be non-empty"
+        );
+        assert!(
+            !out_layout.is_null() && out_layout_len > 0,
+            "layout buffer must be non-empty"
+        );
+
+        // The wasm blob validates under the interpreter.
+        let wasm = std::slice::from_raw_parts(out_wasm, out_wasm_len).to_vec();
+        validate(&wasm).expect("returned blob must validate");
+
+        // The layout deserializes to the expected geometry + name->offset map.
+        let layout_bytes = std::slice::from_raw_parts(out_layout, out_layout_len).to_vec();
+        let layout = parse_layout(&layout_bytes);
+        assert!(
+            layout.n_slots >= 4,
+            "scalar model has at least the 4 reserved slots"
+        );
+        // dt=1 over [0,10] -> 11 saved samples.
+        assert_eq!(layout.n_chunks, 11, "n_chunks should match the sim specs");
+        // The results region sits two chunks past the start of memory (curr+next).
+        assert_eq!(
+            layout.results_offset,
+            2 * layout.n_slots * 8,
+            "results_offset = 2 chunks (curr + next) past byte 0"
+        );
+        for name in ["level", "inflow", "inflow_rate"] {
+            assert!(
+                layout.var_offsets.iter().any(|(n, _)| n == name),
+                "{name} must appear in the layout name->offset map"
+            );
+        }
+        // Offsets are within a chunk.
+        for (name, off) in &layout.var_offsets {
+            assert!(
+                *off < layout.n_slots,
+                "{name} offset {off} must be < n_slots"
+            );
+        }
+
+        // Run the blob and stride `level`'s series using only the layout, then
+        // check it against the VM's series.
+        let level_off = layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == "level")
+            .map(|(_, o)| *o)
+            .unwrap();
+        let blob_level = run_and_stride(&wasm, &layout, level_off);
+        // level integrates by 2/step: 0, 2, 4, ..., 20.
+        assert!((blob_level[0]).abs() < 1e-9, "level starts at 0");
+        assert!(
+            (blob_level[blob_level.len() - 1] - 20.0).abs() < 1e-9,
+            "level reaches 20 by the last step, got {}",
+            blob_level[blob_level.len() - 1]
+        );
+        let vm_level = vm_series(project, &model_name, "level", layout.n_chunks);
+        assert_eq!(blob_level.len(), vm_level.len());
+        for (c, (&b, &v)) in blob_level.iter().zip(vm_level.iter()).enumerate() {
+            assert!((b - v).abs() < 1e-9, "level chunk {c}: blob {b} != vm {v}");
+        }
+
+        // Both buffers free with simlin_free without leaking or double-free.
+        simlin_free(out_wasm);
+        simlin_free(out_layout);
+
+        simlin_model_unref(model);
+        simlin_project_unref(project);
+    }
+}
+
+/// AC6.2: a model the wasm backend cannot compile surfaces a `SimlinError`
+/// (out_error is set, both buffers stay NULL), never a panic across the FFI
+/// boundary. `SUM(source[lo:hi])` with variable bounds lowers to a runtime view
+/// range the fully-unrolled emitter cannot express.
+#[test]
+fn compile_to_wasm_unsupported_model_surfaces_error() {
+    let datamodel = TestProject::new("ffi_wasm_unsupported")
+        .with_sim_time(0.0, 5.0, 1.0)
+        .indexed_dimension("A", 5)
+        .array_aux("source[A]", "A")
+        .scalar_aux("lo", "2")
+        .scalar_aux("hi", "4")
+        .scalar_aux("total", "SUM(source[lo:hi])")
+        .build_datamodel();
+    unsafe {
+        let project = open_project_from_datamodel(&datamodel);
+        let model_name = std::ffi::CString::new("main").unwrap();
+        let mut err: *mut SimlinError = ptr::null_mut();
+        let model = simlin_project_get_model(project, model_name.as_ptr(), &mut err);
+        assert!(err.is_null());
+        assert!(!model.is_null());
+
+        let mut out_wasm: *mut u8 = ptr::null_mut();
+        let mut out_wasm_len: usize = 0;
+        let mut out_layout: *mut u8 = ptr::null_mut();
+        let mut out_layout_len: usize = 0;
+        let mut err: *mut SimlinError = ptr::null_mut();
+        simlin_model_compile_to_wasm(
+            model,
+            &mut out_wasm,
+            &mut out_wasm_len,
+            &mut out_layout,
+            &mut out_layout_len,
+            &mut err,
+        );
+
+        assert!(!err.is_null(), "an unsupported model must set out_error");
+        // The message names the unsupported construct (no panic, a clean error).
+        let msg_ptr = simlin_error_get_message(err);
+        assert!(!msg_ptr.is_null(), "the error must carry a message");
+        let msg = std::ffi::CStr::from_ptr(msg_ptr).to_str().unwrap();
+        assert!(
+            msg.contains("ViewRangeDynamic") || msg.contains("code generation failed"),
+            "error message should describe the codegen failure, got: {msg}"
+        );
+        // Both output buffers stay NULL on failure.
+        assert!(
+            out_wasm.is_null() && out_wasm_len == 0,
+            "wasm buffer stays NULL on error"
+        );
+        assert!(
+            out_layout.is_null() && out_layout_len == 0,
+            "layout buffer stays NULL on error"
+        );
+
+        simlin_error_free(err);
+        simlin_model_unref(model);
+        simlin_project_unref(project);
+    }
+}
+
+/// NULL output pointers are rejected with an error rather than a crash.
+#[test]
+fn compile_to_wasm_null_outputs_error() {
+    let datamodel = simple_model();
+    unsafe {
+        let project = open_project_from_datamodel(&datamodel);
+        let model_name = std::ffi::CString::new("main").unwrap();
+        let mut err: *mut SimlinError = ptr::null_mut();
+        let model = simlin_project_get_model(project, model_name.as_ptr(), &mut err);
+        assert!(!model.is_null());
+
+        let mut out_wasm: *mut u8 = ptr::null_mut();
+        let mut out_wasm_len: usize = 0;
+        let mut out_layout_len: usize = 0;
+        let mut err: *mut SimlinError = ptr::null_mut();
+        // A NULL out_layout pointer must be rejected.
+        simlin_model_compile_to_wasm(
+            model,
+            &mut out_wasm,
+            &mut out_wasm_len,
+            ptr::null_mut(),
+            &mut out_layout_len,
+            &mut err,
+        );
+        assert!(!err.is_null(), "a NULL output pointer must set out_error");
+        simlin_error_free(err);
+
+        simlin_model_unref(model);
+        simlin_project_unref(project);
+    }
+}
+
+/// Instantiate `wasm` under the interpreter, invoke `run`, and stride out the
+/// `n_chunks`-long series for the variable at `off` (using only the layout).
+fn run_and_stride(wasm: &[u8], layout: &ParsedLayout, off: usize) -> Vec<f64> {
+    let info = validate(wasm).expect("validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let run = store
+        .instance_export(inst, "run")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store.invoke_simple_typed::<(), ()>(run, ()).expect("run");
+    let mem = store
+        .instance_export(inst, "memory")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    let base = layout.results_offset;
+    let n_slots = layout.n_slots;
+    store.mem_access_mut_slice(mem, |bytes| {
+        (0..layout.n_chunks)
+            .map(|c| {
+                let a = base + (c * n_slots + off) * 8;
+                f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
+            })
+            .collect()
+    })
+}
+
+/// The VM's series for `name` via `simlin_sim_new` + `simlin_sim_get_series`.
+unsafe fn vm_series(
+    project: *mut SimlinProject,
+    model_name: &std::ffi::CStr,
+    name: &str,
+    n_chunks: usize,
+) -> Vec<f64> {
+    let mut err: *mut SimlinError = ptr::null_mut();
+    let model = simlin_project_get_model(project, model_name.as_ptr(), &mut err);
+    assert!(err.is_null());
+    let sim = simlin_sim_new(model, false, &mut err);
+    assert!(
+        err.is_null(),
+        "sim_new should succeed for a supported model"
+    );
+    simlin_sim_run_to_end(sim, &mut err);
+    assert!(err.is_null(), "run_to_end should succeed");
+
+    let name_c = std::ffi::CString::new(name).unwrap();
+    let mut results = vec![0.0f64; n_chunks];
+    let mut written: usize = 0;
+    let mut err: *mut SimlinError = ptr::null_mut();
+    simlin_sim_get_series(
+        sim,
+        name_c.as_ptr(),
+        results.as_mut_ptr(),
+        n_chunks,
+        &mut written,
+        &mut err,
+    );
+    assert!(err.is_null(), "get_series should succeed");
+    results.truncate(written);
+
+    simlin_sim_unref(sim);
+    simlin_model_unref(model);
+    results
+}
diff --git a/src/simlin-engine/CLAUDE.md b/src/simlin-engine/CLAUDE.md
index a51188a93..ce60d3e82 100644
--- a/src/simlin-engine/CLAUDE.md
+++ b/src/simlin-engine/CLAUDE.md
@@ -2,8 +2,6 @@
 
 Core simulation engine for system dynamics models. Compiles, type-checks, unit-checks, and simulates SD models. See the root `CLAUDE.md` for full development guidelines; this file maps where functionality lives.
 
-**Last updated: 2026-05-21 (unit-inference robustness, GH #614 + the `units.rs:263` TODO: the unit subsystem no longer fails all-or-nothing. `units_infer::infer` returns `InferenceResult { resolved, conflicts }` -- a dimensional conflict no longer discards the units already resolved (a contradiction is confined to its connected component, since substitution only flows along shared metavariables), and `find_constraint_mismatches` collects every residual contradiction rather than the first; `db_units::check_model_units` keeps the resolved units (so the rest of the model is still checked) and surfaces conflicts as ONE umbrella warning, not one-per-conflict. `gen_constraints` is now total (returns `Units`, not a vestigial always-`Ok` `UnitResult`; removed the `.unwrap()` panic landmine and the `None`-arm gap that dropped declared-units propagation for equation-less variables). `units::Context::new`/`new_with_builtins` return `(Context, Vec<errors>)` instead of discarding the whole built context on the first duplicate/conflicting declaration (an empty context lost project-wide alias normalization). RANK results are dimensionless in both `units_infer` and `units_check` (an ordinal index, not the ranked array's units). `ModelStage0`/`ModelStage1` gain `is_macro`: a Vensim macro's body-variable units may name the formal parameters (`~ xfrom` inside RAMP FROM TO -- a polymorphic unit, not a base unit), so inference skips declared-units constraints for macro bodies; without this, keeping the resolved map re-floods C-LEARN (the `xfrom`/`xto` leak), with it C-LEARN holds its documented 14-diagnostic residual. New tests in `units_infer.rs` (partial-results-survive-conflict, declared-units-without-equation propagation, macro-polymorphic-units, RANK dimensionless), `units.rs` (partial-context), `unit_checking_test.rs` (RANK-in-checking).) Earlier 2026-05-21 (#606: a standalone lookup-only variable -- a graphical-function holder with no functional input -- is now a non-value-bearing **static table** (`Variable::Var::is_table_only` / `db::source_var_is_table_only`), NOT a runtime variable: excluded from the runlist and the saved output (produces no series), its data reached only via `LOOKUP(table, x)` call sites. A table reference is kept off the data-flow dependency graph by a dedicated `builtins::BuiltinContents::LookupTable` walk variant; `referenced_tables` on `VariableDeps`/`ImplicitVarDeps` re-supplies the fragment compiler's metadata + tables map. A bare reference (no argument) is a compile error (`ErrorCode::LookupReferencedWithoutArgument`, emitted in `db_var_fragment::lower_var_fragment`). The MDL importer emits the canonical empty-equation form; the `"0+0"` `LOOKUP_SENTINEL` is still ACCEPTED on read (now produced only for empty-RHS vars). The shared lookup-only predicate moved to `src/variable.rs`. Retired the `gf(Time)` lowering (`lookup_only_index_expr`, `LookupOnlyLayout`). C-LEARN `EXPECTED_VDF_RESIDUAL` shrank 13->4. SUPERSEDES the #590 `gf(Time)` primitive (1) below.) Earlier (2026-05-20): C-LEARN residual closure (#590/#591), as five general Vensim import/simulation primitives, not model-specific patches: (1) a standalone lookup-only variable -- a graphical-function holder with no functional input -- lowers uniformly to `gf(Time)` across scalar/A2A/arrayed shapes (`src/compiler/mod.rs`: `var_is_lookup_only`/`is_lookup_only`/`lookup_only_index_expr` + `LookupOnlyLayout { PerElement, Shared }`); (2) a genuine passthrough macro `:MACRO: INIT(x) = INITIAL(x)` collapses to the builtin opcode at the call site (`module_functions.rs`: `classify_passthrough` + a `passthrough: Option<PassthroughBuiltin>` field on `ModuleFunctionDescriptor`, classified at `MacroRegistry::build`), instead of expanding a buggy per-element synthetic module; (3) the import-time XMILE formatter no longer linearizes a shadowed `RAMP FROM TO` macro -- `xmile_compat.rs::format_call_ctx` carries a macro-shadowing audit and the `ramp from to` restructuring arm was removed so the call survives as `RAMP_FROM_TO(...)` and resolves through the macro path; (4) the simulation **initials runlist is now deterministic** (`db_dep_graph.rs` sorts the init set before `topo_sort_str`, GH #595) and the dt stock-submodel-output chain-break applies to ALL readers (parity with the legacy `model.rs::module_output_deps` gate); (5) the VDF reader re-binds a **standalone** graphical-function descriptor to its forward-link output OT (`record_results.rs::standalone_descriptor_rebinds`). New `#[cfg(test)] lookup_only_tests.rs`; the C-LEARN `EXPECTED_VDF_RESIDUAL` carve-out (`tests/simulate.rs`) is now an exact, taxonomy-attributed remainder pinned by `clearn_residual_exactness`. simlin-cli now resolves `GET DIRECT *` external data via a `FilesystemDataProvider` -- see its CLAUDE.md.) Earlier: Element-level cycle resolution + genuine-Vensim VECTOR ELM MAP/SORT ORDER + `:NA:` sentinel, the work that made C-LEARN compile via the incremental path, run to FINAL TIME, and match genuine Vensim (`Ref.vdf`) within the 1% cross-simulator tolerance on the matched floor. The whole-variable `model_dependency_graph` cycle gate now refines a recurrence SCC to an element-acyclic verdict over a cross-member-comparable symbolic `SymVarRef` element graph (`db_dep_graph.rs`: `resolve_recurrence_sccs`/`refine_scc_to_element_verdict`/`symbolic_phase_element_order`, GH #575); a resolved SCC's per-element symbolic segments are interleaved into one combined fragment along the SCC's `element_order` and injected at `assemble_module` (`db.rs`: `combine_scc_fragment`/`var_phase_symbolic_fragment_prod`). Per-variable lowering moved to a new sibling module `db_var_fragment.rs` (`lower_var_fragment`). `crate::float::NA` is the finite Vensim `:NA:` sentinel (`-2^109`, NOT IEEE NaN); both `:NA:` paths route to it. New top-level VM-adjacent modules `vm_vector_sort_order.rs` (arrayed VECTOR SORT ORDER, per-iterated-slice 0-based ranks, #585) and `vm_vector_elm_map.rs` (base+full-source, OOB→NaN, no modulo). New `Opcode::LookupArray` (per-element arrayed-GF apply → array view, #580); `src/compiler/symbolic.rs` gains cross-fragment GF de-duplication (`GfDedup`, #582) and `TempStrategy { Recycle, Sum }`. Per-element graphical-function tables now lay out by element-name → declared dimension index (`variable.rs::reorder_arrayed_element_tables`, `db.rs::extract_tables_from_source_var`), not `Equation::Arrayed` Vec position. New tests: `db_dep_graph_tests.rs`, `db_combined_fragment_tests.rs`, `per_element_gf_tests.rs`; `tests/simulate.rs` `simulates_clearn` is un-stubbed (`#[ignore]` for runtime only) with a hardened `ensure_vdf_results` comparator + `EXPECTED_VDF_RESIDUAL` carve-out.) Earlier: Vensim macro support, Phases 1-7 complete. `:MACRO:`/`<macro>` definitions import as macro-marked `datamodel::Model`s (`Model.macro_spec: Option<MacroSpec>`, persisted through protobuf/JSON/schema); single-output macros inline through `BuiltinVisitor`, multi-output (`:`-list) ones materialize at import. New top-level modules: `module_functions.rs` (the unified `ModuleFunctionDescriptor`/`MacroRegistry` resolver+validator for stdlib functions *and* macros, the shared `is_renamed_*` collision predicates) and `db_macro_registry.rs` (the `project_macro_registry` salsa query + sync-time `macro_registry_build_error`); `SourceProject` gains `macro_registry_build_error`, `SourceModel` gains `macro_spec`; `ErrorCode::DuplicateMacroName`; new `tests/metasd_macros.rs` (gated on `file_io`). LTM arrays hardening, Phases 1-8 complete. Phase 7: #502 per-element graphical-function static link polarity -- when an arrayed source feeds an arrayed graphical-function target, `lookup_table_polarity` folds the per-element `tables` list on `Variable::Var` into one link polarity, falling back to `Unknown` for the multi-dim case; #492 the GF strict-monotonicity check uses a y-range-relative epsilon (`max(EPSILON, range_rel * (y_max - y_min))`) so numeric-import noise no longer flips a monotone lookup table to `Unknown`. Phase 6: #483 analytic STDDEV ceteris-paribus partial -- `generate_nonlinear_partial` builds the unrolled population-variance `sqrt` formula for STDDEV (divisor `N`, matching `vm.rs::Opcode::ArrayStddev`) instead of the delta-ratio stand-in; RANK keeps the delta-ratio (an order statistic, unreachable via real models since it returns an array) with a documented justification, pinned by `test_generate_rank_keeps_delta_ratio`. Phase 5: #515 budgeted cross-element-through-aggregate loop recovery -- `recover_cross_agg_loops` drops the old `MAX_AGG_PETALS = 8` hard drop for a deterministic petal priority + a threaded `agg_loop_budget` loop-count budget (`MAX_CROSS_AGG_LOOPS = 256`, `#[cfg(test)]`-overridable via `AggLoopBudgetGuard`; `MAX_AGG_PETALS` survives as a soft per-agg petal cap), surfaces truncation on `LtmVariablesResult.agg_recovery_truncated` + a `Warning`, and enumerates each disjoint petal subset's distinct *cyclic orderings* (`cyclic_orderings(m)` -- (m-1)!/2 for m≥3, mirror reversals skipped, via Heap's algorithm) instead of one ordering per subset. Phase 4 (2026-05-12): #514 sliced-reducer hoisting -- `AggNode.read_slice`, read-slice-driven element graph / link scores, dynamic-index carve-out reclassified as `DynamicIndex`; arrayed synthetic aggs route through agg-half link-score emitters with subscripted agg names and a subscripted Δsource denominator in the diagonal case (strict-prefix broadcast over-subscribes -- GH #528); mapped-dimension sliced reducers stay conservative; a scalar feeder of a hoisted reducer emits a bare element-graph node. Phases 1-3: the `model_ltm_reference_sites` classification IR (`db_ltm_ir.rs`), the consolidated `reducer_kind`/`ReducerKind` table in `ltm_agg.rs`, element-level A2A `Loop::stocks` + per-slot `loop_partitions` (#487), iterated-dimension subscripts ⇒ `Bare` (#511), disjoint-dim arrayed→arrayed per-source-element link scores + the unscoreable-edge `Warning` (#510).)**
-
 **Maintenance note**: Keep this file up to date when adding, removing, or reorganizing modules.
 
 ## Compilation pipeline
@@ -28,6 +26,16 @@ Equation text flows through these stages in order:
    - **`src/vm_vector_sort_order.rs`** - Genuine-Vensim VECTOR SORT ORDER. Ranks WITHIN each currently-iterated source slice (the innermost/last-declared dim is the sorted axis; outer dims select independent rows), 0-based: result position `j` of a row holds the 0-based source index *within that row* of its `j`-th element in sorted order (`direction == 1` ascending, else descending; stable ties). A 1-D view is the degenerate single-row case (in-row ranks == whole-view ranks). The prior whole-flattened-view absolute-index behavior (GH #585) made a multi-row source feed out-of-range flat indices into a downstream single-column ELM MAP; ground truth is real Vensim DSS `/test/test-models/tests/vector_order/output.tab` (ranks include `0`, impossible for a 1-based permutation). RANK is a distinct, correctly 1-based opcode.
    - **`src/vm_vector_elm_map.rs`** - Genuine-Vensim VECTOR ELM MAP: result element `i` = `source[base_i + round(offset[i])]` over the source variable's FULL row-major contiguous storage, where `base_i` is the flat position arg-1's element reference establishes and the offset steps the source's innermost dim (stride 1). An offset+base outside `[0, full_source_len)`, or a NaN offset, yields genuine IEEE NaN (the out-of-range result Vensim documents as `:NA:`; this is the absorbing NaN, NOT the finite `crate::float::NA` sentinel). NO modulo / NO wraparound (the bug the prior sliced-view-no-base implementation had).
 8. **`src/alloc.rs`** - Allocation helpers for VM priority allocation: `allocate_available()` (bisection-based priority allocation), `alloc_curve()` (per-requester allocation curves for 6 profile types), `normal_cdf()`/`erfc_approx()`.
+9. **`src/wasmgen/`** - WebAssembly code-generation backend: an alternative execution path to the bytecode VM (item 7) that lowers the salsa-compiled `CompiledSimulation` to one self-contained wasm module (no host imports), mirroring the VM opcode-for-opcode. Intended for fast repeated re-simulation (e.g. interactive parameter scrubbing): a host instantiates the blob once and calls its exported `run` on every change. **The bytecode VM remains the correctness oracle** -- every emitted module is executed under the pure-Rust DLR-FT `wasm-interpreter` in tests and compared against `Vm::run_to_end`. Entry point `compile_simulation(&CompiledSimulation) -> WasmArtifact { wasm: Vec<u8>, layout: WasmLayout }`; the blob exports `memory`, `run`, the geometry globals `n_slots`/`n_chunks`/`results_offset` (step-major results), and `set_value`/`reset`/`clear_values` (constant-override semantics matching the VM, sourced from a mutable const-override region indexed by absolute slot). `WasmLayout` (canonical-name -> slot offset) lets a host read one variable's series by striding the results region. Coverage is the full core-simulation surface: every scalar opcode + builtin (transcendentals open-coded as wasm helpers, so the blob needs no math imports), arrays (subscripts, iteration, reducers, dynamic subscripts with OOB->NaN), graphical-function lookups (scalar + per-element `LookupArray`), the vector ops (`VectorSelect`/`VectorElmMap`/`VectorSortOrder`/`Rank`) and market-clearing allocation (`AllocateAvailable`/`AllocateByPriority`), Euler/RK2/RK4 integration, `PREVIOUS`/`INIT`, and nested modules (one set of initials/flows/stocks functions per `(model, input_set)` instance, addressed by a runtime `module_off`). Out of scope: LTM (VM-only); a true-runtime-range subscript (`ViewRangeDynamic`, GH #612) returns `WasmGenError::Unsupported`; array unrolling is bounded by `MAX_UNROLL_UNITS` (65,536 elements/function), above which a model cleanly returns `Unsupported` and the caller falls back to the VM. Files:
+   - **`mod.rs`** - the `WasmGenError` error type + module re-exports.
+   - **`module.rs`** - `compile_simulation`/`compile_datamodel_to_*`: whole-module assembly -- memory layout, the per-instance initials/flows/stocks functions + the `run` driver (Euler/RK2/RK4 loops), the GF/temp/snapshot/const-override regions, the `set_value`/`reset` exports, and `WasmLayout` (de)serialization.
+   - **`lower.rs`** - the per-opcode emitter (`emit_bytecode` over the un-fused + peephole opcode set), the `HelperFns` registry, and the `EmitState` unroll budget. Its `#[cfg(test)]` tests live in the sibling **`lower_tests.rs`** (split out for the per-file line cap).
+   - **`views.rs`** - the compile-time `ViewDesc` view-descriptor stack + element-address arithmetic mirroring `RuntimeView::flat_offset`/`offset_for_iter_index`.
+   - **`math.rs`** - open-coded transcendental wasm helpers (`exp`/`ln`/`sin`/`cos`/`tan`/`atan`/`asin`/`acos`/`log10`/`pow`), each validated against Rust `f64`.
+   - **`lookup.rs`** - the three GF lookup helpers (`lookup_interp`/`lookup_forward`/`lookup_backward`) reproducing the VM lookup functions.
+   - **`vector.rs`** - the vector-op emitters (`VectorSelect`/`VectorElmMap`/`VectorSortOrder`/`Rank`/`LookupArray`) + a runtime-loop NaN-as-Equal stable sort.
+   - **`alloc.rs`** - the allocation emitters (`erfc_approx`/`normal_cdf`/`alloc_curve` + the runtime-loop `allocate_available` bisection) ported bit-faithfully from `crate::alloc`.
+   The libsimlin FFI `simlin_model_compile_to_wasm` returns the blob + serialized `WasmLayout` (see `src/libsimlin/CLAUDE.md`). Parity tests: the default `tests/simulate.rs` + `tests/simulate_systems.rs` corpora run every VM-simulated model through the wasm backend via an inline hook (an `Unsupported` for a core-simulation model is a hard failure -- AC3.2); the heavy twins (`simulates_clearn_wasm` vs `Ref.vdf`, `simulates_wrld3_03_wasm` vs the VM) are `#[ignore]`d, run via `cargo test --release -- --ignored <name>`.
 
 ## Data model and project structure
 
diff --git a/src/simlin-engine/Cargo.toml b/src/simlin-engine/Cargo.toml
index f5109fb0e..c02081eed 100644
--- a/src/simlin-engine/Cargo.toml
+++ b/src/simlin-engine/Cargo.toml
@@ -43,6 +43,12 @@ xmutil = { version = "1", path = "../xmutil", optional = true }
 bumpalo = "3"
 salsa = "0.26"
 
+# WebAssembly code-generation backend (compiles models to wasm as an
+# alternative to the bytecode VM). no_std + alloc only, single transitive
+# dependency (leb128fmt), and builds cleanly to wasm32-unknown-unknown so it
+# is available inside the libsimlin wasm bundle.
+wasm-encoder = { version = "0.244", default-features = false }
+
 rand = { version = "0.9", default-features = false, features = ["std_rng"] }
 
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
@@ -60,6 +66,14 @@ ed25519-dalek = "2"
 ssh-key = "0.6"
 tempfile = "3"
 
+# Pure-Rust no_std wasm interpreter used as a correctness oracle: every wasm
+# module the wasmgen backend produces is executed here and checked against the
+# bytecode VM. Git-only (not yet published to crates.io); pinned to a commit.
+# The host `Store` API lives in the `checked` workspace member, not the `wasm`
+# lib crate.
+wasm-interpreter = { git = "https://github.com/DLR-FT/wasm-interpreter.git", rev = "64cedbba603edfd64cbb6b5a19f5fa34530bb03a" }
+checked = { git = "https://github.com/DLR-FT/wasm-interpreter.git", rev = "64cedbba603edfd64cbb6b5a19f5fa34530bb03a", features = ["linker", "interop"] }
+
 [[test]]
 name = "simulate"
 required-features = ["file_io"]
diff --git a/src/simlin-engine/src/lib.rs b/src/simlin-engine/src/lib.rs
index 2d08e1187..2a3053574 100644
--- a/src/simlin-engine/src/lib.rs
+++ b/src/simlin-engine/src/lib.rs
@@ -116,6 +116,12 @@ mod vm;
 mod vm_profile;
 mod vm_vector_elm_map;
 mod vm_vector_sort_order;
+// WebAssembly code-generation backend: lowers the salsa-compiled
+// `CompiledSimulation` bytecode (the same value `Vm::new` consumes) to a
+// self-contained wasm module, as an alternative execution path to the bytecode
+// VM. Validated in tests by executing the emitted module under a pure-Rust wasm
+// interpreter and comparing against the VM.
+pub mod wasmgen;
 pub mod xmile;
 
 pub use self::common::{Error, ErrorCode, ErrorKind, Result, canonicalize};
diff --git a/src/simlin-engine/src/test_common.rs b/src/simlin-engine/src/test_common.rs
index ddb7fd693..ac3ba3958 100644
--- a/src/simlin-engine/src/test_common.rs
+++ b/src/simlin-engine/src/test_common.rs
@@ -183,6 +183,28 @@ impl TestProject {
         self
     }
 
+    /// Add an auxiliary variable backed by a graphical function. The `equation`
+    /// is the lookup input expression; `gf` is the table the value is looked up
+    /// in. With a real input expression this lowers to `LOOKUP(self, input)`.
+    pub fn aux_with_gf(
+        mut self,
+        name: &str,
+        equation: &str,
+        gf: datamodel::GraphicalFunction,
+    ) -> Self {
+        self.variables.push(Variable::Aux(datamodel::Aux {
+            ident: name.to_string(),
+            equation: Equation::Scalar(equation.to_string()),
+            documentation: String::new(),
+            units: None,
+            gf: Some(gf),
+            ai_state: None,
+            uid: None,
+            compat: datamodel::Compat::default(),
+        }));
+        self
+    }
+
     /// Add a flow variable
     pub fn flow(mut self, name: &str, equation: &str, units: Option<&str>) -> Self {
         self.variables.push(Variable::Flow(datamodel::Flow {
diff --git a/src/simlin-engine/src/vm.rs b/src/simlin-engine/src/vm.rs
index 5b3b40524..0df6afade 100644
--- a/src/simlin-engine/src/vm.rs
+++ b/src/simlin-engine/src/vm.rs
@@ -167,6 +167,19 @@ impl CompiledSimulation {
     pub fn is_constant_offset(&self, off: usize) -> bool {
         self.cached_constant_info.contains_key(&off)
     }
+
+    /// The full set of overridable constant offsets (absolute data-buffer
+    /// offsets), i.e. every offset for which [`is_constant_offset`] is true.
+    /// These are the offsets with an `AssignConstCurr` in some module's flows
+    /// phase (see `collect_constant_info`); `set_value`/`set_value_by_offset`
+    /// accept exactly these. The wasm backend reads this to size and initialize
+    /// its constants-override region so a blob's `set_value` accepts the same
+    /// set the VM does.
+    ///
+    /// [`is_constant_offset`]: Self::is_constant_offset
+    pub(crate) fn constant_offsets(&self) -> impl Iterator<Item = usize> + '_ {
+        self.cached_constant_info.keys().copied()
+    }
 }
 
 /// One unique compiled module (a distinct `(model_name, input_set)`), holding
@@ -204,7 +217,7 @@ struct CompiledSlicedSimulation {
 }
 
 #[cfg_attr(feature = "debug-derive", derive(Debug))]
-#[derive(Clone, Copy, PartialEq, Eq)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash)]
 pub(crate) enum StepPart {
     Initials,
     Flows,
@@ -3052,8 +3065,11 @@ pub(crate) fn pulse(time: f64, dt: f64, volume: f64, first_pulse: f64, interval:
     0.0
 }
 
+// `pub(crate)` so the wasm backend's lookup-helper tests can compare the
+// emitted helpers directly against the VM functions they reproduce
+// (`wasmgen::lookup`), the byte-faithful oracle for `vm.rs:3055-3186`.
 #[inline(never)]
-fn lookup(table: &[(f64, f64)], index: f64) -> f64 {
+pub(crate) fn lookup(table: &[(f64, f64)], index: f64) -> f64 {
     if table.is_empty() {
         return f64::NAN;
     }
@@ -3105,7 +3121,7 @@ fn lookup(table: &[(f64, f64)], index: f64) -> f64 {
 /// If x is beyond the last point, returns the y-value of the last point.
 /// This is a "sample and hold" interpolation where we look forward.
 #[inline(never)]
-fn lookup_forward(table: &[(f64, f64)], index: f64) -> f64 {
+pub(crate) fn lookup_forward(table: &[(f64, f64)], index: f64) -> f64 {
     if table.is_empty() {
         return f64::NAN;
     }
@@ -3147,7 +3163,7 @@ fn lookup_forward(table: &[(f64, f64)], index: f64) -> f64 {
 ///
 /// For duplicate x-values, returns the y of the LAST point with that x.
 #[inline(never)]
-fn lookup_backward(table: &[(f64, f64)], index: f64) -> f64 {
+pub(crate) fn lookup_backward(table: &[(f64, f64)], index: f64) -> f64 {
     if table.is_empty() {
         return f64::NAN;
     }
diff --git a/src/simlin-engine/src/wasmgen/alloc.rs b/src/simlin-engine/src/wasmgen/alloc.rs
new file mode 100644
index 000000000..6ea5dd673
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/alloc.rs
@@ -0,0 +1,1839 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+// pattern: Functional Core
+// Pure transformation: each emitter builds the body of one self-contained wasm
+// helper function mirroring the matching `crate::alloc` function. No I/O; the
+// only side effect is in `#[cfg(test)]` (which lives in `lower_tests.rs`
+// alongside the rest of the lowering harness).
+
+//! Lowering of the bytecode VM's market-clearing allocators
+//! (`AllocateAvailable`/`AllocateByPriority`) to WebAssembly (Phase 6).
+//!
+//! These opcodes route through four self-contained wasm helper functions that
+//! port `crate::alloc` *bit-faithfully* -- exact constants, exact Horner
+//! evaluation order, exact branch structure, and the exact bisection loop +
+//! relative-convergence break -- so the emitted module takes the same numerical
+//! path the VM does:
+//!
+//! - [`emit_erfc_approx`] -- `crate::alloc::erfc_approx` (Abramowitz-Stegun
+//!   26.2.17), `call`ing the Phase-2 `exp` helper for the `(-z*z).exp()` factor.
+//! - [`emit_normal_cdf`] -- `crate::alloc::normal_cdf`
+//!   (`0.5 * erfc_approx(-x / SQRT_2)`).
+//! - [`emit_alloc_curve`] -- `crate::alloc::alloc_curve` (all six `ptype % 10`
+//!   curve branches + the `ptype >= 10` floor flag).
+//! - [`emit_allocate_available`] -- `crate::alloc::allocate_available` (the
+//!   `total_demand` short-circuits, the per-type search-range computation, the
+//!   100-iteration bisection, and the final per-requester `alloc_curve`).
+//!
+//! ## Runtime loop vs unrolled
+//!
+//! [`emit_allocate_available`] is a **runtime-loop** helper: `n` (the requester
+//! count) is a runtime value, so it iterates over scratch-memory arrays
+//! (`requests`/`profiles`/`out`) with wasm `loop`/`br_if`, never unrolled. The
+//! other three helpers are straight-line numeric kernels. The lowering arm
+//! (`super::lower`) gathers the request + profile values from the compile-time
+//! view stack into the scratch region (an unrolled per-element copy charged
+//! against the unroll budget) before `call`ing this helper.
+//!
+//! ## Why bit-faithful (rather than "close enough")
+//!
+//! The allocation curves and the bisection are sensitive: `alloc_curve` selects
+//! among six analytic survival functions by an integer `ptype % 10`, and the
+//! bisection's `total < avail` comparison decides which half to keep at each of
+//! 100 steps. Reproducing the Rust reference's exact arithmetic (including the
+//! `(-z) * z` / `(-z).exp()` unary-negation order and the `q.is_infinite()`
+//! CES guard) keeps the converged price -- and therefore every per-requester
+//! allocation -- identical to the VM up to the leaf `exp`/`pow` helpers' own
+//! documented tolerance.
+
+use wasm_encoder::{BlockType, Function, Instruction as Ins, ValType};
+
+use super::WasmGenError;
+use super::lower::{
+    EmitCtx, SLOT_SIZE, emit_fill_temp_nan, emit_view_element_load, f64_const, memarg,
+    temp_element_byte_addr,
+};
+use super::math::emit_horner;
+use super::views::ViewDesc;
+
+// ── erfc_approx (alloc.rs:8-21) ──────────────────────────────────────────────
+
+// Abramowitz & Stegun 26.2.17 constants (alloc.rs:12-17). Low-order-first for
+// the shared `emit_horner`, whose `acc = acc*t + c` fold reproduces the Rust
+// expression `(((((a5*t + a4)*t) + a3)*t + a2)*t + a1)` op-for-op.
+const A1: f64 = 0.254829592;
+const A2: f64 = -0.284496736;
+const A3: f64 = 1.421413741;
+const A4: f64 = -1.453152027;
+const A5: f64 = 1.061405429;
+const AS_P: f64 = 0.3275911;
+
+// `erfc_approx` local layout. Param 0 is `z`; `T` is the reduced argument
+// `t = 1/(1 + p*z)`, materialized in a local so `emit_horner` can read it once
+// per polynomial term.
+const ERFC_Z: u32 = 0;
+const ERFC_T: u32 = 1;
+
+/// Emit `erfc_approx(z: f64) -> f64`, porting `crate::alloc::erfc_approx`
+/// (Abramowitz-Stegun 26.2.17) bit-faithfully.
+///
+/// For `z < 0` returns `2.0 - erfc_approx(-z)` (the symmetry the Rust reference
+/// uses); else `t = 1/(1 + p*z)` and the result is the degree-5 polynomial
+/// `(((((a5*t + a4)*t) + a3)*t + a2)*t + a1) * t * (-z*z).exp()`. The polynomial
+/// is evaluated by the shared [`emit_horner`] (identical fold order); `(-z) * z`
+/// reproduces Rust's unary-negation precedence (`-z * z == (-z) * z`); the
+/// `.exp()` is the Phase-2 `exp` helper (`exp_idx`). The `z < 0` symmetry branch
+/// is open-coded as `2 - kernel(-z)` (the kernel is the shared non-negative path),
+/// so no self-`call` -- and therefore no forward index to itself -- is needed.
+pub(crate) fn emit_erfc_approx(exp_idx: u32) -> Function {
+    // One f64 scratch local (ERFC_T) after the `z` param.
+    let mut f = Function::new([(1, ValType::F64)]);
+    emit_erfc_body(&mut f, exp_idx);
+    f.instruction(&Ins::End);
+    f
+}
+
+/// The body of `erfc_approx` (no terminating `End`). The `z < 0` symmetry branch
+/// is open-coded as `2 - erfc_approx_of(-z)` rather than a self-`call`, so the
+/// helper needs no forward index to itself: `erfc_approx_of` shares the
+/// non-negative-argument kernel.
+fn emit_erfc_body(f: &mut Function, exp_idx: u32) {
+    // if z < 0 { 2.0 - kernel(-z) } else { kernel(z) }.
+    f.instruction(&Ins::LocalGet(ERFC_Z));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    // 2.0 - kernel(-z): negate z in place, run the kernel, subtract from 2.
+    f.instruction(&f64_const(2.0));
+    f.instruction(&Ins::LocalGet(ERFC_Z));
+    f.instruction(&Ins::F64Neg);
+    f.instruction(&Ins::LocalSet(ERFC_Z));
+    emit_erfc_kernel(f, exp_idx);
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::Else);
+    emit_erfc_kernel(f, exp_idx);
+    f.instruction(&Ins::End);
+}
+
+/// The non-negative-argument kernel of `erfc_approx`, leaving the f64 result on
+/// the stack: `t = 1/(1 + p*z)`, then `poly(t) * t * (-z*z).exp()`. Reads `z`
+/// from [`ERFC_Z`] (already non-negative at every call site).
+fn emit_erfc_kernel(f: &mut Function, exp_idx: u32) {
+    // t = 1.0 / (1.0 + p * z)
+    f.instruction(&f64_const(1.0));
+    f.instruction(&f64_const(AS_P));
+    f.instruction(&Ins::LocalGet(ERFC_Z));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Add);
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::LocalSet(ERFC_T));
+
+    // poly(t) = (((((a5*t + a4)*t) + a3)*t + a2)*t + a1) -- the shared Horner
+    // fold matches this op order exactly.
+    emit_horner(f, ERFC_T, &[A1, A2, A3, A4, A5]);
+    // * t
+    f.instruction(&Ins::LocalGet(ERFC_T));
+    f.instruction(&Ins::F64Mul);
+    // * (-z * z).exp(): (-z) then * z (Rust unary-neg precedence), then exp().
+    f.instruction(&Ins::LocalGet(ERFC_Z));
+    f.instruction(&Ins::F64Neg);
+    f.instruction(&Ins::LocalGet(ERFC_Z));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::Call(exp_idx));
+    f.instruction(&Ins::F64Mul);
+}
+
+// ── normal_cdf (alloc.rs:25-30) ──────────────────────────────────────────────
+
+const NCDF_X: u32 = 0;
+
+/// Emit `normal_cdf(x: f64) -> f64`, porting `crate::alloc::normal_cdf`:
+/// `if x.is_nan() { NaN } else { 0.5 * erfc_approx(-x / SQRT_2) }`. `erfc_idx`
+/// is [`emit_erfc_approx`]'s assigned function index.
+pub(crate) fn emit_normal_cdf(erfc_idx: u32) -> Function {
+    let mut f = Function::new([]);
+
+    // NaN guard: x != x -> return NaN.
+    f.instruction(&Ins::LocalGet(NCDF_X));
+    f.instruction(&Ins::LocalGet(NCDF_X));
+    f.instruction(&Ins::F64Ne);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(f64::NAN));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // 0.5 * erfc_approx(-x / SQRT_2)
+    f.instruction(&f64_const(0.5));
+    f.instruction(&Ins::LocalGet(NCDF_X));
+    f.instruction(&Ins::F64Neg);
+    f.instruction(&f64_const(std::f64::consts::SQRT_2));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::Call(erfc_idx));
+    f.instruction(&Ins::F64Mul);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+// ── alloc_curve (alloc.rs:40-129) ────────────────────────────────────────────
+
+// `alloc_curve` param layout (mirrors the Rust signature order).
+const CURVE_P: u32 = 0;
+const CURVE_REQUEST: u32 = 1;
+const CURVE_PTYPE: u32 = 2;
+const CURVE_PPRIORITY: u32 = 3;
+const CURVE_PWIDTH: u32 = 4;
+const CURVE_PEXTRA: u32 = 5;
+// Scratch locals (after the six params).
+const CURVE_PT_MOD: u32 = 6; // i32 `ptype % 10`
+const CURVE_FRACTION: u32 = 7; // f64 the survival fraction
+const CURVE_T: u32 = 8; // f64 the rectangular/triangular interpolation `t`
+const CURVE_Z: u32 = 9; // f64 the exponential branch `z`
+const CURVE_Q: u32 = 10; // f64 the CES branch `q`
+
+/// Emit `alloc_curve(p, request, ptype, ppriority, pwidth, pextra) -> f64`,
+/// porting `crate::alloc::alloc_curve` bit-faithfully.
+///
+/// `request <= 0` returns 0 immediately. Otherwise the survival `fraction` is
+/// selected by `ptype % 10` across all six branches (0 fixed, 1 rectangular,
+/// 2 triangular, 3 normal via [`normal_cdf`](emit_normal_cdf), 4 exponential
+/// via the `exp` helper, 5 CES via the `pow` helper, `_` fixed), then
+/// `alloc = request * fraction` is floored when `ptype >= 10`. `ptype` is
+/// carried as an f64 (the VM stores profile fields as f64 and casts `pt as i32`);
+/// `ptype % 10` and the `ptype >= 10` test reproduce that i32 cast via
+/// `i32.trunc_sat_f64_s`. `normal_cdf_idx`/`exp_idx`/`pow_idx` are the helpers'
+/// assigned function indices.
+pub(crate) fn emit_alloc_curve(normal_cdf_idx: u32, exp_idx: u32, pow_idx: u32) -> Function {
+    // Scratch: one i32 (CURVE_PT_MOD) + four f64 (FRACTION/T/Z/Q).
+    let mut f = Function::new([(1, ValType::I32), (4, ValType::F64)]);
+
+    // if request <= 0.0 { return 0.0 }
+    f.instruction(&Ins::LocalGet(CURVE_REQUEST));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // pt_mod = (ptype as i32) % 10  (truncated remainder, sign of the dividend --
+    // wasm `i32.rem_s` matches Rust `%`).
+    f.instruction(&Ins::LocalGet(CURVE_PTYPE));
+    f.instruction(&Ins::I32TruncSatF64S);
+    f.instruction(&Ins::I32Const(10));
+    f.instruction(&Ins::I32RemS);
+    f.instruction(&Ins::LocalSet(CURVE_PT_MOD));
+
+    // fraction = match pt_mod { 0|_ => fixed, 1 => rect, 2 => tri, 3 => normal,
+    //                           4 => exp, 5 => ces }. Emitted as an if/else
+    // chain on pt_mod; each arm leaves the fraction on the stack, stored into
+    // CURVE_FRACTION below.
+    emit_curve_fraction(&mut f, normal_cdf_idx, exp_idx, pow_idx);
+    f.instruction(&Ins::LocalSet(CURVE_FRACTION));
+
+    // alloc = request * fraction, parked in CURVE_T (free here) so the floor
+    // branch can read it inside both `if` arms (a wasm `If(Result(F64))` does
+    // NOT carry the pre-`if` stack value into the block).
+    f.instruction(&Ins::LocalGet(CURVE_REQUEST));
+    f.instruction(&Ins::LocalGet(CURVE_FRACTION));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::LocalSet(CURVE_T));
+
+    // if ptype >= 10 { alloc.floor() } else { alloc }. `ptype >= 10` tests the
+    // original f64 ptype (Rust `ptype >= 10`, an i32 compare; ptype is
+    // integer-valued here).
+    f.instruction(&Ins::LocalGet(CURVE_PTYPE));
+    f.instruction(&Ins::I32TruncSatF64S);
+    f.instruction(&Ins::I32Const(10));
+    f.instruction(&Ins::I32GeS);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&Ins::LocalGet(CURVE_T));
+    f.instruction(&Ins::F64Floor);
+    f.instruction(&Ins::Else);
+    f.instruction(&Ins::LocalGet(CURVE_T));
+    f.instruction(&Ins::End);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+/// Push the survival `fraction` for the `pt_mod` already in [`CURVE_PT_MOD`],
+/// dispatching the six `ptype % 10` branches as a nested if/else chain (each arm
+/// a `Result(F64)` leaving exactly one f64). The `_` default and branch `0` are
+/// the identical "fixed" survival, so the chain falls through to it.
+fn emit_curve_fraction(f: &mut Function, normal_cdf_idx: u32, exp_idx: u32, pow_idx: u32) {
+    // if pt_mod == 1 { rect } else if pt_mod == 2 { tri } else if pt_mod == 3
+    // { normal } else if pt_mod == 4 { exp } else if pt_mod == 5 { ces }
+    // else { fixed }.
+    emit_pt_eq(f, 1);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    emit_curve_rectangular(f);
+    f.instruction(&Ins::Else);
+
+    emit_pt_eq(f, 2);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    emit_curve_triangular(f);
+    f.instruction(&Ins::Else);
+
+    emit_pt_eq(f, 3);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    emit_curve_normal(f, normal_cdf_idx);
+    f.instruction(&Ins::Else);
+
+    emit_pt_eq(f, 4);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    emit_curve_exponential(f, exp_idx);
+    f.instruction(&Ins::Else);
+
+    emit_pt_eq(f, 5);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    emit_curve_ces(f, pow_idx);
+    f.instruction(&Ins::Else);
+
+    // Default (pt_mod == 0 or anything else): the fixed survival.
+    emit_curve_fixed(f);
+
+    f.instruction(&Ins::End); // 5
+    f.instruction(&Ins::End); // 4
+    f.instruction(&Ins::End); // 3
+    f.instruction(&Ins::End); // 2
+    f.instruction(&Ins::End); // 1
+}
+
+/// Push the i32 condition `pt_mod == n`.
+fn emit_pt_eq(f: &mut Function, n: i32) {
+    f.instruction(&Ins::LocalGet(CURVE_PT_MOD));
+    f.instruction(&Ins::I32Const(n));
+    f.instruction(&Ins::I32Eq);
+}
+
+/// Branch 0 / `_`: fixed quantity -- `if p <= ppriority { 1.0 } else { 0.0 }`.
+fn emit_curve_fixed(f: &mut Function) {
+    f.instruction(&f64_const(1.0));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::F64Le); // p <= ppriority
+    f.instruction(&Ins::Select); // 1.0 if p<=ppriority else 0.0
+}
+
+/// Branch 1: rectangular survival. `lo = ppriority - pwidth; hi = ppriority +
+/// pwidth; if p <= lo { 1 } else if p >= hi { 0 } else { (hi - p)/(hi - lo) }`.
+/// `lo`/`hi` are recomputed inline at each use (matching the Rust let-bindings'
+/// values; the FP result is identical) to avoid extra scratch locals.
+fn emit_curve_rectangular(f: &mut Function) {
+    // if p <= lo { 1.0 }
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    emit_lo(f);
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::Else);
+    // else if p >= hi { 0.0 } else { (hi - p) / (hi - lo) }
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    emit_hi(f);
+    f.instruction(&Ins::F64Ge);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::Else);
+    emit_hi_minus_p_over_hi_minus_lo(f);
+    f.instruction(&Ins::End);
+    f.instruction(&Ins::End);
+}
+
+/// Branch 2: triangular survival. `lo`/`hi` as in rectangular; `if p <= lo { 1 }
+/// else if p >= hi { 0 } else if p <= ppriority { t = (hi-p)/(hi-lo); 1 -
+/// 2(1-t)^2 } else { t = (hi-p)/(hi-lo); 2 t^2 }`.
+fn emit_curve_triangular(f: &mut Function) {
+    // if p <= lo { 1.0 }
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    emit_lo(f);
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::Else);
+    // else if p >= hi { 0.0 }
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    emit_hi(f);
+    f.instruction(&Ins::F64Ge);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::Else);
+    // t = (hi - p) / (hi - lo)
+    emit_hi_minus_p_over_hi_minus_lo(f);
+    f.instruction(&Ins::LocalSet(CURVE_T));
+    // else if p <= ppriority { 1 - 2*(1-t)*(1-t) } else { 2*t*t }
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::F64Le); // p <= ppriority
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    // 1.0 - 2.0 * (1.0 - t) * (1.0 - t)
+    f.instruction(&f64_const(1.0));
+    f.instruction(&f64_const(2.0));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalGet(CURVE_T));
+    f.instruction(&Ins::F64Sub); // (1 - t)
+    f.instruction(&Ins::F64Mul); // 2 * (1 - t)
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalGet(CURVE_T));
+    f.instruction(&Ins::F64Sub); // (1 - t)
+    f.instruction(&Ins::F64Mul); // 2 * (1 - t) * (1 - t)
+    f.instruction(&Ins::F64Sub); // 1 - 2*(1-t)*(1-t)
+    f.instruction(&Ins::Else);
+    // 2.0 * t * t
+    f.instruction(&f64_const(2.0));
+    f.instruction(&Ins::LocalGet(CURVE_T));
+    f.instruction(&Ins::F64Mul); // 2 * t
+    f.instruction(&Ins::LocalGet(CURVE_T));
+    f.instruction(&Ins::F64Mul); // 2 * t * t
+    f.instruction(&Ins::End);
+    f.instruction(&Ins::End);
+    f.instruction(&Ins::End);
+}
+
+/// Branch 3: normal survival. `if pwidth <= 0 { if p <= ppriority { 1 } else
+/// { 0 } } else { normal_cdf((ppriority - p) / pwidth) }`.
+fn emit_curve_normal(f: &mut Function, normal_cdf_idx: u32) {
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le); // pwidth <= 0
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    emit_curve_fixed(f);
+    f.instruction(&Ins::Else);
+    // normal_cdf((ppriority - p) / pwidth)
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::Call(normal_cdf_idx));
+    f.instruction(&Ins::End);
+}
+
+/// Branch 4: symmetric exponential survival. `if pwidth <= 0 { fixed } else
+/// { z = (p - ppriority) / pwidth; if z > 0 { 0.5 * (-z).exp() } else { 1 - 0.5
+/// * z.exp() } }`.
+fn emit_curve_exponential(f: &mut Function, exp_idx: u32) {
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le); // pwidth <= 0
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    emit_curve_fixed(f);
+    f.instruction(&Ins::Else);
+    // z = (p - ppriority) / pwidth
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::LocalSet(CURVE_Z));
+    // if z > 0 { 0.5 * (-z).exp() } else { 1.0 - 0.5 * z.exp() }
+    f.instruction(&Ins::LocalGet(CURVE_Z));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Gt);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    // 0.5 * (-z).exp()
+    f.instruction(&f64_const(0.5));
+    f.instruction(&Ins::LocalGet(CURVE_Z));
+    f.instruction(&Ins::F64Neg);
+    f.instruction(&Ins::Call(exp_idx));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::Else);
+    // 1.0 - 0.5 * z.exp()
+    f.instruction(&f64_const(1.0));
+    f.instruction(&f64_const(0.5));
+    f.instruction(&Ins::LocalGet(CURVE_Z));
+    f.instruction(&Ins::Call(exp_idx));
+    f.instruction(&Ins::F64Mul); // 0.5 * z.exp()
+    f.instruction(&Ins::F64Sub); // 1 - 0.5 * z.exp()
+    f.instruction(&Ins::End);
+    f.instruction(&Ins::End);
+}
+
+/// Branch 5: constant elasticity of substitution (CES). `if p <= 0 { 1 } else
+/// if ppriority <= 0 { 0 } else { ratio = ppriority / p; q = ratio.powf(pextra);
+/// if q.is_infinite() { 1 } else { q / (1 + q) } }`.
+fn emit_curve_ces(f: &mut Function, pow_idx: u32) {
+    // if p <= 0 { 1.0 }
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::Else);
+    // else if ppriority <= 0 { 0.0 }
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::Else);
+    // q = (ppriority / p).powf(pextra)
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::F64Div); // ratio
+    f.instruction(&Ins::LocalGet(CURVE_PEXTRA));
+    f.instruction(&Ins::Call(pow_idx));
+    f.instruction(&Ins::LocalSet(CURVE_Q));
+    // if q.is_infinite() { 1.0 } else { q / (1.0 + q) }
+    f.instruction(&Ins::LocalGet(CURVE_Q));
+    f.instruction(&Ins::F64Abs);
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::F64Eq); // |q| == inf  (q.is_infinite())
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::Else);
+    // q / (1.0 + q)
+    f.instruction(&Ins::LocalGet(CURVE_Q));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalGet(CURVE_Q));
+    f.instruction(&Ins::F64Add); // 1 + q
+    f.instruction(&Ins::F64Div); // q / (1 + q)
+    f.instruction(&Ins::End);
+    f.instruction(&Ins::End);
+    f.instruction(&Ins::End);
+}
+
+/// Push `ppriority - pwidth` (the rectangular/triangular `lo`).
+fn emit_lo(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&Ins::F64Sub);
+}
+
+/// Push `ppriority + pwidth` (the rectangular/triangular `hi`).
+fn emit_hi(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&Ins::F64Add);
+}
+
+/// Push `(hi - p) / (hi - lo)` where `lo = ppriority - pwidth`, `hi = ppriority
+/// + pwidth`. `hi - lo == 2*pwidth`, but the Rust reference computes `(hi - lo)`
+/// from the let-bound `hi`/`lo`, so reproduce that exact subtraction.
+fn emit_hi_minus_p_over_hi_minus_lo(f: &mut Function) {
+    // hi - p
+    emit_hi(f);
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::F64Sub);
+    // hi - lo
+    emit_hi(f);
+    emit_lo(f);
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::F64Div);
+}
+
+// ── allocate_available (alloc.rs:136-199) ────────────────────────────────────
+
+// `allocate_available(requests_ptr: i32, n: i32, profiles_ptr: i32, avail: f64,
+// out_ptr: i32) -> ()` local layout. `requests_ptr`/`profiles_ptr`/`out_ptr` are
+// byte addresses into the scratch region; `profiles` is 4 f64/requester laid out
+// `(ptype, ppriority, pwidth, pextra)`.
+const ALLOC_REQ_PTR: u32 = 0;
+const ALLOC_N: u32 = 1;
+const ALLOC_PROF_PTR: u32 = 2;
+const ALLOC_AVAIL: u32 = 3;
+const ALLOC_OUT_PTR: u32 = 4;
+// Scratch locals (after the five params).
+const ALLOC_I: u32 = 5; // i32 loop index
+const ALLOC_TOTAL_DEMAND: u32 = 6; // f64 Σ requests where r > 0
+const ALLOC_R: u32 = 7; // f64 a request value
+const ALLOC_P_MIN: u32 = 8; // f64 search-range lower bound
+const ALLOC_P_MAX: u32 = 9; // f64 search-range upper bound
+const ALLOC_SPREAD: u32 = 10; // f64 per-profile spread
+const ALLOC_PPRIORITY: u32 = 11; // f64 a profile's ppriority
+const ALLOC_PWIDTH: u32 = 12; // f64 a profile's pwidth
+const ALLOC_PT_MOD: u32 = 13; // i32 a profile's ptype % 10
+const ALLOC_LO: u32 = 14; // f64 bisection low
+const ALLOC_HI: u32 = 15; // f64 bisection high
+const ALLOC_MID: u32 = 16; // f64 bisection midpoint
+const ALLOC_TOTAL: u32 = 17; // f64 Σ alloc_curve(mid, ...)
+const ALLOC_ITER: u32 = 18; // i32 bisection iteration counter
+const ALLOC_PSTAR: u32 = 19; // f64 the converged price
+
+// Bytes per profile tuple (4 f64) and per request/out slot (1 f64).
+const PROFILE_BYTES: i32 = 32;
+const SLOT_BYTES: i32 = 8;
+
+/// Emit `allocate_available(requests_ptr, n, profiles_ptr, avail, out_ptr)`,
+/// porting `crate::alloc::allocate_available` bit-faithfully over scratch-memory
+/// arrays.
+///
+/// The three short-circuits (`n == 0` -> nothing written; `avail >=
+/// total_demand` -> each requester gets `r.max(0)`; `avail <= 0` -> zeros)
+/// mirror the Rust early returns. Otherwise the per-type search range
+/// `[p_min, p_max]` is computed from the profiles' `spread`, then a 100-iteration
+/// bisection finds the market-clearing price (the `total < avail` -> `hi = mid`
+/// step and the `|hi - lo| < 1e-14 * (1 + |hi|)` relative-convergence break),
+/// and `out[i] = alloc_curve(p_star, requests[i], ...)` is written for every
+/// requester. A runtime loop (never unrolled): `n` is a runtime value.
+/// `alloc_curve_idx` is [`emit_alloc_curve`]'s assigned function index.
+pub(crate) fn emit_allocate_available(alloc_curve_idx: u32) -> Function {
+    // Scratch: i32 (I), f64 (TOTAL_DEMAND, R, P_MIN, P_MAX, SPREAD, PPRIORITY,
+    // PWIDTH), i32 (PT_MOD), f64 (LO, HI, MID, TOTAL), i32 (ITER), f64 (PSTAR).
+    // Declaration order fixes the indices ALLOC_I..ALLOC_PSTAR.
+    let mut f = Function::new([
+        (1, ValType::I32),
+        (7, ValType::F64),
+        (1, ValType::I32),
+        (4, ValType::F64),
+        (1, ValType::I32),
+        (1, ValType::F64),
+    ]);
+
+    // if n == 0 { return }  (the Rust `if n == 0 { return vec![] }`).
+    f.instruction(&Ins::LocalGet(ALLOC_N));
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // total_demand = Σ requests[i] where requests[i] > 0.0.
+    emit_total_demand(&mut f);
+
+    // if avail >= total_demand { out[i] = requests[i].max(0.0); return }
+    f.instruction(&Ins::LocalGet(ALLOC_AVAIL));
+    f.instruction(&Ins::LocalGet(ALLOC_TOTAL_DEMAND));
+    f.instruction(&Ins::F64Ge);
+    f.instruction(&Ins::If(BlockType::Empty));
+    emit_full_grant(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // if avail <= 0.0 { out[i] = 0.0; return }
+    f.instruction(&Ins::LocalGet(ALLOC_AVAIL));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Empty));
+    emit_zero_out(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // Compute the search range [p_min, p_max] from the profiles.
+    emit_search_range(&mut f);
+
+    // 100-iteration bisection for the market-clearing price.
+    emit_bisection(&mut f, alloc_curve_idx);
+
+    // p_star = (lo + hi) / 2.0; out[i] = alloc_curve(p_star, requests[i], ...).
+    f.instruction(&Ins::LocalGet(ALLOC_LO));
+    f.instruction(&Ins::LocalGet(ALLOC_HI));
+    f.instruction(&Ins::F64Add);
+    f.instruction(&f64_const(2.0));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::LocalSet(ALLOC_PSTAR));
+    emit_final_allocations(&mut f, alloc_curve_idx);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+/// `total_demand = Σ requests[i] where requests[i] > 0.0` into
+/// [`ALLOC_TOTAL_DEMAND`]. A runtime `for i in 0..n` loop.
+fn emit_total_demand(f: &mut Function) {
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::LocalSet(ALLOC_TOTAL_DEMAND));
+    emit_for_n(f, |f| {
+        // r = requests[i]
+        emit_load_request(f);
+        f.instruction(&Ins::LocalSet(ALLOC_R));
+        // if r > 0.0 { total_demand += r }
+        f.instruction(&Ins::LocalGet(ALLOC_R));
+        f.instruction(&f64_const(0.0));
+        f.instruction(&Ins::F64Gt);
+        f.instruction(&Ins::If(BlockType::Empty));
+        f.instruction(&Ins::LocalGet(ALLOC_TOTAL_DEMAND));
+        f.instruction(&Ins::LocalGet(ALLOC_R));
+        f.instruction(&Ins::F64Add);
+        f.instruction(&Ins::LocalSet(ALLOC_TOTAL_DEMAND));
+        f.instruction(&Ins::End);
+    });
+}
+
+/// The `avail >= total_demand` arm: `out[i] = requests[i].max(0.0)` for every
+/// requester. `f64::max` is NaN-ignoring; reproduce it with the compare-select
+/// form (`r > 0 ? r : 0` is `r.max(0.0)` for a non-NaN `r`, and a NaN request
+/// would be ignored by `f64::max` -- but the Rust path stores `r.max(0.0)` which
+/// is `0.0` for a NaN `r`, matched here since `NaN > 0.0` is false).
+fn emit_full_grant(f: &mut Function) {
+    emit_for_n(f, |f| {
+        // out[i] = max(requests[i], 0.0)
+        emit_out_addr(f);
+        // value = r > 0.0 ? r : 0.0  (== f64::max(r, 0.0) for non-NaN; for NaN r
+        // this yields 0.0, matching Rust `NaN.max(0.0) == 0.0`).
+        emit_load_request(f);
+        f.instruction(&Ins::LocalSet(ALLOC_R));
+        f.instruction(&Ins::LocalGet(ALLOC_R));
+        f.instruction(&f64_const(0.0));
+        f.instruction(&Ins::LocalGet(ALLOC_R));
+        f.instruction(&f64_const(0.0));
+        f.instruction(&Ins::F64Gt); // r > 0.0
+        f.instruction(&Ins::Select); // r if r>0 else 0.0
+        f.instruction(&Ins::F64Store(f64_memarg()));
+    });
+}
+
+/// The `avail <= 0.0` arm: `out[i] = 0.0` for every requester.
+fn emit_zero_out(f: &mut Function) {
+    emit_for_n(f, |f| {
+        emit_out_addr(f);
+        f.instruction(&f64_const(0.0));
+        f.instruction(&Ins::F64Store(f64_memarg()));
+    });
+}
+
+/// Compute `[p_min, p_max]` from the profiles (alloc.rs:154-169): `p_min =
+/// INFINITY`, `p_max = NEG_INFINITY`; for each profile `spread = match ptype % 10
+/// { 0 => 1, 1|2 => pwidth, 3 => pwidth*6, 4 => pwidth*10, 5 => ppriority*10,
+/// _ => 1 }`, then `p_min = min(p_min, ppriority - spread)`, `p_max =
+/// max(p_max, ppriority + spread)`. `f64::min`/`f64::max` are NaN-ignoring;
+/// realistic profiles never carry NaN, and the reference uses them, so the
+/// NaN-ignoring compare-select form is reproduced for fidelity.
+fn emit_search_range(f: &mut Function) {
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::LocalSet(ALLOC_P_MIN));
+    f.instruction(&f64_const(f64::NEG_INFINITY));
+    f.instruction(&Ins::LocalSet(ALLOC_P_MAX));
+
+    emit_for_n(f, |f| {
+        // ppriority = profiles[i].1; pwidth = profiles[i].2; pt_mod =
+        // (profiles[i].0 as i32) % 10.
+        emit_load_profile_field(f, 1);
+        f.instruction(&Ins::LocalSet(ALLOC_PPRIORITY));
+        emit_load_profile_field(f, 2);
+        f.instruction(&Ins::LocalSet(ALLOC_PWIDTH));
+        emit_load_profile_field(f, 0);
+        f.instruction(&Ins::I32TruncSatF64S);
+        f.instruction(&Ins::I32Const(10));
+        f.instruction(&Ins::I32RemS);
+        f.instruction(&Ins::LocalSet(ALLOC_PT_MOD));
+
+        // spread = match pt_mod { 1|2 => pwidth, 3 => pwidth*6, 4 => pwidth*10,
+        //                         5 => ppriority*10, 0|_ => 1.0 }.
+        emit_spread(f);
+        f.instruction(&Ins::LocalSet(ALLOC_SPREAD));
+
+        // p_min = f64::min(p_min, ppriority - spread)
+        f.instruction(&Ins::LocalGet(ALLOC_P_MIN));
+        f.instruction(&Ins::LocalGet(ALLOC_PPRIORITY));
+        f.instruction(&Ins::LocalGet(ALLOC_SPREAD));
+        f.instruction(&Ins::F64Sub);
+        emit_f64_min(f);
+        f.instruction(&Ins::LocalSet(ALLOC_P_MIN));
+
+        // p_max = f64::max(p_max, ppriority + spread)
+        f.instruction(&Ins::LocalGet(ALLOC_P_MAX));
+        f.instruction(&Ins::LocalGet(ALLOC_PPRIORITY));
+        f.instruction(&Ins::LocalGet(ALLOC_SPREAD));
+        f.instruction(&Ins::F64Add);
+        emit_f64_max(f);
+        f.instruction(&Ins::LocalSet(ALLOC_P_MAX));
+    });
+}
+
+/// Push the per-profile `spread` for the `pt_mod` in [`ALLOC_PT_MOD`] (uses
+/// [`ALLOC_PWIDTH`]/[`ALLOC_PPRIORITY`]): 1 (0/_), pwidth (1/2), pwidth*6 (3),
+/// pwidth*10 (4), ppriority*10 (5). Emitted as a nested if/else chain.
+fn emit_spread(f: &mut Function) {
+    // pt_mod == 1 || pt_mod == 2 -> pwidth
+    f.instruction(&Ins::LocalGet(ALLOC_PT_MOD));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Eq);
+    f.instruction(&Ins::LocalGet(ALLOC_PT_MOD));
+    f.instruction(&Ins::I32Const(2));
+    f.instruction(&Ins::I32Eq);
+    f.instruction(&Ins::I32Or);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&Ins::LocalGet(ALLOC_PWIDTH));
+    f.instruction(&Ins::Else);
+
+    // pt_mod == 3 -> pwidth * 6.0
+    f.instruction(&Ins::LocalGet(ALLOC_PT_MOD));
+    f.instruction(&Ins::I32Const(3));
+    f.instruction(&Ins::I32Eq);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&Ins::LocalGet(ALLOC_PWIDTH));
+    f.instruction(&f64_const(6.0));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::Else);
+
+    // pt_mod == 4 -> pwidth * 10.0
+    f.instruction(&Ins::LocalGet(ALLOC_PT_MOD));
+    f.instruction(&Ins::I32Const(4));
+    f.instruction(&Ins::I32Eq);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&Ins::LocalGet(ALLOC_PWIDTH));
+    f.instruction(&f64_const(10.0));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::Else);
+
+    // pt_mod == 5 -> ppriority * 10.0
+    f.instruction(&Ins::LocalGet(ALLOC_PT_MOD));
+    f.instruction(&Ins::I32Const(5));
+    f.instruction(&Ins::I32Eq);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&Ins::LocalGet(ALLOC_PPRIORITY));
+    f.instruction(&f64_const(10.0));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::Else);
+
+    // default (pt_mod == 0 or anything else) -> 1.0
+    f.instruction(&f64_const(1.0));
+
+    f.instruction(&Ins::End); // 5
+    f.instruction(&Ins::End); // 4
+    f.instruction(&Ins::End); // 3
+    f.instruction(&Ins::End); // 1|2
+}
+
+/// The 100-iteration bisection (alloc.rs:171-190): `lo = p_min; hi = p_max; for
+/// _ in 0..100 { mid = (lo+hi)/2; total = Σ alloc_curve(mid, ...); if total <
+/// avail { hi = mid } else { lo = mid }; if |hi-lo| < 1e-14*(1+|hi|) { break } }`.
+fn emit_bisection(f: &mut Function, alloc_curve_idx: u32) {
+    // lo = p_min; hi = p_max; iter = 0
+    f.instruction(&Ins::LocalGet(ALLOC_P_MIN));
+    f.instruction(&Ins::LocalSet(ALLOC_LO));
+    f.instruction(&Ins::LocalGet(ALLOC_P_MAX));
+    f.instruction(&Ins::LocalSet(ALLOC_HI));
+    f.instruction(&Ins::I32Const(0));
+    f.instruction(&Ins::LocalSet(ALLOC_ITER));
+
+    f.instruction(&Ins::Block(BlockType::Empty)); // $bisect_exit
+    f.instruction(&Ins::Loop(BlockType::Empty)); // $bisect
+
+    // while-head: if !(iter < 100) break $bisect_exit  (br depth 1).
+    f.instruction(&Ins::LocalGet(ALLOC_ITER));
+    f.instruction(&Ins::I32Const(100));
+    f.instruction(&Ins::I32LtS);
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::BrIf(1));
+
+    // mid = (lo + hi) / 2.0
+    f.instruction(&Ins::LocalGet(ALLOC_LO));
+    f.instruction(&Ins::LocalGet(ALLOC_HI));
+    f.instruction(&Ins::F64Add);
+    f.instruction(&f64_const(2.0));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::LocalSet(ALLOC_MID));
+
+    // total = Σ_{i<n} alloc_curve(mid, requests[i], profiles[i]...)
+    emit_total_at_price(f, ALLOC_MID, alloc_curve_idx);
+    f.instruction(&Ins::LocalSet(ALLOC_TOTAL));
+
+    // if total < avail { hi = mid } else { lo = mid }
+    f.instruction(&Ins::LocalGet(ALLOC_TOTAL));
+    f.instruction(&Ins::LocalGet(ALLOC_AVAIL));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&Ins::LocalGet(ALLOC_MID));
+    f.instruction(&Ins::LocalSet(ALLOC_HI));
+    f.instruction(&Ins::Else);
+    f.instruction(&Ins::LocalGet(ALLOC_MID));
+    f.instruction(&Ins::LocalSet(ALLOC_LO));
+    f.instruction(&Ins::End);
+
+    // if |hi - lo| < 1e-14 * (1.0 + |hi|) { break $bisect_exit }
+    f.instruction(&Ins::LocalGet(ALLOC_HI));
+    f.instruction(&Ins::LocalGet(ALLOC_LO));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::F64Abs);
+    f.instruction(&f64_const(1e-14));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalGet(ALLOC_HI));
+    f.instruction(&Ins::F64Abs);
+    f.instruction(&Ins::F64Add); // 1 + |hi|
+    f.instruction(&Ins::F64Mul); // 1e-14 * (1 + |hi|)
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::BrIf(1)); // break $bisect_exit
+
+    // iter += 1; continue $bisect
+    f.instruction(&Ins::LocalGet(ALLOC_ITER));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::LocalSet(ALLOC_ITER));
+    f.instruction(&Ins::Br(0));
+
+    f.instruction(&Ins::End); // end $bisect loop
+    f.instruction(&Ins::End); // end $bisect_exit block
+}
+
+/// Push `Σ_{i<n} alloc_curve(price, requests[i], profiles[i]...)` for the price
+/// in `price_local`. A runtime `for i in 0..n` accumulating into a scratch
+/// f64 left on the stack at the end.
+fn emit_total_at_price(f: &mut Function, price_local: u32, alloc_curve_idx: u32) {
+    // ALLOC_SPREAD is the running sum here. It was only live inside
+    // `emit_search_range` (which has finished by the time the bisection runs),
+    // and `alloc_curve` is a separate function that cannot touch this helper's
+    // locals, so reusing it as the fold accumulator is safe.
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::LocalSet(ALLOC_SPREAD));
+    emit_for_n(f, |f| {
+        // sum += alloc_curve(price, requests[i], ptype, ppriority, pwidth, pextra)
+        f.instruction(&Ins::LocalGet(ALLOC_SPREAD));
+        emit_alloc_curve_call(f, price_local, alloc_curve_idx);
+        f.instruction(&Ins::F64Add);
+        f.instruction(&Ins::LocalSet(ALLOC_SPREAD));
+    });
+    f.instruction(&Ins::LocalGet(ALLOC_SPREAD));
+}
+
+/// `out[i] = alloc_curve(p_star, requests[i], profiles[i]...)` for every
+/// requester (alloc.rs:193-198).
+fn emit_final_allocations(f: &mut Function, alloc_curve_idx: u32) {
+    emit_for_n(f, |f| {
+        emit_out_addr(f);
+        emit_alloc_curve_call(f, ALLOC_PSTAR, alloc_curve_idx);
+        f.instruction(&Ins::F64Store(f64_memarg()));
+    });
+}
+
+/// Push `alloc_curve(price, requests[i], profiles[i].0, .1, .2, .3)` -- the six
+/// arguments in order, then the `call`. Reads `requests[i]`/`profiles[i]` for the
+/// current loop index `i` ([`ALLOC_I`]); `price` is the f64 in `price_local`.
+fn emit_alloc_curve_call(f: &mut Function, price_local: u32, alloc_curve_idx: u32) {
+    f.instruction(&Ins::LocalGet(price_local)); // p
+    emit_load_request(f); // request = requests[i]
+    emit_load_profile_field(f, 0); // ptype
+    emit_load_profile_field(f, 1); // ppriority
+    emit_load_profile_field(f, 2); // pwidth
+    emit_load_profile_field(f, 3); // pextra
+    f.instruction(&Ins::Call(alloc_curve_idx));
+}
+
+/// Emit a runtime `for i in 0..n` loop (`ALLOC_I` is the index), invoking `body`
+/// once per iteration. `body` must be operand-stack balanced.
+fn emit_for_n(f: &mut Function, body: impl FnOnce(&mut Function)) {
+    // i = 0
+    f.instruction(&Ins::I32Const(0));
+    f.instruction(&Ins::LocalSet(ALLOC_I));
+
+    f.instruction(&Ins::Block(BlockType::Empty)); // $exit
+    f.instruction(&Ins::Loop(BlockType::Empty)); // $loop
+
+    // if !(i < n) break $exit  (br depth 1)
+    f.instruction(&Ins::LocalGet(ALLOC_I));
+    f.instruction(&Ins::LocalGet(ALLOC_N));
+    f.instruction(&Ins::I32LtS);
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::BrIf(1));
+
+    body(f);
+
+    // i += 1; continue $loop
+    f.instruction(&Ins::LocalGet(ALLOC_I));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::LocalSet(ALLOC_I));
+    f.instruction(&Ins::Br(0));
+
+    f.instruction(&Ins::End); // end $loop
+    f.instruction(&Ins::End); // end $exit
+}
+
+/// Push `requests[i]` (the f64 at `requests_ptr + i*8`).
+fn emit_load_request(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(ALLOC_REQ_PTR));
+    f.instruction(&Ins::LocalGet(ALLOC_I));
+    f.instruction(&Ins::I32Const(SLOT_BYTES));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::F64Load(f64_memarg()));
+}
+
+/// Push the store *address* for `out[i]` (`out_ptr + i*8`), to be followed by the
+/// value then an `f64.store` (`f64.store` consumes `[addr_i32, value_f64]`).
+fn emit_out_addr(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(ALLOC_OUT_PTR));
+    f.instruction(&Ins::LocalGet(ALLOC_I));
+    f.instruction(&Ins::I32Const(SLOT_BYTES));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::I32Add);
+}
+
+/// Push `profiles[i].field` (the f64 at `profiles_ptr + i*32 + field*8`), with
+/// `field in {0,1,2,3}` for `(ptype, ppriority, pwidth, pextra)`.
+fn emit_load_profile_field(f: &mut Function, field: i32) {
+    f.instruction(&Ins::LocalGet(ALLOC_PROF_PTR));
+    f.instruction(&Ins::LocalGet(ALLOC_I));
+    f.instruction(&Ins::I32Const(PROFILE_BYTES));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::I32Add);
+    // field offset rides in the memarg.offset (a compile-time constant).
+    f.instruction(&Ins::F64Load(f64_memarg_off((field * SLOT_BYTES) as u64)));
+}
+
+/// Push `f64::min(a, b)` for `[a, b]` on the stack, reproducing Rust's
+/// NaN-ignoring `f64::min` (the search-range profiles never carry NaN in a real
+/// model, but the reference uses `f64::min`, so the NaN-ignoring form is kept).
+/// Built as nested `select`s; mirrors `super::vector`'s `emit_f64_minmax_rust`,
+/// but uses this helper's own scratch locals.
+fn emit_f64_min(f: &mut Function) {
+    emit_f64_minmax(f, true);
+}
+
+/// Push `f64::max(a, b)` for `[a, b]` on the stack (NaN-ignoring).
+fn emit_f64_max(f: &mut Function) {
+    emit_f64_minmax(f, false);
+}
+
+/// Shared body of [`emit_f64_min`]/[`emit_f64_max`]: consume `[a, b]` and push
+/// `f64::min(a,b)` (`want_min`) or `f64::max(a,b)`, ignoring a NaN operand (if
+/// both NaN, NaN). Parks `a`/`b` in scratch locals reused from the bisection
+/// (`ALLOC_LO`/`ALLOC_HI`/`ALLOC_MID` are not yet live when the search range is
+/// computed, so they are free here). Three nested `select`s in the wasm
+/// "deeper operand wins when cond != 0" form, matching `crate::vm`'s `f64::min`/
+/// `max` reductions.
+fn emit_f64_minmax(f: &mut Function, want_min: bool) {
+    let a = ALLOC_LO;
+    let b = ALLOC_HI;
+    let r = ALLOC_MID;
+    // [a, b] on the stack (b on top); park them.
+    f.instruction(&Ins::LocalSet(b));
+    f.instruction(&Ins::LocalSet(a));
+
+    // core = (a {<,>} b) ? a : b -> r
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::LocalGet(b));
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::LocalGet(b));
+    if want_min {
+        f.instruction(&Ins::F64Lt);
+    } else {
+        f.instruction(&Ins::F64Gt);
+    }
+    f.instruction(&Ins::Select);
+    f.instruction(&Ins::LocalSet(r));
+
+    // r = (b is NaN) ? a : r
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::LocalGet(r));
+    f.instruction(&Ins::LocalGet(b));
+    f.instruction(&Ins::LocalGet(b));
+    f.instruction(&Ins::F64Ne); // b != b
+    f.instruction(&Ins::Select);
+    f.instruction(&Ins::LocalSet(r));
+
+    // result = (a is NaN) ? b : r
+    f.instruction(&Ins::LocalGet(b));
+    f.instruction(&Ins::LocalGet(r));
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::F64Ne); // a != a
+    f.instruction(&Ins::Select);
+}
+
+/// An 8-byte (f64) memory access at offset 0, naturally aligned (the scratch
+/// region is 8-byte aligned).
+fn f64_memarg() -> wasm_encoder::MemArg {
+    f64_memarg_off(0)
+}
+
+/// An 8-byte (f64) memory access at a static byte `offset`.
+fn f64_memarg_off(offset: u64) -> wasm_encoder::MemArg {
+    wasm_encoder::MemArg {
+        offset,
+        align: 3, // log2(8): an 8-byte f64 access
+        memory_index: 0,
+    }
+}
+
+// ── opcode lowering arms (vm.rs:2631-2794) ───────────────────────────────────
+
+/// Lower `AllocateAvailable { write_temp_id }`, mirroring `vm.rs:2631-2721`. The
+/// views are `profile_view = top`, `requests_view = top-1`; `avail` is the f64
+/// on top of the wasm operand stack (the VM pops it). Gathers the `n =
+/// requests_view.size()` request values + the per-requester profile tuples into
+/// the allocation scratch region, `call`s the [`emit_allocate_available`] helper,
+/// then copies the `n` results into temp `write_temp_id`. An invalid input view
+/// fills the whole destination temp region with NaN.
+///
+/// `pp_cols` reproduces the VM's `if !pp_values.is_empty() && n>0 &&
+/// pp_size%n==0 { pp_size/n } else { 4 }`, and each profile field
+/// `(ptype, ppriority, pwidth, pextra)` is read from `pp_values[i*pp_cols + j]`
+/// with the VM's defaults `(0.0, 0.0, 1.0, 0.0)` when the index is out of range
+/// -- all resolved at compile time (the view sizes and indices are static).
+pub(crate) fn emit_allocate_available_op(
+    requests_view: &ViewDesc,
+    profile_view: &ViewDesc,
+    write_temp_id: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    // A dynamically-subscripted view is fine: the per-element gather routes
+    // through `emit_view_element_load`, which folds the view's runtime offset
+    // addend and per-element validity guard, and the op-level gate below takes
+    // the VM's whole-op `!is_valid -> fill_temp_nan` short-circuit.
+
+    // Pop `avail` (top) into the scratch f64 before the gate, so both gate arms
+    // are operand-balanced.
+    let avail = ctx.scratch_local;
+    f.instruction(&Ins::LocalSet(avail));
+
+    let n = requests_view.size();
+    let pp_size = profile_view.size();
+    // pp_cols: pp_size/n when the flattened profile array divides evenly into n
+    // requesters, else 4 (vm.rs:2680-2685).
+    let pp_cols = if pp_size > 0 && n > 0 && pp_size.is_multiple_of(n) {
+        pp_size / n
+    } else {
+        4
+    };
+
+    emit_with_validity_gate(
+        &[requests_view, profile_view],
+        write_temp_id,
+        ctx,
+        f,
+        |ctx, f| {
+            // Gather requests[i] -> scratch req region.
+            let (req_base, prof_base, out_base) = alloc_scratch_layout(ctx, n);
+            for i in 0..n {
+                f.instruction(&Ins::I32Const(0));
+                emit_view_element_load(requests_view, i, ctx, f)?;
+                f.instruction(&Ins::F64Store(memarg(
+                    req_base + (i as u64) * u64::from(SLOT_SIZE),
+                )));
+            }
+
+            // Build per-requester profile tuples (ptype, ppriority, pwidth, pextra)
+            // from pp_values[i*pp_cols + j], defaulting (0,0,1,0) out of range.
+            const DEFAULTS: [f64; 4] = [0.0, 0.0, 1.0, 0.0];
+            for i in 0..n {
+                for (j, &default) in DEFAULTS.iter().enumerate() {
+                    let prof_addr =
+                        prof_base + (i as u64) * (PROFILE_BYTES as u64) + (j as u64) * 8;
+                    f.instruction(&Ins::I32Const(0));
+                    let flat = i * pp_cols + j;
+                    if flat < pp_size {
+                        emit_view_element_load(profile_view, flat, ctx, f)?;
+                    } else {
+                        f.instruction(&f64_const(default));
+                    }
+                    f.instruction(&Ins::F64Store(memarg(prof_addr)));
+                }
+            }
+
+            // allocate_available(req_base, n, prof_base, avail, out_base)
+            f.instruction(&Ins::I32Const(req_base as i32));
+            f.instruction(&Ins::I32Const(n as i32));
+            f.instruction(&Ins::I32Const(prof_base as i32));
+            f.instruction(&Ins::LocalGet(avail));
+            f.instruction(&Ins::I32Const(out_base as i32));
+            f.instruction(&Ins::Call(ctx.helpers.allocate_available));
+
+            // Copy out[i] -> temp[write_temp_id][i].
+            emit_copy_out_to_temp(out_base, n, write_temp_id, ctx, f)
+        },
+    )
+}
+
+/// Lower `AllocateByPriority { write_temp_id }`, mirroring `vm.rs:2723-2794`. The
+/// views are `priority_view = top`, `requests_view = top-1`; the operand stack
+/// holds `supply` on top and `width` beneath (the VM pops `supply` then
+/// `width`). Gathers requests, synthesizes rectangular profiles `(1.0,
+/// priorities[i] or 0.0, width, 0.0)`, `call`s [`emit_allocate_available`] with
+/// `supply` as the available amount, then copies results into the temp.
+pub(crate) fn emit_allocate_by_priority_op(
+    requests_view: &ViewDesc,
+    priority_view: &ViewDesc,
+    write_temp_id: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    // A dynamically-subscripted view is handled by `emit_view_element_load`
+    // (runtime offset + per-element validity) and the op-level gate below; see
+    // `emit_allocate_available_op`.
+
+    // Pop `supply` (top) then `width` into scratch f64s, before the gate.
+    let supply = ctx.scratch_local;
+    let width = ctx.vector_f64_locals[0];
+    f.instruction(&Ins::LocalSet(supply));
+    f.instruction(&Ins::LocalSet(width));
+
+    let n = requests_view.size();
+    let pri_size = priority_view.size();
+
+    emit_with_validity_gate(
+        &[requests_view, priority_view],
+        write_temp_id,
+        ctx,
+        f,
+        |ctx, f| {
+            let (req_base, prof_base, out_base) = alloc_scratch_layout(ctx, n);
+            // Gather requests[i].
+            for i in 0..n {
+                f.instruction(&Ins::I32Const(0));
+                emit_view_element_load(requests_view, i, ctx, f)?;
+                f.instruction(&Ins::F64Store(memarg(
+                    req_base + (i as u64) * u64::from(SLOT_SIZE),
+                )));
+            }
+
+            // Rectangular profiles: (ptype=1, ppriority=priorities[i] or 0, pwidth=
+            // width, pextra=0). Fields 0/3 are the constants 1.0/0.0; field 1 is the
+            // priority view element (default 0.0 out of range); field 2 is the
+            // runtime `width` local.
+            for i in 0..n {
+                let base = prof_base + (i as u64) * (PROFILE_BYTES as u64);
+                // ptype = 1.0
+                f.instruction(&Ins::I32Const(0));
+                f.instruction(&f64_const(1.0));
+                f.instruction(&Ins::F64Store(memarg(base)));
+                // ppriority = priorities[i] or 0.0
+                f.instruction(&Ins::I32Const(0));
+                if i < pri_size {
+                    emit_view_element_load(priority_view, i, ctx, f)?;
+                } else {
+                    f.instruction(&f64_const(0.0));
+                }
+                f.instruction(&Ins::F64Store(memarg(base + 8)));
+                // pwidth = width (runtime)
+                f.instruction(&Ins::I32Const(0));
+                f.instruction(&Ins::LocalGet(width));
+                f.instruction(&Ins::F64Store(memarg(base + 16)));
+                // pextra = 0.0
+                f.instruction(&Ins::I32Const(0));
+                f.instruction(&f64_const(0.0));
+                f.instruction(&Ins::F64Store(memarg(base + 24)));
+            }
+
+            // allocate_available(req_base, n, prof_base, supply, out_base)
+            f.instruction(&Ins::I32Const(req_base as i32));
+            f.instruction(&Ins::I32Const(n as i32));
+            f.instruction(&Ins::I32Const(prof_base as i32));
+            f.instruction(&Ins::LocalGet(supply));
+            f.instruction(&Ins::I32Const(out_base as i32));
+            f.instruction(&Ins::Call(ctx.helpers.allocate_available));
+
+            emit_copy_out_to_temp(out_base, n, write_temp_id, ctx, f)
+        },
+    )
+}
+
+/// The three consecutive scratch sub-region byte bases for an allocation of `n`
+/// requesters: `requests` (n f64) at `alloc_scratch_base`, `profiles` (4n f64)
+/// after it, `out` (n f64) after that. All three are live across the
+/// `allocate_available` call; `module.rs` sizes the region for the largest `n`.
+fn alloc_scratch_layout(ctx: &EmitCtx, n: usize) -> (u64, u64, u64) {
+    let base = u64::from(ctx.alloc_scratch_base);
+    let req_base = base;
+    let prof_base = req_base + (n as u64) * u64::from(SLOT_SIZE);
+    let out_base = prof_base + (n as u64) * (PROFILE_BYTES as u64);
+    (req_base, prof_base, out_base)
+}
+
+/// Copy the `n` allocations the helper wrote at `out_base` into temp
+/// `write_temp_id` (`temp[temp_off + i] = out[i]`). Unrolled over `n`.
+fn emit_copy_out_to_temp(
+    out_base: u64,
+    n: usize,
+    write_temp_id: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    for i in 0..n {
+        let temp_addr = temp_element_byte_addr(ctx, write_temp_id, i as u32)?;
+        f.instruction(&Ins::I32Const(0));
+        f.instruction(&Ins::I32Const(0));
+        f.instruction(&Ins::F64Load(memarg(
+            out_base + (i as u64) * u64::from(SLOT_SIZE),
+        )));
+        f.instruction(&Ins::F64Store(memarg(temp_addr)));
+    }
+    Ok(())
+}
+
+/// Emit `body` gated on the VM's "`!is_valid` -> fill_temp_nan" short-circuit
+/// for the allocation arms. When no input view carries a runtime validity flag
+/// (the common static/temp/full-var case), `body` is emitted directly with no
+/// runtime check; otherwise `if all_valid { body } else { fill_temp_nan }`.
+/// Mirrors `super::vector::emit_with_validity_gate`.
+fn emit_with_validity_gate(
+    views: &[&ViewDesc],
+    write_temp_id: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+    body: impl FnOnce(&EmitCtx, &mut Function) -> Result<(), WasmGenError>,
+) -> Result<(), WasmGenError> {
+    let valids: Vec<u32> = views.iter().filter_map(|v| v.valid_local).collect();
+    if valids.is_empty() {
+        return body(ctx, f);
+    }
+    f.instruction(&Ins::LocalGet(valids[0]));
+    for &v in &valids[1..] {
+        f.instruction(&Ins::LocalGet(v));
+        f.instruction(&Ins::I32And);
+    }
+    f.instruction(&Ins::If(BlockType::Empty));
+    body(ctx, f)?;
+    f.instruction(&Ins::Else);
+    emit_fill_temp_nan(ctx, write_temp_id, f)?;
+    f.instruction(&Ins::End);
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::super::lower::build_helpers;
+    use super::f64_memarg_off;
+    use checked::Store;
+    use wasm::validate;
+    use wasm_encoder::{
+        CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction as Ins,
+        MemorySection, MemoryType, Module, TypeSection, ValType,
+    };
+
+    // The allocation helpers are bit-faithful ports of `crate::alloc`. Their
+    // leaf transcendental helpers (`exp`/`pow`) are NOT bit-identical to the
+    // VM's libm -- they are the open-coded approximations of Phase 2, pinned in
+    // `super::super::math` to abs 0.0 / rel ~1e-12 vs `f64`. So the alloc helpers
+    // can only match the Rust `crate::alloc` reference (which uses libm) to that
+    // leaf tolerance, propagated through the curves and the bisection.
+    //
+    // Documented tolerances (all far inside the corpus bar of abs 2e-3 /
+    // rel 5e-6):
+    // - erfc_approx / normal_cdf: abs 1e-12 OR rel 1e-12. erfc's only
+    //   transcendental is one `exp` call (rel ~1e-12); the polynomial is exact
+    //   arithmetic, so the wasm result tracks `crate::alloc::erfc_approx` to the
+    //   exp helper's tolerance.
+    // - alloc_curve: abs 1e-9 OR rel 1e-9 across all six branches. Most use at
+    //   most one exp/normal_cdf (rel ~1e-12); CES adds a `pow = exp(y*ln x)`
+    //   (pinned at rel ~2.3e-12). The uniform 1e-9 bar leaves ample slack for
+    //   the leaf approximations + DLR-FT-vs-native rounding drift.
+    // - allocate_available: abs 1e-9 OR rel 1e-9 -- the converged price rides on
+    //   the curve tolerance, and the per-requester allocation is one more curve
+    //   evaluation at that price.
+    const ERFC_ABS: f64 = 1e-12;
+    const ERFC_REL: f64 = 1e-12;
+    const CURVE_ABS: f64 = 1e-9;
+    const CURVE_REL: f64 = 1e-9;
+    const ALLOC_ABS: f64 = 1e-9;
+    const ALLOC_REL: f64 = 1e-9;
+
+    /// Assert `got` matches `want` within absolute *or* relative tolerance,
+    /// propagating NaN/inf. Mirrors `super::super::math`'s `assert_close`.
+    fn assert_close(name: &str, got: f64, want: f64, abs_tol: f64, rel_tol: f64) {
+        if want.is_nan() {
+            assert!(got.is_nan(), "{name}: expected NaN, got {got}");
+            return;
+        }
+        assert!(!got.is_nan(), "{name}: got NaN, expected {want}");
+        if want.is_infinite() {
+            assert_eq!(got, want, "{name}: expected {want}, got {got}");
+            return;
+        }
+        let abs = (got - want).abs();
+        let rel = if want != 0.0 { abs / want.abs() } else { abs };
+        assert!(
+            abs <= abs_tol || rel <= rel_tol,
+            "{name}: got {got}, want {want} (abs {abs:.3e}, rel {rel:.3e})"
+        );
+    }
+
+    /// A linear sample of `n+1` points across `[lo, hi]` inclusive.
+    fn linspace(lo: f64, hi: f64, n: usize) -> Vec<f64> {
+        (0..=n)
+            .map(|i| lo + (hi - lo) * (i as f64) / (n as f64))
+            .collect()
+    }
+
+    /// Which value-producing alloc helper a test module exports as `f`.
+    ///
+    /// The DLR-FT interop only types tuples up to arity 3, so the unary helpers
+    /// (`Erfc`/`NormalCdf`) export `f(x: f64) -> f64` directly, while the
+    /// six-argument `AllocCurve` exports `f(args_ptr: i32) -> f64` and reads its
+    /// six f64 arguments from `mem[args_ptr + k*8]`.
+    #[derive(Clone, Copy)]
+    enum Which {
+        Erfc,
+        NormalCdf,
+        AllocCurve,
+    }
+
+    fn helper_index(which: Which) -> u32 {
+        let h = build_helpers().fns;
+        match which {
+            Which::Erfc => h.erfc_approx,
+            Which::NormalCdf => h.normal_cdf,
+            Which::AllocCurve => h.alloc_curve,
+        }
+    }
+
+    /// Build a module with every helper body plus a thin exported `f` forwarding
+    /// to the helper under test, and a memory (the GF lookup helpers, also
+    /// bundled, `f64.load` from memory 0). For a unary helper `f(x: f64) -> f64`
+    /// calls directly; for `AllocCurve` (six args) `f(args_ptr: i32) -> f64`
+    /// loads the six args from `mem[args_ptr + k*8]` and calls the helper.
+    /// Mirrors `super::super::math`'s `build_helper_module` layout (helpers at
+    /// `0..N`, wrapper at `N`).
+    fn build_value_module(which: Which) -> Vec<u8> {
+        let helpers = build_helpers();
+        let n_helpers = helpers.functions.len() as u32;
+        let target = helper_index(which);
+        let is_curve = matches!(which, Which::AllocCurve);
+
+        let mut module = Module::new();
+
+        let mut types = TypeSection::new();
+        if is_curve {
+            types.ty().function([ValType::I32], [ValType::F64]);
+        } else {
+            types.ty().function([ValType::F64], [ValType::F64]);
+        }
+        for hf in &helpers.functions {
+            types.ty().function(hf.params.clone(), hf.results.clone());
+        }
+        module.section(&types);
+
+        let mut functions = FunctionSection::new();
+        for (i, _) in helpers.functions.iter().enumerate() {
+            functions.function(1 + i as u32);
+        }
+        functions.function(0);
+        module.section(&functions);
+
+        let mut memories = MemorySection::new();
+        memories.memory(MemoryType {
+            minimum: 1,
+            maximum: None,
+            memory64: false,
+            shared: false,
+            page_size_log2: None,
+        });
+        module.section(&memories);
+
+        let mut exports = ExportSection::new();
+        exports.export("f", ExportKind::Func, n_helpers);
+        exports.export("mem", ExportKind::Memory, 0);
+        module.section(&exports);
+
+        let mut code = CodeSection::new();
+        for hf in &helpers.functions {
+            code.function(&hf.body);
+        }
+        let mut wrapper = Function::new([]);
+        if is_curve {
+            // Load the six f64 args from mem[args_ptr + k*8] (args_ptr is param 0).
+            for k in 0..6u64 {
+                wrapper.instruction(&Ins::LocalGet(0));
+                wrapper.instruction(&Ins::F64Load(f64_memarg_off(k * 8)));
+            }
+        } else {
+            wrapper.instruction(&Ins::LocalGet(0));
+        }
+        wrapper.instruction(&Ins::Call(target));
+        wrapper.instruction(&Ins::End);
+        code.function(&wrapper);
+        module.section(&code);
+
+        module.finish()
+    }
+
+    fn run_unary(which: Which, x: f64) -> f64 {
+        let bytes = build_value_module(which);
+        let info = validate(&bytes).expect("helper module must validate");
+        let mut store = Store::new(());
+        let module = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        let f = store
+            .instance_export(module, "f")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        store
+            .invoke_simple_typed::<(f64,), f64>(f, (x,))
+            .expect("invoke")
+    }
+
+    /// Byte address the `AllocCurve` wrapper reads its six f64 args from.
+    const CURVE_ARGS_BASE: u32 = 512;
+
+    /// Run `alloc_curve(p, request, ptype, ppriority, pwidth, pextra)` under the
+    /// interpreter. The six args are seeded into memory at [`CURVE_ARGS_BASE`]
+    /// (`ptype` as an integer-valued f64) and the wrapper reads them back.
+    fn run_alloc_curve(p: f64, request: f64, ptype: i32, pp: f64, pw: f64, pe: f64) -> f64 {
+        let bytes = build_value_module(Which::AllocCurve);
+        let info = validate(&bytes).expect("alloc_curve module must validate");
+        let mut store = Store::new(());
+        let module = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        let args = [p, request, ptype as f64, pp, pw, pe];
+        let mem = store
+            .instance_export(module, "mem")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        store.mem_access_mut_slice(mem, |b| {
+            for (k, &v) in args.iter().enumerate() {
+                let a = CURVE_ARGS_BASE as usize + k * 8;
+                b[a..a + 8].copy_from_slice(&v.to_le_bytes());
+            }
+        });
+        let f = store
+            .instance_export(module, "f")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        store
+            .invoke_simple_typed::<(i32,), f64>(f, (CURVE_ARGS_BASE as i32,))
+            .expect("invoke")
+    }
+
+    // ── erfc_approx parity vs crate::alloc::erfc_approx (AC7.1) ──────────────
+
+    #[test]
+    fn erfc_approx_matches_rust_over_sampled_range() {
+        // Sweep both signs (z<0 takes the `2 - erfc_approx(-z)` symmetry branch)
+        // across the range where erfc is numerically interesting; the A-S 26.2.17
+        // approximation is what the Rust reference uses too, so the wasm result
+        // tracks it to the `exp` helper's tolerance.
+        for z in linspace(-6.0, 6.0, 400) {
+            let got = run_unary(Which::Erfc, z);
+            let want = crate::alloc::erfc_approx(z);
+            assert_close(&format!("erfc_approx({z})"), got, want, ERFC_ABS, ERFC_REL);
+        }
+        // Anchor at z=0 (t=1): the wasm result tracks the Rust reference, which
+        // is ~0.9999999990 there -- the A-S 26.2.17 approximation, not the
+        // mathematical erfc(0)=1.
+        assert_close(
+            "erfc_approx(0)",
+            run_unary(Which::Erfc, 0.0),
+            crate::alloc::erfc_approx(0.0),
+            ERFC_ABS,
+            ERFC_REL,
+        );
+    }
+
+    // ── normal_cdf parity vs crate::alloc::normal_cdf (AC7.1) ────────────────
+
+    #[test]
+    fn normal_cdf_matches_rust_over_sampled_range() {
+        for x in linspace(-6.0, 6.0, 400) {
+            let got = run_unary(Which::NormalCdf, x);
+            let want = crate::alloc::normal_cdf(x);
+            assert_close(&format!("normal_cdf({x})"), got, want, ERFC_ABS, ERFC_REL);
+        }
+        // NaN propagates (the explicit `x.is_nan()` guard).
+        assert!(run_unary(Which::NormalCdf, f64::NAN).is_nan());
+        // normal_cdf(0) tracks the Rust reference. (The A-S 26.2.17 erfc
+        // polynomial is ~0.4999999995 at x=0, NOT exactly 0.5 -- the ~1.5e-7
+        // approximation error is a property of the reference itself, so parity
+        // is judged against `crate::alloc::normal_cdf`, not ideal math.)
+        assert_close(
+            "normal_cdf(0)",
+            run_unary(Which::NormalCdf, 0.0),
+            crate::alloc::normal_cdf(0.0),
+            ERFC_ABS,
+            ERFC_REL,
+        );
+    }
+
+    // ── alloc_curve parity for each of the 6 profile types + the >=10 floor ──
+
+    /// Assert the emitted `alloc_curve` matches `crate::alloc::alloc_curve` over
+    /// a grid of prices for one profile `(ptype, ppriority, pwidth, pextra)` and
+    /// a fixed positive request.
+    fn assert_curve_matches(ptype: i32, pp: f64, pw: f64, pe: f64, request: f64) {
+        for p in linspace(-3.0, 8.0, 120) {
+            let got = run_alloc_curve(p, request, ptype, pp, pw, pe);
+            let want = crate::alloc::alloc_curve(p, request, ptype, pp, pw, pe);
+            assert_close(
+                &format!("alloc_curve(p={p}, ptype={ptype}, pp={pp}, pw={pw}, pe={pe})"),
+                got,
+                want,
+                CURVE_ABS,
+                CURVE_REL,
+            );
+        }
+    }
+
+    #[test]
+    fn alloc_curve_fixed_matches_rust() {
+        // ptype 0: fixed quantity (p <= ppriority ? request : 0).
+        assert_curve_matches(0, 2.0, 1.0, 0.0, 5.0);
+    }
+
+    #[test]
+    fn alloc_curve_rectangular_matches_rust() {
+        // ptype 1: rectangular survival.
+        assert_curve_matches(1, 3.0, 1.5, 0.0, 4.0);
+    }
+
+    #[test]
+    fn alloc_curve_triangular_matches_rust() {
+        // ptype 2: triangular survival (both p<=ppriority and p>ppriority arms).
+        assert_curve_matches(2, 2.5, 2.0, 0.0, 7.0);
+    }
+
+    #[test]
+    fn alloc_curve_normal_matches_rust() {
+        // ptype 3: normal survival via normal_cdf. Also exercise the pwidth<=0
+        // degenerate-to-fixed arm.
+        assert_curve_matches(3, 2.0, 1.0, 0.0, 6.0);
+        assert_curve_matches(3, 2.0, 0.0, 0.0, 6.0); // pwidth <= 0 -> fixed
+    }
+
+    #[test]
+    fn alloc_curve_exponential_matches_rust() {
+        // ptype 4: symmetric exponential (both z>0 and z<=0 arms). Also the
+        // pwidth<=0 degenerate-to-fixed arm.
+        assert_curve_matches(4, 2.0, 1.0, 0.0, 8.0);
+        assert_curve_matches(4, 2.0, -1.0, 0.0, 8.0); // pwidth <= 0 -> fixed
+    }
+
+    #[test]
+    fn alloc_curve_ces_matches_rust() {
+        // ptype 5: CES (uses pow). pextra is the elasticity. The grid spans
+        // p<=0 (->1), ppriority>0 normal case, and large-elasticity values that
+        // push q toward +inf (->1).
+        assert_curve_matches(5, 3.0, 1.0, 1.0, 5.0);
+        assert_curve_matches(5, 3.0, 1.0, 4.0, 5.0);
+        // ppriority <= 0 -> 0 for any positive price.
+        assert_curve_matches(5, 0.0, 1.0, 2.0, 5.0);
+    }
+
+    #[test]
+    fn alloc_curve_floor_flag_matches_rust() {
+        // ptype >= 10 floors the allocation. ptype 10 is rectangular(0)+floor,
+        // 11 is rectangular(1)+floor, etc. Pick a request that yields a
+        // fractional allocation so the floor is observable.
+        for ptype in [10, 11, 13, 14, 15] {
+            assert_curve_matches(ptype, 2.5, 1.5, 1.0, 3.3);
+        }
+    }
+
+    #[test]
+    fn alloc_curve_nonpositive_request_is_zero() {
+        // request <= 0 -> 0 for every profile, regardless of price/type.
+        for &request in &[0.0, -1.0, -100.0] {
+            for ptype in 0..6 {
+                let got = run_alloc_curve(1.0, request, ptype, 2.0, 1.0, 1.0);
+                let want = crate::alloc::alloc_curve(1.0, request, ptype, 2.0, 1.0, 1.0);
+                assert_eq!(got, want, "request {request}, ptype {ptype}");
+                assert_eq!(got, 0.0);
+            }
+        }
+    }
+
+    // ── allocate_available parity vs crate::alloc::allocate_available ────────
+
+    // Scratch byte layout for the `allocate_available` helper test: the i32
+    // requester count at N_ADDR, requests at REQ_BASE, profiles at PROF_BASE
+    // (4 f64/requester), out at OUT_BASE. All 8-byte aligned (N_ADDR 4-byte),
+    // comfortably inside the single 64 KiB memory page.
+    const N_ADDR: u32 = 64;
+    const REQ_BASE: u32 = 256;
+    const PROF_BASE: u32 = 1024;
+    const OUT_BASE: u32 = 4096;
+
+    /// Build a module with every helper body plus an exported `alloc(avail: f64)`
+    /// wrapper that calls `allocate_available(REQ_BASE, n, PROF_BASE, avail,
+    /// OUT_BASE)` with the array pointers hard-coded to the test's scratch bases
+    /// and `n` read from `mem[N_ADDR]` (an i32). A single f64 param keeps the
+    /// wrapper inside the DLR-FT interop's typed-tuple arity limit; the array
+    /// pointers and `n` are seeded into memory by the test.
+    fn build_allocate_module() -> Vec<u8> {
+        let helpers = build_helpers();
+        let n_helpers = helpers.functions.len() as u32;
+        let target = helpers.fns.allocate_available;
+
+        let mut module = Module::new();
+
+        let mut types = TypeSection::new();
+        // alloc(avail: f64) -> ()
+        types.ty().function([ValType::F64], []);
+        for hf in &helpers.functions {
+            types.ty().function(hf.params.clone(), hf.results.clone());
+        }
+        module.section(&types);
+
+        let mut functions = FunctionSection::new();
+        for (i, _) in helpers.functions.iter().enumerate() {
+            functions.function(1 + i as u32);
+        }
+        functions.function(0);
+        module.section(&functions);
+
+        let mut memories = MemorySection::new();
+        memories.memory(MemoryType {
+            minimum: 1,
+            maximum: None,
+            memory64: false,
+            shared: false,
+            page_size_log2: None,
+        });
+        module.section(&memories);
+
+        let mut exports = ExportSection::new();
+        exports.export("alloc", ExportKind::Func, n_helpers);
+        exports.export("mem", ExportKind::Memory, 0);
+        module.section(&exports);
+
+        let mut code = CodeSection::new();
+        for hf in &helpers.functions {
+            code.function(&hf.body);
+        }
+        let mut wrapper = Function::new([]);
+        // allocate_available(REQ_BASE, mem[N_ADDR] as i32, PROF_BASE, avail, OUT_BASE)
+        wrapper.instruction(&Ins::I32Const(REQ_BASE as i32));
+        wrapper.instruction(&Ins::I32Const(0));
+        wrapper.instruction(&Ins::I32Load(wasm_encoder::MemArg {
+            offset: u64::from(N_ADDR),
+            align: 2,
+            memory_index: 0,
+        }));
+        wrapper.instruction(&Ins::I32Const(PROF_BASE as i32));
+        wrapper.instruction(&Ins::LocalGet(0)); // avail (f64 param)
+        wrapper.instruction(&Ins::I32Const(OUT_BASE as i32));
+        wrapper.instruction(&Ins::Call(target));
+        wrapper.instruction(&Ins::End);
+        code.function(&wrapper);
+        module.section(&code);
+
+        module.finish()
+    }
+
+    /// Run the emitted `allocate_available` over `requests`/`profiles` and read
+    /// back the `n` result slots; compare against `crate::alloc::allocate_available`.
+    fn assert_allocate_matches(requests: &[f64], profiles: &[(f64, f64, f64, f64)], avail: f64) {
+        assert_eq!(requests.len(), profiles.len());
+        let n = requests.len();
+        let bytes = build_allocate_module();
+        let info = validate(&bytes).expect("allocate module must validate");
+        let mut store = Store::new(());
+        let inst = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+
+        // Seed n, requests, and profiles into scratch memory.
+        let mem = store
+            .instance_export(inst, "mem")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        store.mem_access_mut_slice(mem, |b| {
+            let na = N_ADDR as usize;
+            b[na..na + 4].copy_from_slice(&(n as i32).to_le_bytes());
+            for (i, &r) in requests.iter().enumerate() {
+                let a = REQ_BASE as usize + i * 8;
+                b[a..a + 8].copy_from_slice(&r.to_le_bytes());
+            }
+            for (i, &(pt, pp, pw, pe)) in profiles.iter().enumerate() {
+                let base = PROF_BASE as usize + i * 32;
+                b[base..base + 8].copy_from_slice(&pt.to_le_bytes());
+                b[base + 8..base + 16].copy_from_slice(&pp.to_le_bytes());
+                b[base + 16..base + 24].copy_from_slice(&pw.to_le_bytes());
+                b[base + 24..base + 32].copy_from_slice(&pe.to_le_bytes());
+            }
+        });
+
+        let alloc = store
+            .instance_export(inst, "alloc")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        store
+            .invoke_simple_typed::<(f64,), ()>(alloc, (avail,))
+            .expect("invoke");
+
+        let got: Vec<f64> = store.mem_access_mut_slice(mem, |b| {
+            (0..n)
+                .map(|i| {
+                    let a = OUT_BASE as usize + i * 8;
+                    f64::from_le_bytes(b[a..a + 8].try_into().unwrap())
+                })
+                .collect()
+        });
+        let want = crate::alloc::allocate_available(requests, profiles, avail);
+        assert_eq!(want.len(), n);
+        for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() {
+            assert_close(
+                &format!("allocate_available[{i}]"),
+                g,
+                w,
+                ALLOC_ABS,
+                ALLOC_REL,
+            );
+        }
+    }
+
+    #[test]
+    fn allocate_available_full_grant_when_supply_exceeds_demand() {
+        // avail >= total_demand: each requester gets r.max(0). A negative request
+        // clamps to 0 (the `r.max(0.0)` arm).
+        let requests = [3.0, 2.0, -1.0, 4.0];
+        let profiles = [
+            (1.0, 1.0, 1.0, 0.0),
+            (1.0, 2.0, 1.0, 0.0),
+            (1.0, 3.0, 1.0, 0.0),
+            (1.0, 1.5, 1.0, 0.0),
+        ];
+        // total_demand = 3+2+4 = 9 (the negative request is excluded).
+        assert_allocate_matches(&requests, &profiles, 100.0);
+    }
+
+    #[test]
+    fn allocate_available_zeros_when_supply_nonpositive() {
+        // avail <= 0: all zeros.
+        let requests = [3.0, 2.0, 4.0];
+        let profiles = [
+            (1.0, 1.0, 1.0, 0.0),
+            (1.0, 2.0, 1.0, 0.0),
+            (1.0, 3.0, 1.0, 0.0),
+        ];
+        assert_allocate_matches(&requests, &profiles, 0.0);
+        assert_allocate_matches(&requests, &profiles, -5.0);
+    }
+
+    #[test]
+    fn allocate_available_partial_bisection_rectangular() {
+        // The interesting case: 0 < avail < total_demand, so the bisection runs.
+        // Rectangular profiles (ptype 1) with distinct priorities, mirroring the
+        // `allocate.mdl` shape.
+        let requests = [3.0, 2.0, 4.0];
+        let profiles = [
+            (1.0, 1.0, 1.0, 0.0),
+            (1.0, 2.0, 1.0, 0.0),
+            (1.0, 3.0, 1.0, 0.0),
+        ];
+        // total_demand = 9; supply 5 forces a partial allocation.
+        for avail in [1.0, 3.0, 5.0, 7.0, 8.5] {
+            assert_allocate_matches(&requests, &profiles, avail);
+        }
+    }
+
+    #[test]
+    fn allocate_available_partial_bisection_across_profile_types() {
+        // Partial allocation with a mix of profile types, exercising the
+        // search-range `spread` per type and the per-requester curve at the
+        // converged price.
+        let requests = [4.0, 3.0, 5.0, 2.0, 6.0];
+        let profiles = [
+            (0.0, 2.0, 1.0, 0.0), // fixed
+            (2.0, 3.0, 1.5, 0.0), // triangular
+            (3.0, 2.5, 1.0, 0.0), // normal
+            (4.0, 2.0, 1.2, 0.0), // exponential
+            (5.0, 3.0, 1.0, 2.0), // CES
+        ];
+        // total_demand = 20; sweep several partial supplies.
+        for avail in [2.0, 6.0, 10.0, 15.0, 19.0] {
+            assert_allocate_matches(&requests, &profiles, avail);
+        }
+    }
+
+    #[test]
+    fn allocate_available_empty_requesters_is_noop() {
+        // n == 0: nothing is written (the helper returns immediately). Exercised
+        // by passing zero requesters; the read-back loop covers zero slots, so
+        // this simply must not trap.
+        assert_allocate_matches(&[], &[], 10.0);
+    }
+}
diff --git a/src/simlin-engine/src/wasmgen/lookup.rs b/src/simlin-engine/src/wasmgen/lookup.rs
new file mode 100644
index 000000000..7701b2975
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/lookup.rs
@@ -0,0 +1,657 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+// pattern: Functional Core
+// Pure transformation: each public function emits a self-contained wasm helper
+// `Function` (instruction sequence) for one graphical-function lookup mode. No
+// I/O; the only side effect is in `#[cfg(test)]`, which executes the emitted
+// helpers under the DLR-FT interpreter and compares against the VM's lookup
+// functions.
+
+//! Graphical-function lookup helper functions for the wasm simulation backend.
+//!
+//! The bytecode VM resolves a `Lookup` opcode against a `&[(f64, f64)]` table
+//! through one of three functions (`vm.rs:3055-3186`): `lookup` (linear
+//! interpolation), `lookup_forward` (step up), and `lookup_backward` (step
+//! down). This module emits one wasm helper per mode -- `lookup_interp`,
+//! `lookup_forward`, `lookup_backward` -- each over a flat
+//! `(data_off: i32, count: i32, index: f64) -> f64` interface, where the table
+//! lives in linear memory as `count` consecutive f64 LE `(x, y)` knot pairs
+//! starting at byte offset `data_off` (so knot `k` is
+//! `x = f64.load[data_off + 16*k]`, `y = f64.load[data_off + 16*k + 8]`).
+//! `module.rs` lays these regions out (see `build_gf_regions`); the `Lookup`
+//! opcode (`lower.rs`) reads `(data_off, count)` from the GF directory and
+//! `call`s the mode's helper.
+//!
+//! ## The three functions are NOT one function
+//!
+//! They differ in three ways, mirrored here exactly so the backend takes the
+//! same branch the VM does:
+//! - **edge clamps**: `lookup_interp` clamps *strictly* (`index < x[0]` /
+//!   `index > x[n-1]`); `forward`/`backward` clamp *inclusively* (`<=` / `>=`).
+//! - **search**: `interp`/`forward` use a *lower-bound* search
+//!   (`x[mid] < index`); `backward` uses an *upper-bound* search
+//!   (`x[mid] <= index`).
+//! - **result**: `interp` either returns `y[low]` exactly (when
+//!   `approx_eq(x[low], index)`, via the Phase 2 helper) or linearly
+//!   interpolates between knots `low-1` and `low`; `forward` returns `y[low]`;
+//!   `backward` returns `y[low-1]` (the last knot with `x <= index`; for
+//!   duplicate x-values, the LAST such knot, since the upper-bound search lands
+//!   past every equal x).
+//!
+//! Each helper guards `count == 0` and a NaN `index` by returning NaN, matching
+//! the VM's `table.is_empty()` / `index.is_nan()` early returns.
+
+use wasm_encoder::{BlockType, Function, Instruction as Ins, MemArg, ValType};
+
+/// Bytes per knot: an f64 `x` followed by an f64 `y`.
+const KNOT_BYTES: i32 = 16;
+
+// Helper local layout. Params 0..2 are `data_off`/`count`/`index`; the i32
+// search cursors follow.
+const DATA_OFF: u32 = 0; // i32 byte offset of knot 0
+const COUNT: u32 = 1; // i32 point count
+const INDEX: u32 = 2; // f64 lookup index
+const LOW: u32 = 3; // i32 binary-search low
+const HIGH: u32 = 4; // i32 binary-search high
+const MID: u32 = 5; // i32 binary-search midpoint
+
+/// An 8-byte (f64) memory access with a static byte `offset` on top of the
+/// dynamic address already on the stack. The data region is 8-byte aligned (see
+/// `module.rs`), so the natural-alignment hint is valid.
+fn knot_memarg(offset: u64) -> MemArg {
+    MemArg {
+        offset,
+        align: 3, // log2(8): an 8-byte f64 access
+        memory_index: 0,
+    }
+}
+
+/// Push the byte address of knot `k` (the i32 in `k_local`):
+/// `data_off + 16*k`. A subsequent `f64.load` with `knot_memarg(0)` reads `x`,
+/// `knot_memarg(8)` reads `y`.
+fn push_knot_addr(f: &mut Function, k_local: u32) {
+    f.instruction(&Ins::LocalGet(DATA_OFF));
+    f.instruction(&Ins::LocalGet(k_local));
+    f.instruction(&Ins::I32Const(KNOT_BYTES));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::I32Add);
+}
+
+/// Push `x[k]` for the knot index in `k_local`.
+fn push_x(f: &mut Function, k_local: u32) {
+    push_knot_addr(f, k_local);
+    f.instruction(&Ins::F64Load(knot_memarg(0)));
+}
+
+/// Push `y[k]` for the knot index in `k_local`.
+fn push_y(f: &mut Function, k_local: u32) {
+    push_knot_addr(f, k_local);
+    f.instruction(&Ins::F64Load(knot_memarg(8)));
+}
+
+/// Push `x[count-1]` (the last knot's x). Computed without a dedicated local by
+/// pushing the address `data_off + 16*(count-1)` inline.
+fn push_last_x(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(DATA_OFF));
+    f.instruction(&Ins::LocalGet(COUNT));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::I32Const(KNOT_BYTES));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::F64Load(knot_memarg(0)));
+}
+
+/// Push `y[count-1]` (the last knot's y).
+fn push_last_y(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(DATA_OFF));
+    f.instruction(&Ins::LocalGet(COUNT));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::I32Const(KNOT_BYTES));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::F64Load(knot_memarg(8)));
+}
+
+/// Emit the two early guards every lookup function shares: `count == 0 -> NaN`
+/// and `index != index (NaN) -> NaN`. Mirrors the VM's `table.is_empty()` and
+/// `index.is_nan()` early returns.
+fn emit_empty_and_nan_guards(f: &mut Function) {
+    // if count == 0 { return NaN }
+    f.instruction(&Ins::LocalGet(COUNT));
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&Ins::F64Const(f64::NAN.into()));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // if index != index { return NaN }  (the NaN test)
+    f.instruction(&Ins::LocalGet(INDEX));
+    f.instruction(&Ins::LocalGet(INDEX));
+    f.instruction(&Ins::F64Ne);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&Ins::F64Const(f64::NAN.into()));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+}
+
+/// Emit the binary search over `[LOW, HIGH)` into `LOW`. `mid_cmp_is_lt` selects
+/// the predicate: `true` -> lower bound (`x[mid] < index`), `false` -> upper
+/// bound (`x[mid] <= index`). On exit `LOW` is the first index whose `x` fails
+/// the predicate (the lower/upper bound), exactly matching the VM's
+/// `while low < high { mid; if pred { low = mid+1 } else { high = mid } }`.
+///
+/// `LOW`/`HIGH` must already be initialized (to `0`/`count`).
+fn emit_binary_search(f: &mut Function, mid_cmp_is_lt: bool) {
+    f.instruction(&Ins::Block(BlockType::Empty)); // $exit
+    f.instruction(&Ins::Loop(BlockType::Empty)); // $top
+
+    // while-head: if !(low < high) break  (br depth 1 -> $exit)
+    f.instruction(&Ins::LocalGet(LOW));
+    f.instruction(&Ins::LocalGet(HIGH));
+    f.instruction(&Ins::I32LtS);
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::BrIf(1));
+
+    // mid = low + (high - low) / 2  (all non-negative; signed div is exact)
+    f.instruction(&Ins::LocalGet(LOW));
+    f.instruction(&Ins::LocalGet(HIGH));
+    f.instruction(&Ins::LocalGet(LOW));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::I32Const(2));
+    f.instruction(&Ins::I32DivS);
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::LocalSet(MID));
+
+    // pred = x[mid] {<, <=} index
+    push_x(f, MID);
+    f.instruction(&Ins::LocalGet(INDEX));
+    if mid_cmp_is_lt {
+        f.instruction(&Ins::F64Lt);
+    } else {
+        f.instruction(&Ins::F64Le);
+    }
+    f.instruction(&Ins::If(BlockType::Empty));
+    // low = mid + 1
+    f.instruction(&Ins::LocalGet(MID));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::LocalSet(LOW));
+    f.instruction(&Ins::Else);
+    // high = mid
+    f.instruction(&Ins::LocalGet(MID));
+    f.instruction(&Ins::LocalSet(HIGH));
+    f.instruction(&Ins::End);
+
+    f.instruction(&Ins::Br(0)); // continue -> $top
+    f.instruction(&Ins::End); // end loop
+    f.instruction(&Ins::End); // end block
+}
+
+/// Initialize `LOW = 0; HIGH = count`.
+fn emit_init_search_bounds(f: &mut Function) {
+    f.instruction(&Ins::I32Const(0));
+    f.instruction(&Ins::LocalSet(LOW));
+    f.instruction(&Ins::LocalGet(COUNT));
+    f.instruction(&Ins::LocalSet(HIGH));
+}
+
+/// Build the body of `lookup_interp(data_off: i32, count: i32, index: f64)
+/// -> f64`, reproducing the VM's `lookup` (`vm.rs:3055-3102`) exactly:
+/// empty/NaN -> NaN; **strict** edge clamps (`index < x[0]` -> `y[0]`,
+/// `index > x[n-1]` -> `y[n-1]`); lower-bound binary search; then at `i = low`,
+/// `approx_eq(x[i], index)` -> `y[i]`, else linear interpolation between knots
+/// `i-1` and `i`.
+///
+/// `approx_eq_idx` is the module function index of the Phase 2 `approx_eq`
+/// helper (`lower::HelperFns::approx_eq`); the at-knot exact-hit test `call`s it
+/// so the backend matches the VM's `crate::float::approx_eq` branch.
+pub(crate) fn emit_lookup_interp(approx_eq_idx: u32) -> Function {
+    let mut f = Function::new([(3, ValType::I32)]); // LOW/HIGH/MID
+
+    emit_empty_and_nan_guards(&mut f);
+
+    // if index < x[0] { return y[0] }  (strict)
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_x_const0(&mut f);
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_y_const0(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // if index > x[count-1] { return y[count-1] }  (strict)
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_last_x(&mut f);
+    f.instruction(&Ins::F64Gt);
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_last_y(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    emit_init_search_bounds(&mut f);
+    emit_binary_search(&mut f, true); // lower bound
+
+    // i = low. if approx_eq(x[i], index) { return y[i] }
+    push_x(&mut f, LOW);
+    f.instruction(&Ins::LocalGet(INDEX));
+    f.instruction(&Ins::Call(approx_eq_idx));
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_y(&mut f, LOW);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // else linear interp:
+    //   slope = (y[i] - y[i-1]) / (x[i] - x[i-1])
+    //   result = (index - x[i-1]) * slope + y[i-1]
+    // Reuse MID as the i32 holding `i-1` so x[i-1]/y[i-1] reuse push_x/push_y.
+    f.instruction(&Ins::LocalGet(LOW));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::LocalSet(MID)); // MID = i-1
+
+    // (index - x[i-1])
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_x(&mut f, MID);
+    f.instruction(&Ins::F64Sub);
+    // * slope
+    push_y(&mut f, LOW);
+    push_y(&mut f, MID);
+    f.instruction(&Ins::F64Sub); // y[i] - y[i-1]
+    push_x(&mut f, LOW);
+    push_x(&mut f, MID);
+    f.instruction(&Ins::F64Sub); // x[i] - x[i-1]
+    f.instruction(&Ins::F64Div); // slope
+    f.instruction(&Ins::F64Mul); // (index - x[i-1]) * slope
+    // + y[i-1]
+    push_y(&mut f, MID);
+    f.instruction(&Ins::F64Add);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+/// Build the body of `lookup_forward(data_off, count, index) -> f64`,
+/// reproducing the VM's `lookup_forward` (`vm.rs:3104-3142`): empty/NaN -> NaN;
+/// **inclusive** edge clamps (`index <= x[0]` -> `y[0]`, `index >= x[n-1]` ->
+/// `y[n-1]`); the same lower-bound binary search; return `y[low]`. No
+/// `approx_eq`, no interpolation.
+pub(crate) fn emit_lookup_forward() -> Function {
+    let mut f = Function::new([(3, ValType::I32)]); // LOW/HIGH/MID
+
+    emit_empty_and_nan_guards(&mut f);
+
+    // if index <= x[0] { return y[0] }  (inclusive)
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_x_const0(&mut f);
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_y_const0(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // if index >= x[count-1] { return y[count-1] }  (inclusive)
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_last_x(&mut f);
+    f.instruction(&Ins::F64Ge);
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_last_y(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    emit_init_search_bounds(&mut f);
+    emit_binary_search(&mut f, true); // lower bound
+
+    // return y[low]
+    push_y(&mut f, LOW);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+/// Build the body of `lookup_backward(data_off, count, index) -> f64`,
+/// reproducing the VM's `lookup_backward` (`vm.rs:3144-3186`): empty/NaN ->
+/// NaN; **inclusive** edge clamps; an **upper-bound** binary search
+/// (`x[mid] <= index`); return `y[low-1]` (the last knot with `x <= index`; for
+/// duplicate x-values, the LAST one). No `approx_eq`, no interpolation.
+pub(crate) fn emit_lookup_backward() -> Function {
+    let mut f = Function::new([(3, ValType::I32)]); // LOW/HIGH/MID
+
+    emit_empty_and_nan_guards(&mut f);
+
+    // if index <= x[0] { return y[0] }  (inclusive)
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_x_const0(&mut f);
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_y_const0(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // if index >= x[count-1] { return y[count-1] }  (inclusive)
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_last_x(&mut f);
+    f.instruction(&Ins::F64Ge);
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_last_y(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    emit_init_search_bounds(&mut f);
+    emit_binary_search(&mut f, false); // upper bound
+
+    // return y[low-1]  (reuse MID as low-1)
+    f.instruction(&Ins::LocalGet(LOW));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::LocalSet(MID));
+    push_y(&mut f, MID);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+/// Push `x[0]` (`f64.load[data_off + 0]`). The knot-0 address is just
+/// `data_off`, so no index arithmetic is needed.
+fn push_x_const0(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(DATA_OFF));
+    f.instruction(&Ins::F64Load(knot_memarg(0)));
+}
+
+/// Push `y[0]` (`f64.load[data_off + 8]`).
+fn push_y_const0(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(DATA_OFF));
+    f.instruction(&Ins::F64Load(knot_memarg(8)));
+}
+
+#[cfg(test)]
+mod tests {
+    use super::super::lower::build_helpers;
+    use checked::Store;
+    use wasm::validate;
+    use wasm_encoder::{
+        CodeSection, ConstExpr, DataSection, ExportKind, ExportSection, Function, FunctionSection,
+        Instruction, MemorySection, MemoryType, Module, TypeSection, ValType,
+    };
+
+    /// Which lookup helper a test module exports as `f`.
+    #[derive(Clone, Copy, Debug)]
+    enum Mode {
+        Interp,
+        Forward,
+        Backward,
+    }
+
+    /// Resolve a [`Mode`] to its helper function index in the assembled table.
+    fn helper_index(mode: Mode) -> u32 {
+        let h = build_helpers().fns;
+        match mode {
+            Mode::Interp => h.lookup_interp,
+            Mode::Forward => h.lookup_forward,
+            Mode::Backward => h.lookup_backward,
+        }
+    }
+
+    /// The byte offset the test harness writes the table to (one f64 in, so a
+    /// non-zero `data_off` is exercised rather than the degenerate 0).
+    const TABLE_BASE: u32 = 8;
+
+    /// Build a module containing *every* helper body (so `lookup_interp`'s
+    /// `call approx_eq` resolves) plus a thin exported wrapper
+    /// `f(data_off: i32, count: i32, index: f64) -> f64` forwarding to the
+    /// helper-under-test, and an exported `memory` seeded with `knots` at
+    /// [`TABLE_BASE`] via an active data segment. Mirrors `lower.rs`'s
+    /// production assembly: helpers occupy function indices `0..N`, the wrapper
+    /// follows at `N`.
+    fn build_lookup_module(mode: Mode, knots: &[(f64, f64)]) -> Vec<u8> {
+        let helpers = build_helpers();
+        let n_helpers = helpers.functions.len() as u32;
+        let target = helper_index(mode);
+
+        let mut module = Module::new();
+
+        // Type 0 is the wrapper `(i32, i32, f64) -> f64`; helper types follow.
+        let mut types = TypeSection::new();
+        types
+            .ty()
+            .function([ValType::I32, ValType::I32, ValType::F64], [ValType::F64]);
+        for hf in &helpers.functions {
+            types.ty().function(hf.params.clone(), hf.results.clone());
+        }
+        module.section(&types);
+
+        let mut functions = FunctionSection::new();
+        for (i, _) in helpers.functions.iter().enumerate() {
+            functions.function(1 + i as u32);
+        }
+        functions.function(0);
+        module.section(&functions);
+
+        let mut memories = MemorySection::new();
+        memories.memory(MemoryType {
+            minimum: 1,
+            maximum: None,
+            memory64: false,
+            shared: false,
+            page_size_log2: None,
+        });
+        module.section(&memories);
+
+        let mut exports = ExportSection::new();
+        exports.export("f", ExportKind::Func, n_helpers);
+        exports.export("memory", ExportKind::Memory, 0);
+        module.section(&exports);
+
+        let mut code = CodeSection::new();
+        for hf in &helpers.functions {
+            code.function(&hf.body);
+        }
+        // wrapper: forward (data_off, count, index) to the helper-under-test.
+        let mut wrapper = Function::new([]);
+        wrapper.instruction(&Instruction::LocalGet(0));
+        wrapper.instruction(&Instruction::LocalGet(1));
+        wrapper.instruction(&Instruction::LocalGet(2));
+        wrapper.instruction(&Instruction::Call(target));
+        wrapper.instruction(&Instruction::End);
+        code.function(&wrapper);
+        module.section(&code);
+
+        // Seed the table at TABLE_BASE as interleaved f64 LE x,y pairs.
+        let mut bytes: Vec<u8> = Vec::with_capacity(knots.len() * 16);
+        for &(x, y) in knots {
+            bytes.extend_from_slice(&x.to_le_bytes());
+            bytes.extend_from_slice(&y.to_le_bytes());
+        }
+        let mut data = DataSection::new();
+        data.active(0, &ConstExpr::i32_const(TABLE_BASE as i32), bytes);
+        module.section(&data);
+
+        module.finish()
+    }
+
+    /// Run the emitted lookup helper for `mode` over `knots` at `index` under
+    /// the DLR-FT interpreter. The module is (re)built per call; the tables are
+    /// tiny (a handful of knots) so this stays well under the per-test budget.
+    fn run_lookup(mode: Mode, knots: &[(f64, f64)], index: f64) -> f64 {
+        let bytes = build_lookup_module(mode, knots);
+        let info = validate(&bytes).expect("lookup module must validate");
+        let mut store = Store::new(());
+        let module = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("lookup module must instantiate")
+            .module_addr;
+        let f = store
+            .instance_export(module, "f")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        store
+            .invoke_simple_typed::<(i32, i32, f64), f64>(
+                f,
+                (TABLE_BASE as i32, knots.len() as i32, index),
+            )
+            .expect("invocation must succeed")
+    }
+
+    /// The VM oracle for `mode` (the exact function the helper reproduces).
+    fn vm_lookup(mode: Mode, knots: &[(f64, f64)], index: f64) -> f64 {
+        match mode {
+            Mode::Interp => crate::vm::lookup(knots, index),
+            Mode::Forward => crate::vm::lookup_forward(knots, index),
+            Mode::Backward => crate::vm::lookup_backward(knots, index),
+        }
+    }
+
+    /// Assert the emitted helper agrees bit-for-bit with the VM oracle at
+    /// `index` (NaN compares as NaN). The interp helper routes its at-knot test
+    /// through the same `approx_eq` the VM uses, and neither helper does any
+    /// transcendental math, so equality is exact -- not within a tolerance.
+    fn assert_matches_vm(mode: Mode, knots: &[(f64, f64)], index: f64) {
+        let got = run_lookup(mode, knots, index);
+        let want = vm_lookup(mode, knots, index);
+        if want.is_nan() {
+            assert!(
+                got.is_nan(),
+                "{mode:?} lookup at index {index}: expected NaN, got {got}"
+            );
+        } else {
+            assert_eq!(
+                got, want,
+                "{mode:?} lookup at index {index}: got {got}, want {want}"
+            );
+        }
+    }
+
+    /// A monotonic-x table with non-uniform spacing and a non-monotone y, so
+    /// interpolation, forward, and backward all give distinguishable results.
+    const TABLE: &[(f64, f64)] = &[
+        (0.0, 10.0),
+        (1.0, 20.0),
+        (2.5, 5.0),
+        (4.0, 40.0),
+        (10.0, 0.0),
+    ];
+
+    /// A representative set of probe indices spanning every regime: below
+    /// range, exactly on each knot, strictly between each pair of knots, and
+    /// above range. Shared by all three modes (each mode's oracle defines the
+    /// right answer).
+    fn probe_indices(knots: &[(f64, f64)]) -> Vec<f64> {
+        let mut idx = vec![knots[0].0 - 5.0, knots[knots.len() - 1].0 + 5.0];
+        for w in knots.windows(2) {
+            let (a, b) = (w[0].0, w[1].0);
+            idx.push(a); // on a knot
+            idx.push((a + b) / 2.0); // strictly between
+            // a point near but not on the knot, to exercise the approx_eq edge
+            idx.push(a + (b - a) * 1e-3);
+        }
+        idx.push(knots[knots.len() - 1].0); // the final knot
+        idx
+    }
+
+    #[test]
+    fn lookup_interp_matches_vm_over_domain() {
+        for &index in &probe_indices(TABLE) {
+            assert_matches_vm(Mode::Interp, TABLE, index);
+        }
+    }
+
+    #[test]
+    fn lookup_forward_matches_vm_over_domain() {
+        for &index in &probe_indices(TABLE) {
+            assert_matches_vm(Mode::Forward, TABLE, index);
+        }
+    }
+
+    #[test]
+    fn lookup_backward_matches_vm_over_domain() {
+        for &index in &probe_indices(TABLE) {
+            assert_matches_vm(Mode::Backward, TABLE, index);
+        }
+    }
+
+    #[test]
+    fn lookup_all_modes_below_and_above_range() {
+        // The edge clamps differ (interp strict, forward/backward inclusive) but
+        // all three return the boundary y for an out-of-range index; assert each
+        // against its own oracle so the strict-vs-inclusive distinction is
+        // exercised at the boundary itself.
+        for mode in [Mode::Interp, Mode::Forward, Mode::Backward] {
+            assert_matches_vm(mode, TABLE, -100.0); // below x[0]
+            assert_matches_vm(mode, TABLE, 1000.0); // above x[n-1]
+            assert_matches_vm(mode, TABLE, TABLE[0].0); // exactly x[0]
+            assert_matches_vm(mode, TABLE, TABLE[TABLE.len() - 1].0); // exactly x[n-1]
+        }
+    }
+
+    #[test]
+    fn lookup_single_point_table() {
+        // A one-knot table: every index clamps to that knot's y for all modes.
+        let single: &[(f64, f64)] = &[(3.0, 7.0)];
+        for mode in [Mode::Interp, Mode::Forward, Mode::Backward] {
+            for &index in &[-1.0, 3.0, 3.0 - 1e-9, 3.0 + 1e-9, 100.0] {
+                assert_matches_vm(mode, single, index);
+            }
+        }
+    }
+
+    #[test]
+    fn lookup_backward_duplicate_x_returns_last() {
+        // Duplicate x-values: backward must return the y of the LAST knot with
+        // that x (the upper-bound search lands past every equal x, then steps
+        // back one). The interp/forward modes are also checked for consistency
+        // with their own oracle on the same table.
+        let dup: &[(f64, f64)] = &[
+            (0.0, 0.0),
+            (2.0, 10.0),
+            (2.0, 20.0),
+            (2.0, 30.0),
+            (5.0, 50.0),
+        ];
+        // Exactly on the duplicated x, and just inside either side of it.
+        for &index in &[2.0, 1.999, 2.001, 0.0, 5.0, 3.5] {
+            assert_matches_vm(Mode::Backward, dup, index);
+            assert_matches_vm(Mode::Forward, dup, index);
+            assert_matches_vm(Mode::Interp, dup, index);
+        }
+    }
+
+    #[test]
+    fn lookup_nan_index_returns_nan_all_modes() {
+        for mode in [Mode::Interp, Mode::Forward, Mode::Backward] {
+            assert!(
+                run_lookup(mode, TABLE, f64::NAN).is_nan(),
+                "{mode:?} lookup of a NaN index must be NaN"
+            );
+        }
+    }
+
+    #[test]
+    fn lookup_empty_table_returns_nan_all_modes() {
+        // count == 0 -> NaN for every mode (matching the VM's table.is_empty()).
+        // The wrapper passes count = 0; data_off is irrelevant (never read).
+        for mode in [Mode::Interp, Mode::Forward, Mode::Backward] {
+            assert!(
+                run_lookup(mode, &[], 1.0).is_nan(),
+                "{mode:?} lookup of an empty table must be NaN"
+            );
+        }
+    }
+
+    #[test]
+    fn lookup_interp_exact_knot_uses_approx_eq() {
+        // The interp helper returns y[i] exactly when approx_eq(x[i], index),
+        // matching the VM. A one-ULP-perturbed index at a knot is approx-equal,
+        // so it must return that knot's y exactly (NOT an interpolated value).
+        // The VM oracle encodes the same approx_eq decision.
+        let knot_x = TABLE[2].0; // 2.5
+        let perturbed = f64::from_bits(knot_x.to_bits() + 1);
+        assert_matches_vm(Mode::Interp, TABLE, perturbed);
+        // And the exact knot returns its y exactly.
+        let got = run_lookup(Mode::Interp, TABLE, knot_x);
+        assert_eq!(got, TABLE[2].1, "interp at the exact knot returns y[i]");
+    }
+}
diff --git a/src/simlin-engine/src/wasmgen/lower.rs b/src/simlin-engine/src/wasmgen/lower.rs
new file mode 100644
index 000000000..8cb7ba378
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/lower.rs
@@ -0,0 +1,3256 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+// pattern: Functional Core
+// Pure transformation: bytecode + layout data in, wasm `Function` bodies /
+// instruction sequences out. No I/O; the only side effect is in `#[cfg(test)]`,
+// which executes the emitted modules under the DLR-FT interpreter.
+
+//! Lowering of the bytecode VM's scalar-core opcode set to WebAssembly.
+//!
+//! The runtime data model mirrors the bytecode VM (`crate::vm`): all variable
+//! values live in one flat f64 "slab" in linear memory, addressed by slot
+//! offset. A model runs over two chunks at a time -- `curr` (the values at the
+//! current timestep) and `next` (the values being computed for the following
+//! timestep). `LoadVar` reads from `curr`; `AssignCurr`/`AssignNext` store into
+//! `curr`/`next`.
+//!
+//! Each `Opcode` lowers to a short, mostly 1:1 wasm instruction sequence over
+//! the wasm operand stack, reproducing the matching arm of `eval_bytecode`
+//! (`vm.rs:1257+`).
+//!
+//! Three compound assignment opcodes beyond the bare scalar set reach a
+//! `CompiledSimulation` consumer, and they all lower here:
+//! - `AssignConstCurr` arrives by *two* routes: `compiler::codegen` emits it
+//!   directly for any constant-RHS `AssignCurr` (`codegen.rs:1164`), and the
+//!   **peephole** pass also fuses a `LoadConstant; AssignCurr` pair into it
+//!   (`bytecode.rs:1830`). Either way it rides through the symbolic layer into
+//!   `CompiledSimulation`; every model with a constant initial/aux carries it.
+//! - `BinOpAssignCurr` / `BinOpAssignNext` are *only* peephole output
+//!   (`bytecode.rs:1837`/`1841`, fusing `Op2; Assign{Curr,Next}`). The peephole
+//!   pass (`ByteCode::peephole_optimize`, run inside
+//!   `Module::compile`/`ByteCodeBuilder::finish`) runs per-variable-fragment in
+//!   the incremental pipeline *before* symbolization, so these ride through
+//!   too. Every scalar Euler stock integration (`stock + delta`) is one, so
+//!   they are part of the scalar core.
+//!
+//! The late **3-address** pass (`ByteCode::fuse_three_address`) instead runs
+//! only on the VM's private execution copy (`vm.rs:395-398`), so its
+//! `BinVarVar` / `AssignAddVarVarCurr` / ... family never reaches a consumer.
+//!
+//! Anything outside the supported scalar core -- an array/module/lookup opcode
+//! or a late-fusion superinstruction that somehow appeared -- returns
+//! `WasmGenError::Unsupported` rather than emitting a wrong module. (Every
+//! `Op2` variant, including `Mod`/`Exp`, is supported as of Phase 2.)
+//!
+//! ## Emitted helper functions
+//!
+//! Equality and truthiness route through a single emitted wasm helper,
+//! `approx_eq(a: f64, b: f64) -> i32`, that reproduces `crate::float::approx_eq`
+//! (`float_cmp` 0.10 defaults) bit-faithfully, so the backend takes the same
+//! branch the VM does. Helper functions are assembled into the module ahead of
+//! the per-program functions ([`build_helpers`] returns their bodies and a
+//! [`HelperFns`] index registry); `module.rs` places them at function indices
+//! `0..N` and the per-program + `run` functions at `N..`. `emit_bytecode`
+//! references a helper by its stable index (held in [`EmitCtx::helpers`]) via a
+//! `call`. Subcomponent B (the transcendental + `pulse` helpers) and later
+//! phases extend this by adding a field to [`HelperFns`] and pushing the
+//! corresponding body in [`build_helpers`]; no helper index is hard-coded
+//! elsewhere, so the per-program offset adjusts automatically.
+
+use std::collections::HashMap;
+
+use wasm_encoder::{Function, Instruction, MemArg, ValType};
+
+use crate::bytecode::{
+    BuiltinId, ByteCode, ByteCodeContext, GraphicalFunctionId, LookupMode, Op2, Opcode,
+};
+use crate::vm::{StepPart, make_module_key};
+
+use super::WasmGenError;
+use super::views::ElementAddr;
+use super::views::{ViewBase, ViewDesc};
+
+/// Bytes per f64 slot.
+pub(crate) const SLOT_SIZE: u32 = 8;
+/// Alignment exponent for an 8-byte f64 access (log2(8)).
+const F64_ALIGN: u32 = 3;
+/// Bytes per GF directory entry (two i32: data byte offset + point count). Must
+/// match `module.rs`'s `GF_DIRECTORY_ENTRY_BYTES`, the layout the `Lookup`
+/// opcode reads.
+pub(crate) const GF_DIRECTORY_ENTRY_BYTES: i32 = 8;
+
+/// Compile-time context for lowering a scalar opcode program over the f64 slab.
+///
+/// `curr_base`/`next_base` are byte offsets of slot 0 of each chunk within the
+/// linear memory. `module_off_local` is the wasm local index holding this
+/// instance's `module_off` (the slot base of the module instance within a
+/// chunk); the per-program functions take it as their single `i32` parameter.
+/// In Phase 1 the root is the only module so `module_off` is always 0, but
+/// emitting with it from the start avoids a Phase 7 rewrite.
+pub(crate) struct EmitCtx<'a> {
+    pub curr_base: u32,
+    pub next_base: u32,
+    /// Byte offset of the GF directory region (8 bytes/entry, indexed by global
+    /// table index: `(data_byte_offset: i32, n_points: i32)`). The `Lookup`
+    /// opcode reads `directory_base + table_idx*8` to map a table index to its
+    /// data location. Both bases are run-invariant: every per-program function
+    /// reads the same read-only GF regions.
+    pub gf_directory_base: u32,
+    /// Byte offset of the GF data region (every table's `(x,y)` knots as f64 LE
+    /// pairs, concatenated). Retained for completeness/Phase-7 reuse; the
+    /// per-table absolute data offset the `Lookup` opcode passes to a helper is
+    /// read from the directory, so opcode lowering does not consult this field.
+    #[allow(dead_code)]
+    pub gf_data_base: u32,
+    /// Byte offset of slot 0 of the `initial_values` snapshot region (n_slots
+    /// wide). `LoadInitial` reads `initial_values[module_off + off]` when the
+    /// program being emitted is *not* the initials program. Mirrors the VM's
+    /// `initial_values` buffer (`vm.rs:617`).
+    pub initial_values_base: u32,
+    /// Byte offset of slot 0 of the `prev_values` snapshot region (n_slots
+    /// wide). `LoadPrev` reads `prev_values[module_off + off]` once the snapshot
+    /// has been taken. Mirrors the VM's `prev_values` buffer.
+    pub prev_values_base: u32,
+    /// Index of the mutable i32 wasm global `use_prev_fallback` (init 1).
+    /// `LoadPrev` gates on it: while set, it yields the caller-supplied fallback
+    /// rather than reading `prev_values`. The flag -- not a `TIME == start`
+    /// comparison -- is the sole gate, because RK stages move `curr[TIME]` to
+    /// trial points before the first snapshot is taken (`vm.rs:1314-1327`).
+    pub use_prev_fallback_global: u32,
+    /// Which opcode program is being lowered. `LoadInitial` resolves its
+    /// "during Initials read `curr`, else read `initial_values`" branch
+    /// (`vm.rs:1332-1340`) at compile time from this, since the emitter knows
+    /// the program statically.
+    pub step_part: StepPart,
+    // dt/start_time/final_time are the run-invariant time globals that back the
+    // seeds `run` writes into the TIME/DT/INITIAL_TIME/FINAL_TIME memory slots.
+    // Opcode lowering reads those values from memory via `LoadGlobalVar` (slots
+    // 0..4) rather than from these fields -- the XMILE time builtins lower to
+    // `LoadGlobalVar`, and the time-driven `Apply` arms (Step/Ramp/Pulse) read
+    // TIME/DT from memory -- so the fields stay unused here. They are retained
+    // because a later phase may fold them into compile-time constants.
+    #[allow(dead_code)]
+    pub dt: f64,
+    #[allow(dead_code)]
+    pub start_time: f64,
+    #[allow(dead_code)]
+    pub final_time: f64,
+    /// wasm local index holding this instance's `module_off` (i32).
+    pub module_off_local: u32,
+    /// wasm local index of a scratch f64, used by `AssignCurr`/`AssignNext` to
+    /// hold the value while the store address is pushed under it.
+    pub scratch_local: u32,
+    /// wasm local indices reserved for the `SetCond`/`If` condition register.
+    /// Used as a stack: `SetCond` writes the top, `If` reads (and pops) it.
+    /// Sized to the program's maximum `If` nesting depth (see
+    /// [`max_condition_depth`]).
+    pub condition_locals: Vec<u32>,
+    /// Three dedicated scratch f64 local indices `[a, b, c]` for the `Apply`
+    /// opcode, which always pops exactly three operands (codegen pads). They
+    /// are distinct from [`scratch_local`](Self::scratch_local) and the
+    /// [`condition_locals`](Self::condition_locals) so an `Apply` inside an
+    /// `If` arm (sharing the function) cannot clobber the condition register.
+    /// Reserved unconditionally by the function builders (3 unused f64 locals
+    /// in a non-`Apply` function are free).
+    pub apply_locals: [u32; 3],
+    /// Function indices of the module's emitted helper functions, so
+    /// value-producing opcodes that need the VM's `approx_eq`/transcendental
+    /// semantics can `call` them. The same registry is shared by every
+    /// per-program function in a module.
+    pub helpers: HelperFns,
+    /// Byte offset of slot 0 of the `temp_storage` region (`temp_total_size`
+    /// f64 wide). The array view machinery addresses temp element `index` of
+    /// temp `temp_id` at `temp_storage_base + (temp_offsets[temp_id] + index)*8`,
+    /// mirroring the VM's `temp_storage[temp_offsets[temp_id] + index]`
+    /// (`vm.rs:584-586`).
+    pub temp_storage_base: u32,
+    /// First wasm local index reserved for the dynamic-subscript scratch i32
+    /// locals (Task 4): the runtime-offset addend and validity flag a
+    /// `ViewSubscriptDynamic` / `PushSubscriptIndex` accumulation draws from. The
+    /// function's local declarations reserve `count_extra_i32_locals(bc)` i32s
+    /// starting here, past the scratch f64 / condition i32s / `Apply` f64s / the
+    /// fixed vector-op i32 scratch block, so these never overlap
+    /// [`apply_locals`](Self::apply_locals) or
+    /// [`vector_i32_locals`](Self::vector_i32_locals). A program with no dynamic
+    /// subscripts reserves none and this base is unused.
+    pub extra_i32_local_base: u32,
+    /// The fixed [`VECTOR_F64_LOCAL_COUNT`] scratch f64 local indices the Phase-6
+    /// vector opcodes draw from (`VectorSelect`'s reduction accumulators, the
+    /// per-element value scratch). Reserved unconditionally by the function
+    /// builders; a non-vector function's unused f64 locals are free.
+    pub vector_f64_locals: [u32; VECTOR_F64_LOCAL_COUNT as usize],
+    /// The fixed [`VECTOR_I32_LOCAL_COUNT`] scratch i32 local indices the Phase-6
+    /// vector opcodes draw from (`VectorSelect`'s action/count/reduce-index,
+    /// `Rank`'s ascending flag + runtime store address, `VectorElmMap`'s runtime
+    /// flat index). Reserved unconditionally by the function builders (a
+    /// non-vector function's unused i32 locals are free) and placed before the
+    /// dynamic-subscript [`extra_i32_local_base`], so it never disturbs the
+    /// `apply_locals` indices.
+    pub vector_i32_locals: [u32; VECTOR_I32_LOCAL_COUNT as usize],
+    /// Byte offset of slot 0 of the vector-op scratch region. `VectorSelect`
+    /// collects its selected expr values here (`size` f64 worst case); the
+    /// `stable_sort` helper (`VectorSortOrder`/`Rank`) sorts `(value, idx)` pairs
+    /// here (`2 * size` f64). The two uses are never live simultaneously within a
+    /// single opcode, so they share the region. Sized by `module.rs` to the
+    /// largest view a vector op could process; the test harness sets a fixed
+    /// high offset within its single memory page.
+    pub vector_scratch_base: u32,
+    /// Byte offset of slot 0 of the allocation scratch region. The Phase-6
+    /// `AllocateAvailable`/`AllocateByPriority` arms stage the gathered request
+    /// values, the per-requester profile tuples (4 f64 each), and the output
+    /// allocations here before/after `call`ing the `allocate_available` helper.
+    /// The three sub-regions (`requests` (n) ++ `profiles` (4n) ++ `out` (n))
+    /// are laid out consecutively and are all live across the helper call.
+    /// Sized by `module.rs` to `6 * max(temp_total_size, n_slots)` f64 (a
+    /// requester count is bounded by a view's element count); reserved
+    /// unconditionally (a model without allocators never reads it). The test
+    /// harness sets a fixed high offset within its single memory page.
+    pub alloc_scratch_base: u32,
+    /// First wasm local index of the `EvalModule` reverse-pop scratch f64s (Phase
+    /// 7). An `EvalModule { n_inputs }` pops its `n_inputs` operands into the first
+    /// `n_inputs` of these (in reverse, matching the VM's `for j in
+    /// (0..n_inputs).rev()`), then pushes `child_module_off` followed by them in
+    /// order before the child `call`. Sized by `module.rs` to the max `n_inputs`
+    /// over the program's `EvalModule` sites; 0 (and unused) for a program with no
+    /// submodule instantiation. See [`module_input_scratch_base`].
+    pub module_input_scratch_base: u32,
+    /// Byte offset of slot 0 of the constants-override region (Phase 7 Task 2),
+    /// an `n_slots`-wide f64 region indexed by *absolute* slab offset and
+    /// initialized to the compiled-default literals at every overridable slot. A
+    /// redirected `AssignConstCurr { off }` (one whose `off` is in
+    /// [`flows_const_offsets`](Self::flows_const_offsets)) sources its value from
+    /// `const_region_base + (module_off + off) * 8` instead of an immediate
+    /// `f64.const`, so the exported `set_value` override takes effect every step
+    /// -- exactly as the VM mutating the bytecode literal does (`vm.rs:994-1008`).
+    /// Indexing by absolute slot (the same `module_off`-relative addressing the
+    /// slab uses) is what lets one shared `CompiledModule` running at several
+    /// `module_off`s pick up each instance's distinct override.
+    pub const_region_base: u32,
+    /// The set of *relative* offsets this instance's module assigns via an
+    /// `AssignConstCurr` in its flows phase -- i.e. the overridable constants of
+    /// this module (mirroring `collect_constant_info`'s flows-only overridability
+    /// rule, `vm.rs:436-450`, computed per module so it is compile-time even for a
+    /// shared module run at several offsets). An `AssignConstCurr { off }` in *any*
+    /// phase (initials/flows/stocks) whose `off` is in this set sources from the
+    /// constants region; one whose `off` is absent emits its immediate literal
+    /// unchanged. This matches the VM applying the override at every location of an
+    /// overridable offset (`collect_constant_info` collects flows + stocks +
+    /// initials locations for each flows-overridable offset).
+    pub flows_const_offsets: &'a std::collections::HashSet<u16>,
+    /// Resolves an `EvalModule { id }` site to the child instance's wasm function
+    /// index for the program being emitted: `module_fn_index[(child_key, part)]`,
+    /// where `child_key = make_module_key(&ctx.modules[id].model_name,
+    /// &ctx.modules[id].input_set)` and `part == step_part`. Built once by
+    /// `module.rs` before any program function is emitted (the module
+    /// instantiation graph is acyclic, so every child index exists by the time its
+    /// caller is emitted). Empty for a single-module (no-submodule) program.
+    pub module_fn_index: &'a HashMap<(crate::vm::ModuleKey, StepPart), u32>,
+    /// The module instance's `ByteCodeContext`, holding the compile-time array
+    /// tables the view opcodes reference by index (`static_views`, `dim_lists`,
+    /// `dimensions`, `subdim_relations`, `temp_offsets`) *and* the `modules`
+    /// declaration table the `EvalModule` arm resolves child keys from. This is
+    /// the *per-instance* context (Phase 7): each instance's functions are emitted
+    /// with its own context, so an `EvalModule`'s `ctx.modules[id]` and the array
+    /// tables refer to the instance whose function is being lowered.
+    pub ctx: &'a ByteCodeContext,
+}
+
+// Reserved global slots (absolute, module-independent), mirroring `crate::vm`.
+// `Apply` reads `curr[TIME_OFF]` / `curr[DT_OFF]` for the time-driven builtins.
+const TIME_OFF: u16 = 0;
+const DT_OFF: u16 = 1;
+
+pub(crate) fn memarg(addr: u64) -> MemArg {
+    MemArg {
+        offset: addr,
+        align: F64_ALIGN,
+        memory_index: 0,
+    }
+}
+
+/// `.into()` keeps this robust to whether `wasm-encoder`'s `F64Const` field is
+/// a bare `f64` or an `Ieee64` wrapper across versions.
+pub(crate) fn f64_const(v: f64) -> Instruction<'static> {
+    Instruction::F64Const(v.into())
+}
+
+// ============================================================================
+// Emitted helper functions
+// ============================================================================
+
+/// Function indices of a module's emitted helper functions.
+///
+/// Helpers occupy the module's first function slots (`0..N`), so their indices
+/// are fixed and known before any per-program function is emitted. This is what
+/// lets a value-producing opcode in `emit_bytecode` reference a helper by index
+/// (`call`). [`build_helpers`] both emits the bodies and assigns these indices,
+/// keeping the index assignment and the body emission in one place.
+///
+/// To add a helper (Subcomponent B's transcendentals + `pulse`, later phases'
+/// lookup/array/allocation helpers): add a field here and push its body in
+/// [`build_helpers`], assigning the field from the pre-push `functions.len()`.
+/// Do not hard-code a helper's index anywhere else.
+#[derive(Clone, Copy)]
+pub(crate) struct HelperFns {
+    /// `approx_eq(a: f64, b: f64) -> i32` (1 = approximately equal, else 0),
+    /// reproducing `crate::float::approx_eq` (`float_cmp` 0.10 defaults).
+    pub approx_eq: u32,
+    /// `mod_euclid(l: f64, r: f64) -> f64`, reproducing `f64::rem_euclid` (the
+    /// VM's `Op2::Mod`): a result in `[0, |r|)`. A self-contained helper (rather
+    /// than an inline sequence) because the euclidean remainder needs both
+    /// operands live across several uses, exceeding the single assign-scratch
+    /// local available to `emit_op2`.
+    pub mod_euclid: u32,
+    /// `pulse(time, dt, volume, first_pulse, interval) -> f64`, reproducing the
+    /// VM's `pulse` (`vm.rs:3036`) including its `while` loop. A helper because
+    /// of the loop (an inline expansion would need a wasm `loop`/`br_if` in the
+    /// middle of `Apply`'s operand handling).
+    pub pulse: u32,
+    /// Open-coded transcendental helpers (`super::math`), each `(f64) -> f64`
+    /// except [`pow`](Self::pow) which is `(f64, f64) -> f64`. The bodies are
+    /// emitted in `super::math`; the composed ones (`tan`/`asin`/`acos`/
+    /// `log10`/`pow`) `call` the leaf ones by the indices recorded here, so the
+    /// leaves are pushed first in [`build_helpers`]. `pow` is consumed by
+    /// `Op2::Exp`; the rest by the `Apply` arm.
+    pub exp: u32,
+    pub ln: u32,
+    pub sin: u32,
+    pub cos: u32,
+    pub tan: u32,
+    pub atan: u32,
+    pub asin: u32,
+    pub acos: u32,
+    pub log10: u32,
+    pub pow: u32,
+    /// Graphical-function lookup helpers (`super::lookup`), each
+    /// `(data_off: i32, count: i32, index: f64) -> f64`, reproducing the VM's
+    /// `lookup`/`lookup_forward`/`lookup_backward` (`vm.rs:3055-3186`). The
+    /// `Lookup` opcode (`emit_bytecode`) reads `(data_off, count)` from the GF
+    /// directory and `call`s the mode's helper. [`lookup_interp`](Self::lookup_interp)
+    /// `call`s [`approx_eq`](Self::approx_eq) for its at-knot exact-hit test, so
+    /// `approx_eq` is pushed before it in [`build_helpers`].
+    pub lookup_interp: u32,
+    pub lookup_forward: u32,
+    pub lookup_backward: u32,
+    /// `stable_sort(pairs_ptr: i32, n: i32, ascending: i32) -> ()`
+    /// (`super::vector`), an in-place stable comparison sort of `n` `(value: f64,
+    /// idx: f64)` pairs by `value`, used by `VectorSortOrder`/`Rank`. A runtime
+    /// loop (insertion sort) -- never unrolled -- because the element count is a
+    /// runtime view size and an unrolled O(n^2) body would blow up. NaN
+    /// comparisons sort as `Equal` (the comparison is a strict `f64.lt`/`f64.gt`,
+    /// which is false for NaN), reproducing the VM's stable
+    /// `sort_by(partial_cmp(..).unwrap_or(Equal))`.
+    pub stable_sort: u32,
+    /// Allocation helpers (`super::alloc`), porting `crate::alloc`
+    /// bit-faithfully for the `AllocateAvailable`/`AllocateByPriority` opcodes:
+    /// - `erfc_approx(z: f64) -> f64` (Abramowitz-Stegun 26.2.17; `call`s
+    ///   [`exp`](Self::exp) for the `(-z*z).exp()` factor),
+    /// - `normal_cdf(x: f64) -> f64` (`0.5 * erfc_approx(-x / SQRT_2)`; `call`s
+    ///   [`erfc_approx`](Self::erfc_approx)),
+    /// - `alloc_curve(p, request, ptype, ppriority, pwidth, pextra) -> f64`
+    ///   (all six `ptype % 10` curve branches + the `ptype >= 10` floor flag;
+    ///   `call`s [`normal_cdf`](Self::normal_cdf)/[`exp`](Self::exp)/
+    ///   [`pow`](Self::pow)),
+    /// - `allocate_available(requests_ptr: i32, n: i32, profiles_ptr: i32,
+    ///   avail: f64, out_ptr: i32) -> ()` -- the bisection market-clearing solve
+    ///   over scratch memory (a runtime loop; never unrolled), `call`s
+    ///   [`alloc_curve`](Self::alloc_curve).
+    ///
+    /// Pushed after `exp`/`pow`/`erfc_approx`/`normal_cdf`/`alloc_curve` (in that
+    /// dependency order) in [`build_helpers`], so each inter-helper `call`
+    /// resolves to an already-recorded index.
+    ///
+    /// `erfc_approx`/`normal_cdf`/`alloc_curve` are only consumed *during*
+    /// helper construction (each is passed to the next helper's emitter so its
+    /// `call` resolves) and by the `#[cfg(test)]` parity harness; only
+    /// `allocate_available` is `call`ed from an opcode arm. They are kept as
+    /// named registry fields for discoverability and so the tests can target
+    /// each helper by index, mirroring the rest of `HelperFns`.
+    #[allow(dead_code)]
+    pub erfc_approx: u32,
+    #[allow(dead_code)]
+    pub normal_cdf: u32,
+    #[allow(dead_code)]
+    pub alloc_curve: u32,
+    pub allocate_available: u32,
+}
+
+/// One emitted helper function: its signature (so the assembler can register a
+/// wasm type for it) and its body (the terminating `End` is included).
+pub(crate) struct HelperFn {
+    pub params: Vec<ValType>,
+    pub results: Vec<ValType>,
+    pub body: Function,
+}
+
+/// The emitted helper functions plus the [`HelperFns`] index registry that
+/// names them. `functions[i]` is the body for function index `i`.
+pub(crate) struct BuiltHelpers {
+    pub fns: HelperFns,
+    pub functions: Vec<HelperFn>,
+}
+
+/// Emit every helper function a module needs, assigning each a stable function
+/// index starting at 0.
+///
+/// The returned [`HelperFns`] records the indices; the caller (`module.rs`)
+/// places `functions` at module function indices `0..functions.len()` and emits
+/// the per-program + `run` functions after them, threading [`BuiltHelpers::fns`]
+/// into each [`EmitCtx`].
+pub(crate) fn build_helpers() -> BuiltHelpers {
+    let mut functions: Vec<HelperFn> = Vec::new();
+
+    // Push a `(f64, ...) -> f64`-shaped helper and return its assigned index.
+    // The index is `functions.len()` *before* the push, so it stays valid no
+    // matter how many helpers precede it. Used for every transcendental.
+    let push_unary = |functions: &mut Vec<HelperFn>, body: Function| -> u32 {
+        let idx = functions.len() as u32;
+        functions.push(HelperFn {
+            params: vec![ValType::F64],
+            results: vec![ValType::F64],
+            body,
+        });
+        idx
+    };
+
+    let approx_eq = functions.len() as u32;
+    functions.push(HelperFn {
+        params: vec![ValType::F64, ValType::F64],
+        results: vec![ValType::I32],
+        body: emit_approx_eq(),
+    });
+
+    let mod_euclid = functions.len() as u32;
+    functions.push(HelperFn {
+        params: vec![ValType::F64, ValType::F64],
+        results: vec![ValType::F64],
+        body: emit_mod_euclid(),
+    });
+
+    let pulse = functions.len() as u32;
+    functions.push(HelperFn {
+        params: vec![
+            ValType::F64,
+            ValType::F64,
+            ValType::F64,
+            ValType::F64,
+            ValType::F64,
+        ],
+        results: vec![ValType::F64],
+        body: emit_pulse(),
+    });
+
+    // Leaf transcendentals (no inter-helper calls).
+    let exp = push_unary(&mut functions, super::math::emit_exp());
+    let ln = push_unary(&mut functions, super::math::emit_ln());
+    let sin = push_unary(&mut functions, super::math::emit_sin());
+    let cos = push_unary(&mut functions, super::math::emit_cos());
+    let atan = push_unary(&mut functions, super::math::emit_atan());
+
+    // Composed transcendentals, referencing the leaves by their recorded index.
+    let tan = push_unary(&mut functions, super::math::emit_tan(sin, cos));
+    let asin = push_unary(&mut functions, super::math::emit_asin(atan));
+    let acos = push_unary(&mut functions, super::math::emit_acos(asin));
+    let log10 = push_unary(&mut functions, super::math::emit_log10(ln));
+
+    // `pow` is the only binary helper.
+    let pow = functions.len() as u32;
+    functions.push(HelperFn {
+        params: vec![ValType::F64, ValType::F64],
+        results: vec![ValType::F64],
+        body: super::math::emit_pow(exp, ln),
+    });
+
+    // GF lookup helpers, each `(data_off: i32, count: i32, index: f64) -> f64`.
+    // `lookup_interp` `call`s `approx_eq` (assigned above), so its body is built
+    // with that index.
+    let push_lookup = |functions: &mut Vec<HelperFn>, body: Function| -> u32 {
+        let idx = functions.len() as u32;
+        functions.push(HelperFn {
+            params: vec![ValType::I32, ValType::I32, ValType::F64],
+            results: vec![ValType::F64],
+            body,
+        });
+        idx
+    };
+    let lookup_interp = push_lookup(&mut functions, super::lookup::emit_lookup_interp(approx_eq));
+    let lookup_forward = push_lookup(&mut functions, super::lookup::emit_lookup_forward());
+    let lookup_backward = push_lookup(&mut functions, super::lookup::emit_lookup_backward());
+
+    // `stable_sort(pairs_ptr: i32, n: i32, ascending: i32) -> ()` -- the runtime
+    // insertion sort backing `VectorSortOrder`/`Rank` (`super::vector`).
+    let stable_sort = functions.len() as u32;
+    functions.push(HelperFn {
+        params: vec![ValType::I32, ValType::I32, ValType::I32],
+        results: vec![],
+        body: super::vector::emit_stable_sort(),
+    });
+
+    // Allocation helpers (`super::alloc`). Pushed in dependency order so each
+    // inter-helper `call` resolves to an already-recorded index:
+    // `erfc_approx` -> `exp`; `normal_cdf` -> `erfc_approx`; `alloc_curve` ->
+    // `normal_cdf`/`exp`/`pow`; `allocate_available` -> `alloc_curve`.
+    let erfc_approx = functions.len() as u32;
+    functions.push(HelperFn {
+        params: vec![ValType::F64],
+        results: vec![ValType::F64],
+        body: super::alloc::emit_erfc_approx(exp),
+    });
+    let normal_cdf = functions.len() as u32;
+    functions.push(HelperFn {
+        params: vec![ValType::F64],
+        results: vec![ValType::F64],
+        body: super::alloc::emit_normal_cdf(erfc_approx),
+    });
+    let alloc_curve = functions.len() as u32;
+    functions.push(HelperFn {
+        params: vec![
+            ValType::F64,
+            ValType::F64,
+            ValType::F64,
+            ValType::F64,
+            ValType::F64,
+            ValType::F64,
+        ],
+        results: vec![ValType::F64],
+        body: super::alloc::emit_alloc_curve(normal_cdf, exp, pow),
+    });
+    let allocate_available = functions.len() as u32;
+    functions.push(HelperFn {
+        params: vec![
+            ValType::I32,
+            ValType::I32,
+            ValType::I32,
+            ValType::F64,
+            ValType::I32,
+        ],
+        results: vec![],
+        body: super::alloc::emit_allocate_available(alloc_curve),
+    });
+
+    BuiltHelpers {
+        fns: HelperFns {
+            approx_eq,
+            mod_euclid,
+            pulse,
+            exp,
+            ln,
+            sin,
+            cos,
+            tan,
+            atan,
+            asin,
+            acos,
+            log10,
+            pow,
+            lookup_interp,
+            lookup_forward,
+            lookup_backward,
+            stable_sort,
+            erfc_approx,
+            normal_cdf,
+            alloc_curve,
+            allocate_available,
+        },
+        functions,
+    }
+}
+
+// `approx_eq` helper local layout. Params 0/1 are `a`/`b`; the rest are declared
+// i64 scratch locals.
+const AE_A: u32 = 0;
+const AE_B: u32 = 1;
+const AE_BITS: u32 = 2; // scratch for one operand's raw bits
+const AE_ORD_A: u32 = 3; // ordered(a)
+const AE_ORD_B: u32 = 4; // ordered(b)
+const AE_DIFF: u32 = 5; // ordered(a) - ordered(b)
+const AE_ABS: u32 = 6; // abs(diff) before saturation
+const AE_LOCAL_COUNT: u32 = 5; // declared i64 locals (indices 2..=6)
+
+/// Build the body of the `approx_eq(a: f64, b: f64) -> i32` helper, reproducing
+/// `crate::float::approx_eq` (`float_cmp` 0.10, `f64`, default margin
+/// `epsilon = f64::EPSILON`, `ulps = 4`) bit-faithfully.
+///
+/// The Rust reference (`float_cmp` `eq.rs`) is the short-circuiting OR of three
+/// total, trap-free checks (exact equality / ±inf, absolute-epsilon, ULP):
+///
+/// ```text
+/// a == b  ||  f64abs(a - b) <= f64::EPSILON  ||  saturating_abs(ulps(a, b)) <= 4
+/// ```
+///
+/// where `ulps(a, b) = ordered(a).wrapping_sub(ordered(b))` over `i64` and
+/// `ordered(f) = { let bits = f.to_bits() as i64; if bits < 0 { !bits } else
+/// { bits ^ i64::MIN } }` maps the sign-magnitude bit pattern to a monotonic
+/// integer. Because all three checks are pure and total (no division, no traps),
+/// evaluating them eagerly and OR-ing the i32 results is bit-identical to the
+/// Rust short-circuit; the fast path is only a performance shortcut, not a
+/// semantic difference. Notably this makes `approx_eq(NaN, NaN) == true`
+/// (identical bits -> 0 ULPs) and keeps the finite `crate::float::NA` sentinel
+/// distinct from ordinary values (its exponent is far from theirs).
+fn emit_approx_eq() -> Function {
+    use Instruction as Ins;
+    let mut f = Function::new([(AE_LOCAL_COUNT, ValType::I64)]);
+
+    // check 1: a == b -> i32
+    f.instruction(&Ins::LocalGet(AE_A));
+    f.instruction(&Ins::LocalGet(AE_B));
+    f.instruction(&Ins::F64Eq);
+
+    // check 2: f64.abs(a - b) <= f64::EPSILON -> i32
+    f.instruction(&Ins::LocalGet(AE_A));
+    f.instruction(&Ins::LocalGet(AE_B));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::F64Abs);
+    f.instruction(&f64_const(f64::EPSILON));
+    f.instruction(&Ins::F64Le);
+
+    // check 3: saturating_abs(ordered(a) - ordered(b)) <= 4 -> i32.
+    emit_ordered_bits(&mut f, AE_A, AE_BITS);
+    f.instruction(&Ins::LocalSet(AE_ORD_A));
+    emit_ordered_bits(&mut f, AE_B, AE_BITS);
+    f.instruction(&Ins::LocalSet(AE_ORD_B));
+
+    // diff = wrapping_sub(ordered_a, ordered_b)  (i64.sub wraps)
+    f.instruction(&Ins::LocalGet(AE_ORD_A));
+    f.instruction(&Ins::LocalGet(AE_ORD_B));
+    f.instruction(&Ins::I64Sub);
+    f.instruction(&Ins::LocalSet(AE_DIFF));
+
+    // abs = if diff < 0 { 0 - diff } else { diff }  (the wrapping negate; for
+    // diff == i64::MIN this stays negative, handled by the saturation below).
+    f.instruction(&Ins::I64Const(0));
+    f.instruction(&Ins::LocalGet(AE_DIFF));
+    f.instruction(&Ins::I64Sub); // 0 - diff
+    f.instruction(&Ins::LocalGet(AE_DIFF)); // [neg, diff]
+    f.instruction(&Ins::LocalGet(AE_DIFF));
+    f.instruction(&Ins::I64Const(0));
+    f.instruction(&Ins::I64LtS); // diff < 0
+    f.instruction(&Ins::Select); // neg if diff<0 else diff
+    f.instruction(&Ins::LocalSet(AE_ABS));
+
+    // sat = if abs < 0 { i64::MAX } else { abs }  (saturating_abs: the only abs
+    // still negative is the i64::MIN overflow, which saturates to i64::MAX).
+    f.instruction(&Ins::I64Const(i64::MAX));
+    f.instruction(&Ins::LocalGet(AE_ABS)); // [i64::MAX, abs]
+    f.instruction(&Ins::LocalGet(AE_ABS));
+    f.instruction(&Ins::I64Const(0));
+    f.instruction(&Ins::I64LtS); // abs < 0
+    f.instruction(&Ins::Select); // i64::MAX if abs<0 else abs
+
+    // sat <= 4 -> i32
+    f.instruction(&Ins::I64Const(4));
+    f.instruction(&Ins::I64LeS);
+
+    // Combine the three i32 booleans: (check1 | check2 | check3). Stack holds
+    // [c1, c2, c3]; two i32.or reduce it to one result.
+    f.instruction(&Ins::I32Or);
+    f.instruction(&Ins::I32Or);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+/// Append the wasm sequence that pushes `ordered(local)` onto the stack, where
+/// `ordered(f) = { let bits = f.to_bits() as i64; if bits < 0 { !bits } else
+/// { bits ^ i64::MIN } }` (float_cmp's sign-magnitude -> monotonic map). `bits`
+/// is materialized once into `bits_local` (i64) and reused for the two branch
+/// values and the sign test; `select` chooses between them. `i64::MIN` is the
+/// `1 << 63` sign mask as a signed `i64`, and `!bits` is `bits ^ -1`.
+fn emit_ordered_bits(f: &mut Function, src_local: u32, bits_local: u32) {
+    use Instruction as Ins;
+    f.instruction(&Ins::LocalGet(src_local));
+    f.instruction(&Ins::I64ReinterpretF64);
+    f.instruction(&Ins::LocalSet(bits_local));
+    // neg case: !bits = bits ^ -1
+    f.instruction(&Ins::LocalGet(bits_local));
+    f.instruction(&Ins::I64Const(-1));
+    f.instruction(&Ins::I64Xor);
+    // pos case: bits ^ i64::MIN  (flip the sign bit)
+    f.instruction(&Ins::LocalGet(bits_local));
+    f.instruction(&Ins::I64Const(i64::MIN));
+    f.instruction(&Ins::I64Xor);
+    // cond: bits < 0  (the sign bit is set)
+    f.instruction(&Ins::LocalGet(bits_local));
+    f.instruction(&Ins::I64Const(0));
+    f.instruction(&Ins::I64LtS);
+    // select(neg, pos, cond): neg if cond != 0 else pos
+    f.instruction(&Ins::Select);
+}
+
+// `mod_euclid` helper local layout. Params 0/1 are `l`/`r`; local 2 is the
+// truncated remainder `r0`.
+const ME_L: u32 = 0;
+const ME_R: u32 = 1;
+const ME_R0: u32 = 2;
+
+/// Build the body of `mod_euclid(l: f64, r: f64) -> f64`, reproducing
+/// `f64::rem_euclid` (the VM's `Op2::Mod`) exactly.
+///
+/// `rem_euclid` is `let r0 = l % r; if r0 < 0 { r0 + r.abs() } else { r0 }`,
+/// where the truncated remainder `l % r` is `l - r * (l / r).trunc()` (wasm has
+/// no `f64.rem`, so it is computed from `f64.div`/`f64.trunc`/`f64.mul`/
+/// `f64.sub`). The branch is a `select`. The result lies in `[0, |r|)` for a
+/// non-zero divisor; this trunc-then-adjust form is correct for negative
+/// divisors too (where a `floor`-based form would not be).
+fn emit_mod_euclid() -> Function {
+    use Instruction as Ins;
+    let mut f = Function::new([(1, ValType::F64)]);
+
+    // r0 = l - r * trunc(l / r)
+    f.instruction(&Ins::LocalGet(ME_L));
+    f.instruction(&Ins::LocalGet(ME_R));
+    f.instruction(&Ins::LocalGet(ME_L));
+    f.instruction(&Ins::LocalGet(ME_R));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::F64Trunc);
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalSet(ME_R0));
+
+    // select(r0 + |r|, r0, r0 < 0): the adjusted value when r0 is negative,
+    // else r0 unchanged. wasm `select` yields the deeper operand when the cond
+    // is true, so push `r0 + |r|` first.
+    f.instruction(&Ins::LocalGet(ME_R0));
+    f.instruction(&Ins::LocalGet(ME_R));
+    f.instruction(&Ins::F64Abs);
+    f.instruction(&Ins::F64Add);
+    f.instruction(&Ins::LocalGet(ME_R0));
+    f.instruction(&Ins::LocalGet(ME_R0));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::Select);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+// `pulse` helper local layout. Params 0..4 are time/dt/volume/first_pulse/
+// interval; local 5 is the running `next_pulse`.
+const PU_TIME: u32 = 0;
+const PU_DT: u32 = 1;
+const PU_VOLUME: u32 = 2;
+const PU_FIRST: u32 = 3;
+const PU_INTERVAL: u32 = 4;
+const PU_NEXT: u32 = 5;
+
+/// Build the body of `pulse(time, dt, volume, first_pulse, interval) -> f64`,
+/// reproducing the VM's `pulse` (`vm.rs:3036`) including its `while` loop.
+///
+/// ```text
+/// if time < first_pulse { return 0.0 }
+/// next_pulse = first_pulse
+/// loop {                              // while time >= next_pulse
+///     if time < next_pulse { break }
+///     if time < next_pulse + dt { return volume / dt }
+///     if interval <= 0.0 { break }
+///     next_pulse += interval
+/// }
+/// 0.0
+/// ```
+///
+/// The `while time >= next_pulse` head is realized as a `br $exit` when
+/// `time < next_pulse`, inside a `block $exit { loop $top { ... br $top } }`.
+fn emit_pulse() -> Function {
+    use Instruction as Ins;
+    use wasm_encoder::BlockType;
+    let mut f = Function::new([(1, ValType::F64)]);
+
+    // if time < first_pulse { return 0.0 }
+    f.instruction(&Ins::LocalGet(PU_TIME));
+    f.instruction(&Ins::LocalGet(PU_FIRST));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // next_pulse = first_pulse
+    f.instruction(&Ins::LocalGet(PU_FIRST));
+    f.instruction(&Ins::LocalSet(PU_NEXT));
+
+    // block $exit { loop $top { ... } }
+    f.instruction(&Ins::Block(BlockType::Empty));
+    f.instruction(&Ins::Loop(BlockType::Empty));
+
+    // while-head: if time < next_pulse { break }  (br depth 1 -> $exit)
+    f.instruction(&Ins::LocalGet(PU_TIME));
+    f.instruction(&Ins::LocalGet(PU_NEXT));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::BrIf(1));
+
+    // if time < next_pulse + dt { return volume / dt }
+    f.instruction(&Ins::LocalGet(PU_TIME));
+    f.instruction(&Ins::LocalGet(PU_NEXT));
+    f.instruction(&Ins::LocalGet(PU_DT));
+    f.instruction(&Ins::F64Add);
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&Ins::LocalGet(PU_VOLUME));
+    f.instruction(&Ins::LocalGet(PU_DT));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // else if interval <= 0.0 { break }  (br depth 1 -> $exit)
+    f.instruction(&Ins::LocalGet(PU_INTERVAL));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::BrIf(1));
+
+    // else next_pulse += interval ; continue (br depth 0 -> $top)
+    f.instruction(&Ins::LocalGet(PU_NEXT));
+    f.instruction(&Ins::LocalGet(PU_INTERVAL));
+    f.instruction(&Ins::F64Add);
+    f.instruction(&Ins::LocalSet(PU_NEXT));
+    f.instruction(&Ins::Br(0));
+
+    f.instruction(&Ins::End); // end loop
+    f.instruction(&Ins::End); // end block
+
+    // fell out of the loop -> 0.0
+    f.instruction(&f64_const(0.0));
+
+    f.instruction(&Ins::End); // end function
+    f
+}
+
+/// Push `call approx_eq` for two f64 operands already on the wasm stack
+/// (`[a, b]`); leaves an i32 (1 = approximately equal) on the stack. Mirrors a
+/// `crate::float::approx_eq(a, b)` call.
+fn emit_call_approx_eq(ctx: &EmitCtx, f: &mut Function) {
+    f.instruction(&Instruction::Call(ctx.helpers.approx_eq));
+}
+
+/// Push the i32 truthiness of the f64 already on the wasm stack, reproducing the
+/// VM's `is_truthy(n) = !approx_eq(n, 0.0)` (`vm.rs:89`): `approx_eq(n, 0.0)`
+/// gives `is_false`, and `i32.eqz` negates it to `is_truthy`.
+pub(crate) fn emit_is_truthy(ctx: &EmitCtx, f: &mut Function) {
+    f.instruction(&f64_const(0.0));
+    emit_call_approx_eq(ctx, f);
+    f.instruction(&Instruction::I32Eqz);
+}
+
+/// The maximum number of simultaneously-live `SetCond` condition registers a
+/// program needs.
+///
+/// `compiler::codegen` lowers an `Expr::If` by walking the *condition* sub-tree
+/// to completion before emitting the pair's own `SetCond`/`If`
+/// (`codegen.rs:1153-1159`: push `t`, push `f`, walk `cond`, then `SetCond`,
+/// `If`). So even when a condition itself contains a nested `If`, the inner
+/// pair is fully emitted before the outer `SetCond`, and the stream is
+/// *sequential* -- `SetCond If SetCond If` -- never interleaved. With current
+/// codegen the condition register therefore never needs to hold more than one
+/// live value (this returns 1 for any model with a conditional).
+///
+/// We still model the register as a LIFO stack and size it from the actual
+/// opcode stream rather than hard-coding 1: it costs one cheap pass, it is
+/// robust if codegen ever emits a genuinely interleaved pair, and it keeps the
+/// emitter's `SetCond`-pushes-/`If`-pops logic symmetric. The depth is computed
+/// here so the caller can reserve exactly that many wasm locals.
+/// Number of dedicated scratch f64 locals the `Apply` opcode reserves
+/// (`a`/`b`/`c`).
+const APPLY_LOCAL_COUNT: u32 = 3;
+
+/// Number of dedicated scratch f64 locals the Phase-6 vector opcodes reserve.
+/// `VectorSelect`'s single-pass reduction needs four running accumulators
+/// (sum/product/min/max) plus one to hold the current value; the other vector
+/// ops draw their f64 scratch from the same block. Reserved unconditionally --
+/// unused f64 locals in a non-vector function are free.
+pub(crate) const VECTOR_F64_LOCAL_COUNT: u32 = 5;
+
+/// Number of dedicated scratch i32 locals the Phase-6 vector opcodes reserve.
+/// `VectorSelect` uses one for the action selector, one for the selected-value
+/// count, and one for its reduce loop index; `Rank` uses one for the
+/// `ascending` flag and one for a runtime store address; `VectorElmMap` uses one
+/// for the runtime flat source index. Three covers every Phase-6 vector op.
+/// Reserved unconditionally -- unused i32 locals in a non-vector function are
+/// free.
+pub(crate) const VECTOR_I32_LOCAL_COUNT: u32 = 3;
+
+/// The first declared-local wasm index of an opcode-program function with
+/// `n_inputs` f64 module-input parameters. Param 0 is `module_off`; params
+/// `1..=n_inputs` are the module inputs (`LoadModuleInput { input }` reads param
+/// `input + 1`); declared locals begin at `1 + n_inputs`. For the root (and every
+/// Phase 1-6 single-module function) `n_inputs == 0`, so this is the historical
+/// index 1 (the scratch f64).
+fn first_local_index(n_inputs: u32) -> u32 {
+    1 + n_inputs
+}
+
+/// The local-declaration list for an opcode-program `Function` carrying
+/// `cond_depth` condition locals, `extra_i32` dynamic-subscript scratch locals,
+/// and `module_input_scratch` f64 locals for the `EvalModule` reverse-pop: one
+/// scratch f64, then `cond_depth` i32 condition locals, then
+/// [`APPLY_LOCAL_COUNT`] f64 `Apply` scratch locals, then
+/// [`VECTOR_F64_LOCAL_COUNT`] f64 vector-op scratch locals, then
+/// [`VECTOR_I32_LOCAL_COUNT`] i32 vector-op scratch locals, then `extra_i32` i32
+/// locals (Task 4 dynamic subscripts; 0 when the program has none), then
+/// `module_input_scratch` f64 locals (Phase 7 `EvalModule`; 0 when the program
+/// instantiates no submodule).
+///
+/// Defined once (and consumed by both `module.rs`'s function builders and the
+/// `#[cfg(test)]` harness) so the declared local *types and order* match the
+/// indices [`apply_locals_for`], [`vector_f64_locals_for`],
+/// [`vector_i32_locals_for`], [`extra_i32_local_base`], and
+/// [`module_input_scratch_base`] hand out. The declared locals always start at
+/// `1 + n_inputs` (past `module_off` and the f64 input params); the vector locals
+/// sit at a *fixed* offset (independent of `extra_i32`) so the dynamic-subscript
+/// extra i32s -- and the module-input scratch after them -- shift by a constant
+/// and never disturb the `apply_locals` indices.
+pub(crate) fn opcode_fn_locals(
+    cond_depth: usize,
+    extra_i32: u32,
+    module_input_scratch: u32,
+) -> Vec<(u32, ValType)> {
+    vec![
+        (1, ValType::F64),
+        (cond_depth as u32, ValType::I32),
+        (APPLY_LOCAL_COUNT, ValType::F64),
+        (VECTOR_F64_LOCAL_COUNT, ValType::F64),
+        (VECTOR_I32_LOCAL_COUNT, ValType::I32),
+        (extra_i32, ValType::I32),
+        (module_input_scratch, ValType::F64),
+    ]
+}
+
+/// The [`VECTOR_F64_LOCAL_COUNT`] vector-op scratch f64 local indices for a
+/// function with `n_inputs` module-input params and `cond_depth` condition
+/// locals. They follow the declared scratch f64 (index `1 + n_inputs`), the
+/// `cond_depth` i32 condition locals, and the [`APPLY_LOCAL_COUNT`] `Apply` f64s.
+/// Threaded into [`EmitCtx::vector_f64_locals`].
+pub(crate) fn vector_f64_locals_for(
+    n_inputs: u32,
+    cond_depth: usize,
+) -> [u32; VECTOR_F64_LOCAL_COUNT as usize] {
+    let base = first_local_index(n_inputs) + 1 + cond_depth as u32 + APPLY_LOCAL_COUNT;
+    [base, base + 1, base + 2, base + 3, base + 4]
+}
+
+/// The [`VECTOR_I32_LOCAL_COUNT`] vector-op scratch i32 local indices for a
+/// function with `n_inputs` module-input params and `cond_depth` condition
+/// locals. They follow the [`VECTOR_F64_LOCAL_COUNT`] vector-op f64s. Threaded
+/// into [`EmitCtx::vector_i32_locals`].
+pub(crate) fn vector_i32_locals_for(
+    n_inputs: u32,
+    cond_depth: usize,
+) -> [u32; VECTOR_I32_LOCAL_COUNT as usize] {
+    let base = first_local_index(n_inputs)
+        + 1
+        + cond_depth as u32
+        + APPLY_LOCAL_COUNT
+        + VECTOR_F64_LOCAL_COUNT;
+    [base, base + 1, base + 2]
+}
+
+/// First wasm local index of the `extra_i32` dynamic-subscript scratch locals for
+/// a function with `n_inputs` module-input params and `cond_depth` condition
+/// locals: past `module_off` + the `n_inputs` f64 input params, the scratch f64
+/// (index `1 + n_inputs`), the `cond_depth` i32 condition locals, the
+/// [`APPLY_LOCAL_COUNT`] `Apply` f64s, the [`VECTOR_F64_LOCAL_COUNT`] vector-op
+/// f64s, and the [`VECTOR_I32_LOCAL_COUNT`] vector-op i32s. Threaded into
+/// [`EmitCtx::extra_i32_local_base`] so the dynamic-subscript local allocator
+/// draws from exactly the declared range.
+pub(crate) fn extra_i32_local_base(n_inputs: u32, cond_depth: usize) -> u32 {
+    first_local_index(n_inputs)
+        + 1
+        + cond_depth as u32
+        + APPLY_LOCAL_COUNT
+        + VECTOR_F64_LOCAL_COUNT
+        + VECTOR_I32_LOCAL_COUNT
+}
+
+/// First wasm local index of the `module_input_scratch` f64 locals (Phase 7
+/// `EvalModule` reverse-pop) for a function with `n_inputs` module-input params,
+/// `cond_depth` condition locals, and `extra_i32` dynamic-subscript i32 locals.
+/// They follow the `extra_i32` block (the last i32 run), so this is
+/// [`extra_i32_local_base`]`+ extra_i32`. The `EvalModule` arm pops a child's
+/// inputs into the first `n` of these (where `n` is that call's `n_inputs`),
+/// matching the VM's reverse pop into `module_inputs`. Threaded into
+/// [`EmitCtx::module_input_scratch_base`].
+pub(crate) fn module_input_scratch_base(n_inputs: u32, cond_depth: usize, extra_i32: u32) -> u32 {
+    extra_i32_local_base(n_inputs, cond_depth) + extra_i32
+}
+
+/// The three `Apply` scratch f64 local indices `[a, b, c]` for a function with
+/// `n_inputs` module-input params and `cond_depth` condition locals. They follow
+/// `module_off` + the `n_inputs` f64 input params, the scratch f64
+/// (index `1 + n_inputs`), and the `cond_depth` i32 condition locals, so they
+/// start at `1 + n_inputs + 1 + cond_depth`. Mirrors the declaration order in
+/// [`opcode_fn_locals`].
+pub(crate) fn apply_locals_for(n_inputs: u32, cond_depth: usize) -> [u32; 3] {
+    let base = first_local_index(n_inputs) + 1 + cond_depth as u32;
+    [base, base + 1, base + 2]
+}
+
+/// The wasm local index of the assign-scratch f64 for a function with `n_inputs`
+/// module-input params: the first declared local (`1 + n_inputs`), past
+/// `module_off` and the f64 input params. Threaded into [`EmitCtx::scratch_local`].
+pub(crate) fn scratch_local_for(n_inputs: u32) -> u32 {
+    first_local_index(n_inputs)
+}
+
+/// The `cond_depth` condition-register local indices for a function with
+/// `n_inputs` module-input params. They follow the scratch f64 (index
+/// `1 + n_inputs`), so the first is `2 + n_inputs`. Threaded into
+/// [`EmitCtx::condition_locals`].
+pub(crate) fn condition_locals_for(n_inputs: u32, cond_depth: usize) -> Vec<u32> {
+    let base = first_local_index(n_inputs) + 1;
+    (0..cond_depth as u32).map(|i| base + i).collect()
+}
+
+pub(crate) fn max_condition_depth(bc: &ByteCode) -> usize {
+    let mut depth: usize = 0;
+    let mut max_depth: usize = 0;
+    for op in &bc.code {
+        match op {
+            Opcode::SetCond {} => {
+                depth += 1;
+                max_depth = max_depth.max(depth);
+            }
+            // `If` consumes the most-recently-set condition. Guard against an
+            // unbalanced program (which would indicate malformed bytecode)
+            // with a saturating decrement rather than an underflow panic.
+            Opcode::If {} => {
+                depth = depth.saturating_sub(1);
+            }
+            _ => {}
+        }
+    }
+    max_depth
+}
+
+/// Push the dynamic part of a module-relative slot address: `module_off * 8`.
+/// Combined with a constant `memarg.offset` of `chunk_base + off*8`, this
+/// addresses `chunk_base + (module_off + off) * 8`, matching the VM's
+/// `curr[module_off + off]` / `next[module_off + off]`.
+pub(crate) fn push_module_relative_base(ctx: &EmitCtx, f: &mut Function) {
+    f.instruction(&Instruction::LocalGet(ctx.module_off_local));
+    f.instruction(&Instruction::I32Const(SLOT_SIZE as i32));
+    f.instruction(&Instruction::I32Mul);
+}
+
+/// Byte offset of a slot within a chunk: `chunk_base + off*8`.
+fn slot_byte_offset(chunk_base: u32, off: u16) -> u64 {
+    u64::from(chunk_base) + u64::from(off) * u64::from(SLOT_SIZE)
+}
+
+/// Emit-time analogue of the VM's per-`eval_bytecode` mutable state
+/// (`vm.rs:1277-1288`): the compile-time view stack, the iteration / broadcast
+/// contexts, and the condition-register stack pointer. Threaded through
+/// [`emit_ops`] so an unrolled iteration body can be re-emitted at each
+/// compile-time index without re-deriving the view stack.
+struct EmitState {
+    /// Emit-time stack pointer into `ctx.condition_locals`, mirroring the VM's
+    /// single `condition` register but generalized to nested `If`s.
+    cond_sp: usize,
+    /// Compile-time analogue of the VM's runtime `view_stack`: the `Push*View` /
+    /// `View*` opcodes push/transform/pop `ViewDesc`s here, and the reducers read
+    /// the top descriptor. Because every static view's geometry is known at
+    /// compile time, this never materializes anything at runtime -- element
+    /// addresses are folded into the emitted reads.
+    view_stack: Vec<ViewDesc>,
+    /// Active (unrolled) iteration contexts, one per nested `BeginIter`. The
+    /// `current` field is the compile-time iteration index the unroller is
+    /// emitting (Task 3).
+    iter_stack: Vec<IterCtx>,
+    /// Active broadcast-iteration contexts (`BeginBroadcastIter`, Task 3).
+    broadcast_stack: Vec<BroadcastCtx>,
+    /// The legacy scalar dynamic-subscript accumulator (`PushSubscriptIndex` /
+    /// `LoadSubscript`, Task 4), mirroring the VM's `subscript_index` +
+    /// `subscript_index_valid` (`vm.rs:1287-1288`). Cleared by each
+    /// `LoadSubscript`.
+    subscript: SubscriptAccum,
+    /// Bump cursor for the function's extra i32 locals (Task 4). A dynamic
+    /// subscript draws fresh i32 locals from here; the count is pre-sized by
+    /// [`count_extra_i32_locals`], so this never exceeds the declared count.
+    next_i32_local: u32,
+    /// Cumulative count of unrolled element-emit "units" for the function being
+    /// lowered, checked against [`MAX_UNROLL_UNITS`] (see [`EmitState::charge_unroll`]).
+    /// Every full unroll -- a reducer fold, a `BeginIter`/`BeginBroadcastIter`
+    /// body re-emission -- charges its iteration count here. Nested iterations
+    /// multiply naturally: an inner site is reached once per outer iteration, so
+    /// each inner charge already reflects the outer multiplier. When the running
+    /// total would exceed the cap, lowering aborts with `Unsupported` so the
+    /// model cleanly falls back to the VM instead of emitting a multi-megabyte
+    /// function body that a wasm engine would reject.
+    unroll_units: usize,
+}
+
+/// Upper bound on the cumulative number of unrolled element-emit "units" per
+/// wasm function (one reducer-fold element, or one `BeginIter`/`BeginBroadcastIter`
+/// body re-emission, is one unit).
+///
+/// Every array reducer and iteration loop is fully unrolled at compile time
+/// (each element address becomes a wasm constant -- see [`emit_reduce_fold`] and
+/// the `BeginIter`/`BeginBroadcastIter` arms). Without a bound, a large arrayed
+/// model -- especially nested iterations whose counts multiply -- could emit a
+/// function body exceeding what wasm engines accept (V8, for instance, caps a
+/// single function near ~7.6 MB of bytecode; the spec's 4 GiB ceiling is
+/// academic). At a generous ~50 bytes of emitted code per unit, this cap bounds
+/// unroll-driven code at roughly 3 MB, comfortably under the strictest engine
+/// limit.
+///
+/// The value `65_536` (2^16) is the natural ceiling of a single `u16` array
+/// dimension (`ViewDesc::dims` entries are `u16`, so one dimension tops out at
+/// 65_535). Real system-dynamics arrays are tiny -- the test corpus's largest
+/// single dimension is 9, and even a region x sector x cohort nest is on the
+/// order of 10^3 elements -- so this leaves >60x headroom for legitimate models
+/// while rejecting pathological products (e.g. a `[300, 300]` view, 90_000
+/// elements) before any code is emitted.
+///
+/// future: a runtime wasm loop driven by a precomputed offset table (per the
+/// Phase 5 design's non-contiguous path) would lift this cap entirely, trading a
+/// constant-size loop body for the current fully-unrolled form.
+const MAX_UNROLL_UNITS: usize = 65_536;
+
+impl EmitState {
+    /// Charge `units` against the per-function unroll budget, returning
+    /// `Unsupported` (so the model falls back to the VM) if the running total
+    /// would exceed [`MAX_UNROLL_UNITS`]. Called *before* an unroll site emits
+    /// its body, so an over-budget model is rejected without ever materializing
+    /// the oversized function. `units` saturates rather than wrapping, so a
+    /// pathological multi-dimensional product can never alias back under the cap.
+    fn charge_unroll(&mut self, units: usize) -> Result<(), WasmGenError> {
+        self.unroll_units = self.unroll_units.saturating_add(units);
+        if self.unroll_units > MAX_UNROLL_UNITS {
+            return Err(WasmGenError::Unsupported(format!(
+                "wasmgen: array unrolling exceeds the per-function budget of \
+                 {MAX_UNROLL_UNITS} elements (a large arrayed model); falling back to the VM"
+            )));
+        }
+        Ok(())
+    }
+}
+
+/// The legacy scalar dynamic-subscript accumulator (Task 4). `PushSubscriptIndex`
+/// appends a `(runtime_index_local, bounds)` and folds OOB into `valid_local`;
+/// `LoadSubscript` collapses the indices into a flat offset and reads the slot
+/// (or NaN). Mirrors the VM's `subscript_index` SmallVec + `subscript_index_valid`
+/// flag (`vm.rs:1287-1288`, `1341-1366`).
+#[derive(Default)]
+struct SubscriptAccum {
+    /// `(runtime_index_local, bounds)` for each pushed index, in push order. The
+    /// local holds the *0-based* runtime index (i32); `bounds` is the dimension
+    /// size for the row-major fold.
+    indices: Vec<(u32, u16)>,
+    /// wasm i32 local that is 0 once any pushed index was out of bounds, else 1.
+    /// `None` until the first `PushSubscriptIndex` of an accumulation allocates
+    /// it (then seeded to 1).
+    valid_local: Option<u32>,
+}
+
+/// One active iteration context for the unrolled `BeginIter` loop (Task 3).
+struct IterCtx {
+    /// The view captured as the iteration source/geometry at `BeginIter`
+    /// (`view_stack.last()` then).
+    iter_view: ViewDesc,
+    /// Destination temp id for `StoreIterElement`, when `has_write_temp`.
+    write_temp_id: Option<u8>,
+    /// The compile-time iteration index currently being emitted (the unroller
+    /// re-emits the body once per `0..size`).
+    current: usize,
+}
+
+/// One active broadcast-iteration context (`BeginBroadcastIter`, Task 3),
+/// mirroring the VM's `BroadcastState` (`vm.rs:68-81`) but with the result
+/// geometry + per-source dim maps resolved at compile time.
+struct BroadcastCtx {
+    /// Per source (deepest-first): the source view and its `dim_map` (one entry
+    /// per result dimension; `Some(src_dim)` or `None` for a broadcast axis).
+    sources: Vec<(ViewDesc, Vec<Option<usize>>)>,
+    /// Destination temp id for `StoreBroadcastElement`.
+    dest_temp_id: u8,
+    /// Result dimension sizes (the union of all sources' dims, first-encounter
+    /// order), used to decompose `current` into per-result-dim indices.
+    result_dims: Vec<u16>,
+    /// The compile-time result index currently being emitted.
+    current: usize,
+}
+
+/// Lower one opcode program. Value-producing opcodes leave their f64 result on
+/// the wasm operand stack; the assignment opcodes emit a store and leave the
+/// stack empty, exactly as the VM's stack-machine arms do. `Ret` is a no-op
+/// here: the wasm function's terminating `End` is emitted by the caller.
+pub(crate) fn emit_bytecode(
+    bc: &ByteCode,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let mut state = EmitState {
+        cond_sp: 0,
+        view_stack: Vec::new(),
+        iter_stack: Vec::new(),
+        broadcast_stack: Vec::new(),
+        subscript: SubscriptAccum::default(),
+        next_i32_local: ctx.extra_i32_local_base,
+        unroll_units: 0,
+    };
+    emit_ops(&bc.code, &bc.literals, ctx, &mut state, f)
+}
+
+/// An upper bound on the extra i32 wasm locals a program's dynamic subscripts
+/// need (Task 4), so the function-builder can reserve them past the scratch /
+/// condition / `Apply` locals.
+///
+/// Each `ViewSubscriptDynamic` draws at most two fresh locals (a runtime-offset
+/// addend + a validity flag, allocated once per dynamically-subscripted view);
+/// each `PushSubscriptIndex` draws at most two (a 0-based index local + the
+/// shared validity flag of its accumulation). Counting two per opcode is a
+/// generous bound -- a real accumulation reuses one view's pair across several
+/// subscripts and one validity flag across several pushed indices -- but
+/// over-reserving unused i32 locals is free, and the bound keeps the reservation
+/// a single cheap pass with no dataflow.
+pub(crate) fn count_extra_i32_locals(bc: &ByteCode) -> u32 {
+    bc.code
+        .iter()
+        .filter(|op| {
+            matches!(
+                op,
+                Opcode::ViewSubscriptDynamic { .. } | Opcode::PushSubscriptIndex { .. }
+            )
+        })
+        .count() as u32
+        * 2
+}
+
+/// The number of scratch f64 wasm locals a program needs for the `EvalModule`
+/// reverse-pop (Phase 7): the maximum `n_inputs` over its `EvalModule` sites.
+///
+/// Each `EvalModule { n_inputs }` pops its `n_inputs` operands into scratch f64
+/// locals (in reverse, matching the VM) before pushing `child_module_off` and
+/// re-pushing them in order. Because the sites are emitted sequentially (each
+/// fully consumes its scratch before the next runs), reserving the *max* per-site
+/// count -- not the sum -- suffices, and successive sites reuse the same locals.
+/// Returns 0 for a program that instantiates no submodule.
+pub(crate) fn count_module_input_scratch(bc: &ByteCode) -> u32 {
+    bc.code
+        .iter()
+        .filter_map(|op| match op {
+            Opcode::EvalModule { n_inputs, .. } => Some(u32::from(*n_inputs)),
+            _ => None,
+        })
+        .max()
+        .unwrap_or(0)
+}
+
+/// Lower a (sub-)slice of opcodes, threading the emit-time [`EmitState`]. The
+/// top-level program is one call over the whole `code`; an unrolled `BeginIter`
+/// loop body (Task 3) re-enters here over the body sub-slice once per iteration
+/// index. A `pc`-based loop (rather than `for`) lets the iteration arms consume
+/// their structured `BeginIter..NextIterOrJump..EndIter` span and re-emit the
+/// body, mirroring the VM's `pc` loop without needing the `jump_back` delta.
+///
+/// `literals` is the program's shared literal pool (`LoadConstant` /
+/// `AssignConstCurr` index it); it is the same across every body re-emission.
+fn emit_ops(
+    code: &[Opcode],
+    literals: &[f64],
+    ctx: &EmitCtx,
+    state: &mut EmitState,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let mut pc = 0usize;
+    while pc < code.len() {
+        let op = &code[pc];
+        match op {
+            Opcode::LoadConstant { id } => {
+                let v = *literals.get(*id as usize).ok_or_else(|| {
+                    WasmGenError::Unsupported(format!(
+                        "wasmgen: LoadConstant literal id {id} out of range"
+                    ))
+                })?;
+                f.instruction(&f64_const(v));
+            }
+            Opcode::LoadVar { off } => {
+                push_module_relative_base(ctx, f);
+                f.instruction(&Instruction::F64Load(memarg(slot_byte_offset(
+                    ctx.curr_base,
+                    *off,
+                ))));
+            }
+            Opcode::LoadGlobalVar { off } => {
+                // Absolute slot: ignore module_off (slots 0..4 are global).
+                f.instruction(&Instruction::I32Const(0));
+                f.instruction(&Instruction::F64Load(memarg(slot_byte_offset(
+                    ctx.curr_base,
+                    *off,
+                ))));
+            }
+            Opcode::Op2 { op } => emit_op2(*op, ctx, f)?,
+            Opcode::Not {} => {
+                // The VM's `Not` is `(!is_truthy(r)) as f64`, which simplifies to
+                // `approx_eq(r, 0.0) as f64` (since `is_truthy = !approx_eq(·,0.0)`,
+                // the double negation cancels). So push `approx_eq(r, 0.0)` and
+                // widen the i32 1/0 to f64.
+                f.instruction(&f64_const(0.0));
+                emit_call_approx_eq(ctx, f);
+                f.instruction(&Instruction::F64ConvertI32U);
+            }
+            Opcode::SetCond {} => {
+                let local = *ctx.condition_locals.get(state.cond_sp).ok_or_else(|| {
+                    WasmGenError::Unsupported(
+                        "wasmgen: SetCond nesting exceeded reserved condition locals".to_string(),
+                    )
+                })?;
+                // Reduce the f64 condition to i32 truthiness, routing through
+                // `approx_eq` so a near-zero / ULP-adjacent condition takes the
+                // same branch the VM's `is_truthy(pop)` takes.
+                emit_is_truthy(ctx, f);
+                f.instruction(&Instruction::LocalSet(local));
+                state.cond_sp += 1;
+            }
+            Opcode::If {} => {
+                if state.cond_sp == 0 {
+                    return Err(WasmGenError::Unsupported(
+                        "wasmgen: If without a preceding SetCond".to_string(),
+                    ));
+                }
+                state.cond_sp -= 1;
+                let local = ctx.condition_locals[state.cond_sp];
+                // Stack holds [t, f] (the VM pops f then t and yields
+                // `if condition { t } else { f }`); wasm `select` pops
+                // [t, f, cond_i32] and yields t when cond != 0 else f -- exact.
+                f.instruction(&Instruction::LocalGet(local));
+                f.instruction(&Instruction::Select);
+            }
+            Opcode::AssignCurr { off } => {
+                emit_assign(ctx.curr_base, *off, ctx, f);
+            }
+            Opcode::AssignNext { off } => {
+                emit_assign(ctx.next_base, *off, ctx, f);
+            }
+            // `AssignConstCurr` reaches a `CompiledSimulation` by two routes
+            // (see the module docstring): `compiler::codegen` emits it directly
+            // for any constant-RHS `AssignCurr` (`codegen.rs:1164`), and the
+            // peephole pass also fuses `LoadConstant; AssignCurr` into it
+            // (`bytecode.rs:1830`). It is *not* a late-3-address fusion artifact,
+            // so it is part of the scalar core, not an Unsupported case. Every
+            // model with a constant initial/aux carries it. Mirrors the VM's
+            // `curr[module_off + off] = literals[literal_id]` (`vm.rs:1453`).
+            Opcode::AssignConstCurr { off, literal_id } => {
+                let v = *literals.get(*literal_id as usize).ok_or_else(|| {
+                    WasmGenError::Unsupported(format!(
+                        "wasmgen: AssignConstCurr literal id {literal_id} out of range"
+                    ))
+                })?;
+                // Nothing is on the stack; push the store address then the value
+                // (f64.store wants [addr_i32, value_f64]).
+                push_module_relative_base(ctx, f);
+                // Phase 7 Task 2: an overridable constant sources its value from
+                // the constants-override region (initialized to `v`, mutable via
+                // the exported `set_value`) instead of the immediate literal, so
+                // an override takes effect on every assignment -- exactly as the
+                // VM rewrites the bytecode literal. The const region is indexed by
+                // absolute slot, so the read uses the same `module_off`-relative
+                // addressing the slab does (`const_region_base + (module_off +
+                // off) * 8`); a shared module run at several `module_off`s thus
+                // picks up each instance's distinct override. A non-overridable
+                // constant emits its literal unchanged.
+                if ctx.flows_const_offsets.contains(off) {
+                    f.instruction(&Instruction::LocalGet(ctx.module_off_local));
+                    f.instruction(&Instruction::I32Const(SLOT_SIZE as i32));
+                    f.instruction(&Instruction::I32Mul);
+                    f.instruction(&Instruction::F64Load(memarg(slot_byte_offset(
+                        ctx.const_region_base,
+                        *off,
+                    ))));
+                } else {
+                    f.instruction(&f64_const(v));
+                }
+                f.instruction(&Instruction::F64Store(memarg(slot_byte_offset(
+                    ctx.curr_base,
+                    *off,
+                ))));
+            }
+            // Peephole fusions of `Op2; Assign{Curr,Next}`. Operands `[l, r]`
+            // are on the stack; apply the op (which errors cleanly on an
+            // unsupported operator) then store the result. Mirrors the VM's
+            // `curr/next[module_off + off] = eval_op2(op, l, r)` (`vm.rs:1457`,
+            // `vm.rs:1463`).
+            Opcode::BinOpAssignCurr { op, off } => {
+                emit_op2(*op, ctx, f)?;
+                emit_assign(ctx.curr_base, *off, ctx, f);
+            }
+            Opcode::BinOpAssignNext { op, off } => {
+                emit_op2(*op, ctx, f)?;
+                emit_assign(ctx.next_base, *off, ctx, f);
+            }
+            // `Apply` always pops exactly three operands (codegen pads short
+            // builtins with `LoadConstant 0.0` / `LoadGlobalVar{FINAL_TIME}`),
+            // mirroring the VM (`vm.rs:1701`). See [`emit_apply`].
+            Opcode::Apply { func } => emit_apply(*func, ctx, f),
+            // `Lookup` pops `index` then `element_offset`, bounds-checks the
+            // offset, and dispatches to the mode's helper over the table at
+            // `base_gf + element_offset` (`vm.rs:1710`). See [`emit_lookup`].
+            Opcode::Lookup {
+                base_gf,
+                table_count,
+                mode,
+            } => emit_lookup(*base_gf, *table_count, *mode, ctx, f),
+            // `LoadPrev` mirrors the VM (`vm.rs:1320-1328`): a fallback is
+            // already on the stack (codegen pushes it just before this opcode);
+            // yield it while `use_prev_fallback` is set, otherwise read
+            // `prev_values[module_off + off]`. The gate is the global flag, never
+            // a TIME comparison (RK moves TIME to trial points).
+            Opcode::LoadPrev { off } => emit_load_prev(*off, ctx, f),
+            // `LoadInitial` mirrors the VM (`vm.rs:1332-1340`), but its
+            // `part == Initials` branch is resolved at compile time from
+            // `ctx.step_part`: in the initials program read `curr[module_off+off]`
+            // (the value being computed); elsewhere read the post-initials
+            // `initial_values[module_off+off]` snapshot.
+            Opcode::LoadInitial { off } => emit_load_initial(*off, ctx, f),
+
+            // ── View-stack construction (Phase 5 Task 1) ──────────────────
+            // Each opcode pushes/transforms a compile-time `ViewDesc`, mirroring
+            // the VM's `view_stack` arms (`vm.rs:1739-1855`). No wasm is emitted:
+            // the geometry is folded into later element reads.
+            Opcode::PushStaticView { view_id } => {
+                let view = ctx.ctx.get_static_view(*view_id).ok_or_else(|| {
+                    WasmGenError::Unsupported(format!(
+                        "wasmgen: PushStaticView view_id {view_id} out of range"
+                    ))
+                })?;
+                state.view_stack.push(ViewDesc::from_static(view));
+            }
+            // `PushVarView` builds a full contiguous view over a variable array;
+            // the VM folds `module_off` into the base (`vm.rs:1749`), so the base
+            // is module-relative.
+            Opcode::PushVarView {
+                base_off,
+                dim_list_id,
+            } => {
+                let (dims, dim_ids) = resolve_dim_list_dims(ctx, *dim_list_id)?;
+                state.view_stack.push(ViewDesc::contiguous(
+                    u32::from(*base_off),
+                    ViewBase::CurrModuleRelative,
+                    dims,
+                    dim_ids,
+                ));
+            }
+            // `PushTempView` builds a full contiguous view over a temp array
+            // (`vm.rs:1757`).
+            Opcode::PushTempView {
+                temp_id,
+                dim_list_id,
+            } => {
+                let (dims, dim_ids) = resolve_dim_list_dims(ctx, *dim_list_id)?;
+                state.view_stack.push(ViewDesc::contiguous(
+                    u32::from(*temp_id),
+                    ViewBase::Temp,
+                    dims,
+                    dim_ids,
+                ));
+            }
+            // `PushVarViewDirect` builds a contiguous view from raw dim sizes
+            // (dim_ids all 0), the base for a dynamic subscript (`vm.rs:1776`).
+            // Module-relative, like `PushVarView`.
+            Opcode::PushVarViewDirect {
+                base_off,
+                dim_list_id,
+            } => {
+                let dims = resolve_dim_list_raw(ctx, *dim_list_id)?;
+                let n = dims.len();
+                state.view_stack.push(ViewDesc::contiguous(
+                    u32::from(*base_off),
+                    ViewBase::CurrModuleRelative,
+                    dims,
+                    vec![0u16; n],
+                ));
+            }
+
+            // ── View-stack transforms (Phase 5 Task 1) ────────────────────
+            Opcode::ViewSubscriptConst { dim_idx, index } => {
+                view_top_mut(&mut state.view_stack)?
+                    .apply_single_subscript(*dim_idx as usize, *index);
+            }
+            Opcode::ViewRange {
+                dim_idx,
+                start,
+                end,
+            } => {
+                view_top_mut(&mut state.view_stack)?.apply_range(*dim_idx as usize, *start, *end);
+            }
+            Opcode::ViewStarRange {
+                dim_idx,
+                subdim_relation_id,
+            } => {
+                let rel = ctx
+                    .ctx
+                    .subdim_relations
+                    .get(*subdim_relation_id as usize)
+                    .ok_or_else(|| {
+                        WasmGenError::Unsupported(format!(
+                            "wasmgen: ViewStarRange subdim_relation_id {subdim_relation_id} \
+                             out of range"
+                        ))
+                    })?;
+                let parent_offsets = rel.parent_offsets.to_vec();
+                let child_dim_id = rel.child_dim_id;
+                view_top_mut(&mut state.view_stack)?.apply_sparse(
+                    *dim_idx as usize,
+                    parent_offsets,
+                    child_dim_id,
+                );
+            }
+            // `ViewWildcard` is a no-op in the VM (`vm.rs:1839`): the dimension
+            // stays as-is.
+            Opcode::ViewWildcard { dim_idx: _ } => {}
+            Opcode::ViewTranspose {} => {
+                view_top_mut(&mut state.view_stack)?.transpose();
+            }
+            Opcode::PopView {} => {
+                state.view_stack.pop().ok_or_else(|| {
+                    WasmGenError::Unsupported("wasmgen: PopView on empty view stack".to_string())
+                })?;
+            }
+            Opcode::DupView {} => {
+                let top = view_top(&state.view_stack)?.clone();
+                state.view_stack.push(top);
+            }
+
+            // ── Dynamic view subscript (Phase 5 Task 4) ───────────────────
+            // `ViewSubscriptDynamic` pops a 1-based runtime index, bounds-checks
+            // it against the top view's `dims[dim_idx]`, and folds
+            // `(index-1)*strides[dim_idx]` into the descriptor's runtime offset
+            // local; OOB sets the validity flag to 0 so later reads yield NaN.
+            // Mirrors `RuntimeView::apply_single_subscript_checked` (`vm.rs:1797`,
+            // `bytecode.rs:242`).
+            Opcode::ViewSubscriptDynamic { dim_idx } => {
+                emit_view_subscript_dynamic(*dim_idx as usize, ctx, state, f)?;
+            }
+            // `ViewRangeDynamic` (`vm.rs:1815`) clamps a runtime `[start:end]`
+            // range, which yields a runtime *size*. The unrolled element
+            // addressing here folds every address at compile time, so a runtime
+            // range cannot be expressed; returning `Unsupported` keeps such a
+            // model `Skipped`. A literal range is constant-folded by codegen into
+            // the static `ViewRange` arm, so this is only reached by a true
+            // runtime range.
+            Opcode::ViewRangeDynamic { dim_idx } => {
+                return Err(WasmGenError::Unsupported(format!(
+                    "wasmgen: ViewRangeDynamic (dim {dim_idx}) needs a runtime view size; \
+                     not supported"
+                )));
+            }
+
+            // ── Legacy scalar dynamic subscript (Phase 5 Task 4) ──────────
+            // `PushSubscriptIndex` pops a 1-based index, range-checks it against
+            // `bounds`, and accumulates the 0-based runtime index; OOB clears the
+            // accumulation's validity flag. `LoadSubscript` folds the accumulated
+            // indices into a flat offset and reads `curr[module_off+off+flat]`
+            // (NaN when invalid). Mirrors `vm.rs:1341-1366`.
+            Opcode::PushSubscriptIndex { bounds } => {
+                emit_push_subscript_index(*bounds, state, f);
+            }
+            Opcode::LoadSubscript { off } => {
+                emit_load_subscript(*off, ctx, state, f);
+            }
+
+            // ── Temp element reads (Phase 5 Task 1) ───────────────────────
+            // `temp_storage[temp_offsets[temp_id] + index]` (`vm.rs:1860`).
+            Opcode::LoadTempConst { temp_id, index } => {
+                let addr = temp_element_byte_addr(ctx, *temp_id, u32::from(*index))?;
+                f.instruction(&Instruction::I32Const(0));
+                f.instruction(&Instruction::F64Load(memarg(addr)));
+            }
+            // `temp_storage[temp_offsets[temp_id] + index]` with a runtime index
+            // (`vm.rs:1866`): the VM does `stack.pop().floor() as usize`.
+            Opcode::LoadTempDynamic { temp_id } => {
+                emit_load_temp_dynamic(ctx, *temp_id, f)?;
+            }
+
+            // ── Array reducers (Phase 5 Task 2) ───────────────────────────
+            // Reduce over the TOP view descriptor (the production pattern is
+            // `PushStaticView; Array<Reduce>; PopView`, so the descriptor stays
+            // for the trailing `PopView`).
+            Opcode::ArraySum {}
+            | Opcode::ArrayMax {}
+            | Opcode::ArrayMin {}
+            | Opcode::ArrayMean {}
+            | Opcode::ArrayStddev {}
+            | Opcode::ArraySize {} => {
+                let view = view_top(&state.view_stack)?.clone();
+                // `ArraySize` emits no element reads (just `size() as f64`), so it
+                // is free; every other reducer unrolls a fold over `size()`
+                // elements, and `ArrayStddev` makes two passes (sum, then squared
+                // deviations). Charge that many units before emitting the fold.
+                if !matches!(op, Opcode::ArraySize {}) {
+                    let passes = if matches!(op, Opcode::ArrayStddev {}) {
+                        2
+                    } else {
+                        1
+                    };
+                    state.charge_unroll(view.size().saturating_mul(passes))?;
+                }
+                emit_array_reduce(op, &view, ctx, f)?;
+            }
+
+            // ── Body element reads inside an unrolled iteration (Task 3) ───
+            // Each reads view element `current` (the compile-time iteration index
+            // the unroller set on the active iter context) and pushes the f64.
+            Opcode::LoadIterElement {} => {
+                let iter = state.iter_stack.last().ok_or_else(|| {
+                    WasmGenError::Unsupported(
+                        "wasmgen: LoadIterElement outside an iteration".to_string(),
+                    )
+                })?;
+                // The iteration view is also the source: read element `current`.
+                let view = iter.iter_view.clone();
+                let current = iter.current;
+                emit_view_element_load(&view, current, ctx, f)?;
+            }
+            // `temp_storage[temp_offsets[temp_id] + current]` (`vm.rs:1939`).
+            Opcode::LoadIterTempElement { temp_id } => {
+                let current = current_iter_index(state)?;
+                let addr = temp_element_byte_addr(ctx, *temp_id, current as u32)?;
+                f.instruction(&Instruction::I32Const(0));
+                f.instruction(&Instruction::F64Load(memarg(addr)));
+            }
+            // Read `view_stack.last()` at `current`, broadcasting against the
+            // iteration view (`vm.rs:1946`). `LoadIterViewAt{offset}` reads
+            // `view_stack[len-offset]` instead (`vm.rs:2068`).
+            Opcode::LoadIterViewTop {} => {
+                emit_load_iter_view(state, 1, ctx, f)?;
+            }
+            Opcode::LoadIterViewAt { offset } => {
+                emit_load_iter_view(state, *offset as usize, ctx, f)?;
+            }
+            // Store the popped value into `temp_storage[temp_offsets[write_temp]
+            // + current]` (`vm.rs:2184`).
+            Opcode::StoreIterElement {} => {
+                let iter = state.iter_stack.last().ok_or_else(|| {
+                    WasmGenError::Unsupported(
+                        "wasmgen: StoreIterElement outside an iteration".to_string(),
+                    )
+                })?;
+                let write_temp_id = iter.write_temp_id.ok_or_else(|| {
+                    WasmGenError::Unsupported(
+                        "wasmgen: StoreIterElement without a write temp".to_string(),
+                    )
+                })?;
+                let current = iter.current;
+                emit_store_iter_element(ctx, write_temp_id, current, f)?;
+            }
+
+            // ── Iteration loop (Task 3): unroll BeginIter..EndIter ────────
+            // The body span between `BeginIter` and its `NextIterOrJump` is
+            // structured (codegen.rs:1183-1378) and well-nested, so rather than a
+            // runtime wasm loop with the `jump_back` PC delta, the body is fully
+            // unrolled over the compile-time `size()` -- every element address is
+            // then a compile-time constant via `emit_view_element_load`, matching
+            // the array reducer's unrolled fold (Task 2) and the VM element-for-
+            // element. The captured iter view is `view_stack.last()` at `BeginIter`
+            // (`vm.rs:1880`).
+            Opcode::BeginIter {
+                write_temp_id,
+                has_write_temp,
+            } => {
+                let iter_view = view_top(&state.view_stack)?.clone();
+                let write_temp_id = if *has_write_temp {
+                    Some(*write_temp_id)
+                } else {
+                    None
+                };
+                let size = iter_view.size();
+                let (body, end_pc) = iter_span(code, pc, IterKind::Iter)?;
+                // Re-emitting the body once per element is `size` units of
+                // unrolling; charge it before the loop so an over-budget model is
+                // rejected without materializing the oversized body. Nested
+                // iterations multiply naturally: this arm is reached once per
+                // outer iteration, so each inner charge already carries the outer
+                // multiplier.
+                state.charge_unroll(size)?;
+                for current in 0..size {
+                    state.iter_stack.push(IterCtx {
+                        iter_view: iter_view.clone(),
+                        write_temp_id,
+                        current,
+                    });
+                    emit_ops(body, literals, ctx, state, f)?;
+                    state.iter_stack.pop();
+                }
+                pc = end_pc;
+                continue;
+            }
+            // `NextIterOrJump`/`EndIter` are consumed by the `BeginIter` unroll
+            // (the body slice excludes the `NextIterOrJump`, and `pc` is advanced
+            // past `EndIter`), so reaching one here means malformed bytecode.
+            Opcode::NextIterOrJump { .. } | Opcode::EndIter {} => {
+                return Err(WasmGenError::Unsupported(
+                    "wasmgen: NextIterOrJump/EndIter without a matching BeginIter".to_string(),
+                ));
+            }
+
+            // ── Broadcast iteration (Task 3): unroll over the union geometry ──
+            // `BeginBroadcastIter` unions the `n_sources` views' dim_ids into the
+            // result geometry, building a per-source dim map (`vm.rs:2314`); the
+            // body is then unrolled over the result size, mirroring
+            // `LoadBroadcastElement` / `StoreBroadcastElement`.
+            Opcode::BeginBroadcastIter {
+                n_sources,
+                dest_temp_id,
+            } => {
+                let bctx = build_broadcast_ctx(state, *n_sources as usize, *dest_temp_id)?;
+                let size: usize = bctx.result_dims.iter().map(|&d| d as usize).product();
+                let (body, end_pc) = iter_span(code, pc, IterKind::Broadcast)?;
+                // Same unroll accounting as `BeginIter`: the body is re-emitted
+                // once per element of the broadcast result geometry.
+                state.charge_unroll(size)?;
+                for current in 0..size {
+                    state.broadcast_stack.push(BroadcastCtx {
+                        sources: bctx.sources.clone(),
+                        dest_temp_id: bctx.dest_temp_id,
+                        result_dims: bctx.result_dims.clone(),
+                        current,
+                    });
+                    emit_ops(body, literals, ctx, state, f)?;
+                    state.broadcast_stack.pop();
+                }
+                pc = end_pc;
+                continue;
+            }
+            Opcode::LoadBroadcastElement { source_idx } => {
+                emit_load_broadcast_element(state, *source_idx as usize, ctx, f)?;
+            }
+            Opcode::StoreBroadcastElement {} => {
+                let bc_ctx = state.broadcast_stack.last().ok_or_else(|| {
+                    WasmGenError::Unsupported(
+                        "wasmgen: StoreBroadcastElement outside a broadcast iteration".to_string(),
+                    )
+                })?;
+                let dest_temp_id = bc_ctx.dest_temp_id;
+                let current = bc_ctx.current;
+                emit_store_iter_element(ctx, dest_temp_id, current, f)?;
+            }
+            Opcode::NextBroadcastOrJump { .. } | Opcode::EndBroadcastIter {} => {
+                return Err(WasmGenError::Unsupported(
+                    "wasmgen: NextBroadcastOrJump/EndBroadcastIter without a matching \
+                     BeginBroadcastIter"
+                        .to_string(),
+                ));
+            }
+
+            // ── Vector operations (Phase 6) ───────────────────────────────
+            // Each reads its inputs from the compile-time view stack (top =
+            // last) + the operand stack and writes its result array to its
+            // `write_temp_id` temp region -- except `VectorSelect`, which
+            // reduces to ONE scalar pushed on the stack. The view-stack reads +
+            // unroll-budget charging happen here (where `EmitState` lives); the
+            // wasm emission lives in `super::vector`, mirroring the matching VM
+            // arm element-for-element. The reducers leave the view descriptor on
+            // the stack for the trailing `PopView`, exactly like the Task-2
+            // reducers (the production pattern is `Push*View; <op>; PopView`).
+            Opcode::VectorSelect {} => {
+                // expr_view = top, sel_view = top-1 (vm.rs:2448-2449).
+                let n = state.view_stack.len();
+                if n < 2 {
+                    return Err(WasmGenError::Unsupported(
+                        "wasmgen: VectorSelect needs two views on the stack".to_string(),
+                    ));
+                }
+                let expr_view = state.view_stack[n - 1].clone();
+                let sel_view = state.view_stack[n - 2].clone();
+                // The gather unrolls over `min(sel, expr)` elements.
+                let size = sel_view.size().min(expr_view.size());
+                state.charge_unroll(size)?;
+                super::vector::emit_vector_select(&sel_view, &expr_view, ctx, f)?;
+            }
+            Opcode::VectorElmMap {
+                write_temp_id,
+                full_source_len,
+            } => {
+                // offset_view = top, source_view = top-1 (vm_vector_elm_map.rs).
+                let n = state.view_stack.len();
+                if n < 2 {
+                    return Err(WasmGenError::Unsupported(
+                        "wasmgen: VectorElmMap needs two views on the stack".to_string(),
+                    ));
+                }
+                let offset_view = state.view_stack[n - 1].clone();
+                let source_view = state.view_stack[n - 2].clone();
+                // The per-element map unrolls over the offset view's size.
+                state.charge_unroll(offset_view.size())?;
+                super::vector::emit_vector_elm_map(
+                    &source_view,
+                    &offset_view,
+                    *write_temp_id,
+                    *full_source_len,
+                    ctx,
+                    f,
+                )?;
+            }
+            Opcode::VectorSortOrder { write_temp_id } => {
+                let input_view = view_top(&state.view_stack)?.clone();
+                // Gather + scatter both unroll over `size`; the sort itself is a
+                // runtime loop in the `stable_sort` helper.
+                state.charge_unroll(input_view.size())?;
+                super::vector::emit_vector_sort_order(&input_view, *write_temp_id, ctx, f)?;
+            }
+            Opcode::Rank { write_temp_id } => {
+                let input_view = view_top(&state.view_stack)?.clone();
+                state.charge_unroll(input_view.size())?;
+                super::vector::emit_rank(&input_view, *write_temp_id, ctx, f)?;
+            }
+            Opcode::LookupArray {
+                base_gf,
+                table_count,
+                mode,
+                write_temp_id,
+            } => {
+                let input_view = view_top(&state.view_stack)?.clone();
+                state.charge_unroll(input_view.size())?;
+                super::vector::emit_lookup_array(
+                    &input_view,
+                    *base_gf,
+                    *table_count,
+                    *mode,
+                    *write_temp_id,
+                    ctx,
+                    f,
+                )?;
+            }
+            Opcode::AllocateAvailable { write_temp_id } => {
+                // profile_view = top, requests_view = top-1 (vm.rs:2634-2635).
+                let n_views = state.view_stack.len();
+                if n_views < 2 {
+                    return Err(WasmGenError::Unsupported(
+                        "wasmgen: AllocateAvailable needs two views on the stack".to_string(),
+                    ));
+                }
+                let profile_view = state.view_stack[n_views - 1].clone();
+                let requests_view = state.view_stack[n_views - 2].clone();
+                // Gather (requests) + profile reads + output copy unroll over
+                // n + 4n + n element-emits.
+                let n = requests_view.size();
+                state.charge_unroll(n.saturating_mul(6))?;
+                super::alloc::emit_allocate_available_op(
+                    &requests_view,
+                    &profile_view,
+                    *write_temp_id,
+                    ctx,
+                    f,
+                )?;
+            }
+            Opcode::AllocateByPriority { write_temp_id } => {
+                // priority_view = top, requests_view = top-1 (vm.rs:2728-2729).
+                let n_views = state.view_stack.len();
+                if n_views < 2 {
+                    return Err(WasmGenError::Unsupported(
+                        "wasmgen: AllocateByPriority needs two views on the stack".to_string(),
+                    ));
+                }
+                let priority_view = state.view_stack[n_views - 1].clone();
+                let requests_view = state.view_stack[n_views - 2].clone();
+                let n = requests_view.size();
+                state.charge_unroll(n.saturating_mul(6))?;
+                super::alloc::emit_allocate_by_priority_op(
+                    &requests_view,
+                    &priority_view,
+                    *write_temp_id,
+                    ctx,
+                    f,
+                )?;
+            }
+            // `LoadModuleInput { input }` mirrors the VM (`vm.rs:1376-1378`:
+            // `stack.push(module_inputs[input])`). The instance's inputs are wasm
+            // params `1..=n_inputs` (param 0 is `module_off`), so input `input` is
+            // at local `input + 1`.
+            Opcode::LoadModuleInput { input } => {
+                f.instruction(&Instruction::LocalGet(u32::from(*input) + 1));
+            }
+            // `EvalModule { id, n_inputs }` mirrors the VM (`vm.rs:1379-1443`):
+            // pop the `n_inputs` operands into scratch (in reverse), resolve the
+            // child instance, and `call` its function for the current `StepPart`,
+            // passing `module_off + decl.off` and the inputs in order.
+            Opcode::EvalModule { id, n_inputs } => {
+                emit_eval_module(*id, *n_inputs, ctx, f)?;
+            }
+            Opcode::Ret => {
+                // The caller emits the function's terminating `End`.
+            }
+            other => return Err(WasmGenError::Unsupported(unsupported_opcode(other))),
+        }
+        pc += 1;
+    }
+    Ok(())
+}
+
+impl EmitState {
+    /// Hand out the next fresh i32 wasm local (Task 4 dynamic subscripts). The
+    /// count is pre-reserved by [`count_extra_i32_locals`], so this never exceeds
+    /// the function's declared locals.
+    fn alloc_i32_local(&mut self) -> u32 {
+        let idx = self.next_i32_local;
+        self.next_i32_local += 1;
+        idx
+    }
+}
+
+/// The compile-time iteration index of the innermost active iteration context,
+/// erroring on a body opcode that appeared outside any iteration.
+fn current_iter_index(state: &EmitState) -> Result<usize, WasmGenError> {
+    state.iter_stack.last().map(|it| it.current).ok_or_else(|| {
+        WasmGenError::Unsupported("wasmgen: iteration body opcode outside an iteration".to_string())
+    })
+}
+
+/// Which structured iteration the body span belongs to: a `BeginIter` loop or a
+/// `BeginBroadcastIter` loop. Each has its own begin/next/end opcode triple, but
+/// the well-nested span scan is identical.
+#[derive(Clone, Copy, PartialEq, Eq)]
+enum IterKind {
+    Iter,
+    Broadcast,
+}
+
+/// Given the `pc` of a `BeginIter` / `BeginBroadcastIter`, return the body slice
+/// (the opcodes after the begin, up to but excluding its `NextIterOrJump` /
+/// `NextBroadcastOrJump`) and the pc *after* the matching `EndIter` /
+/// `EndBroadcastIter` (where the outer loop resumes).
+///
+/// The span is well-nested (codegen always emits `begin .. next .. end`), so a
+/// nested loop of the *same* kind is skipped by depth tracking: `begin` raises
+/// the depth and `end` lowers it; the matching `next` is the one at depth 0.
+/// A loop of the *other* kind cannot appear inside (codegen never interleaves
+/// the two families), but its begin/end would not affect this kind's depth, and
+/// its `next` is not this kind's `next`, so the scan is still correct.
+fn iter_span(
+    code: &[Opcode],
+    begin_pc: usize,
+    kind: IterKind,
+) -> Result<(&[Opcode], usize), WasmGenError> {
+    let is_begin = |op: &Opcode| match kind {
+        IterKind::Iter => matches!(op, Opcode::BeginIter { .. }),
+        IterKind::Broadcast => matches!(op, Opcode::BeginBroadcastIter { .. }),
+    };
+    let is_next = |op: &Opcode| match kind {
+        IterKind::Iter => matches!(op, Opcode::NextIterOrJump { .. }),
+        IterKind::Broadcast => matches!(op, Opcode::NextBroadcastOrJump { .. }),
+    };
+    let is_end = |op: &Opcode| match kind {
+        IterKind::Iter => matches!(op, Opcode::EndIter {}),
+        IterKind::Broadcast => matches!(op, Opcode::EndBroadcastIter {}),
+    };
+
+    let body_start = begin_pc + 1;
+    let mut depth = 0usize;
+    let mut i = body_start;
+    let mut body_end: Option<usize> = None;
+    while i < code.len() {
+        let op = &code[i];
+        if is_begin(op) {
+            depth += 1;
+        } else if is_next(op) {
+            if depth == 0 {
+                body_end = Some(i);
+                break;
+            }
+        } else if is_end(op) {
+            // `end` closes the most recent nested `begin` of this kind. The
+            // outermost (depth-0) `end` is reached only *after* our `next`, so a
+            // saturating decrement is safe.
+            depth = depth.saturating_sub(1);
+        }
+        i += 1;
+    }
+    let body_end = body_end.ok_or_else(|| {
+        WasmGenError::Unsupported("wasmgen: iteration with no matching Next opcode".to_string())
+    })?;
+    // The `end` opcode immediately follows the (depth-0) `next`.
+    let end_idx = body_end + 1;
+    if end_idx >= code.len() || !is_end(&code[end_idx]) {
+        return Err(WasmGenError::Unsupported(
+            "wasmgen: iteration Next not immediately followed by End".to_string(),
+        ));
+    }
+    Ok((&code[body_start..body_end], end_idx + 1))
+}
+
+/// Lower `LoadIterViewTop` (`stack_offset == 1`) / `LoadIterViewAt { offset }`:
+/// read `view_stack[len - stack_offset]` at the innermost iteration's `current`,
+/// broadcasting against the captured iteration view (`vm.rs:1946-2182`). An
+/// invalid source view, a source smaller than the iteration, or an unmatched
+/// dimension pushes NaN, exactly as the VM does.
+fn emit_load_iter_view(
+    state: &EmitState,
+    stack_offset: usize,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let iter = state.iter_stack.last().ok_or_else(|| {
+        WasmGenError::Unsupported("wasmgen: LoadIterView* outside an iteration".to_string())
+    })?;
+    if stack_offset == 0 || stack_offset > state.view_stack.len() {
+        return Err(WasmGenError::Unsupported(
+            "wasmgen: LoadIterView* stack offset out of range".to_string(),
+        ));
+    }
+    let source = &state.view_stack[state.view_stack.len() - stack_offset];
+    // The broadcast index mapping is resolved at compile time; `None` means the
+    // VM would push NaN for this (source-element, iteration-index) pair.
+    match source.iter_broadcast_offset(&iter.iter_view, iter.current, ctx.ctx) {
+        Some(flat) => emit_view_offset_load(source, flat, ctx, f),
+        None => {
+            f.instruction(&f64_const(f64::NAN));
+            Ok(())
+        }
+    }
+}
+
+/// Store the f64 already on the wasm stack into `temp_storage[temp_offsets[
+/// temp_id] + index]` (the `StoreIterElement` / `StoreBroadcastElement` write).
+/// `f64.store` wants `[addr_i32, value_f64]`, so park the value in the scratch
+/// local, push the constant address, then reload the value.
+fn emit_store_iter_element(
+    ctx: &EmitCtx,
+    temp_id: u8,
+    index: usize,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let addr = temp_element_byte_addr(ctx, temp_id, index as u32)?;
+    f.instruction(&Instruction::LocalSet(ctx.scratch_local));
+    f.instruction(&Instruction::I32Const(0));
+    f.instruction(&Instruction::LocalGet(ctx.scratch_local));
+    f.instruction(&Instruction::F64Store(memarg(addr)));
+    Ok(())
+}
+
+/// Build the compile-time broadcast context for a `BeginBroadcastIter`,
+/// mirroring the VM's `BeginBroadcastIter` arm (`vm.rs:2314-2373`): union the
+/// `n_sources` deepest views' dim_ids into the result geometry (first-encounter
+/// order), then build each source's `dim_map` (result dim -> source dim, or
+/// `None` for a broadcast axis).
+fn build_broadcast_ctx(
+    state: &EmitState,
+    n_sources: usize,
+    dest_temp_id: u8,
+) -> Result<BroadcastCtx, WasmGenError> {
+    if n_sources == 0 || n_sources > state.view_stack.len() {
+        return Err(WasmGenError::Unsupported(
+            "wasmgen: BeginBroadcastIter source count out of range".to_string(),
+        ));
+    }
+    let base = state.view_stack.len() - n_sources;
+    let sources_slice = &state.view_stack[base..];
+
+    // Result dim ids/sizes: the union over all sources, first-encounter order.
+    let mut result_dim_ids: Vec<u16> = Vec::new();
+    let mut result_dims: Vec<u16> = Vec::new();
+    for view in sources_slice {
+        for (d, &dim_id) in view.dim_ids.iter().enumerate() {
+            if !result_dim_ids.contains(&dim_id) {
+                result_dim_ids.push(dim_id);
+                result_dims.push(view.dims[d]);
+            }
+        }
+    }
+
+    // Per source: dim_map[result_dim] = Some(src_dim) by exact dim-id match, else
+    // None (the source broadcasts along that axis).
+    let mut sources: Vec<(ViewDesc, Vec<Option<usize>>)> = Vec::with_capacity(n_sources);
+    for view in sources_slice {
+        let dim_map: Vec<Option<usize>> = result_dim_ids
+            .iter()
+            .map(|&rid| view.dim_ids.iter().position(|&id| id == rid))
+            .collect();
+        sources.push((view.clone(), dim_map));
+    }
+
+    Ok(BroadcastCtx {
+        sources,
+        dest_temp_id,
+        result_dims,
+        current: 0,
+    })
+}
+
+/// Lower `LoadBroadcastElement { source_idx }`, mirroring the VM
+/// (`vm.rs:2375-2414`): decompose the broadcast `current` into per-result-dim
+/// indices, scatter them into the source's dimension order through its
+/// `dim_map`, then read the source element. An invalid source view pushes NaN.
+fn emit_load_broadcast_element(
+    state: &EmitState,
+    source_idx: usize,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let bc_ctx = state.broadcast_stack.last().ok_or_else(|| {
+        WasmGenError::Unsupported(
+            "wasmgen: LoadBroadcastElement outside a broadcast iteration".to_string(),
+        )
+    })?;
+    let (source, dim_map) = bc_ctx.sources.get(source_idx).ok_or_else(|| {
+        WasmGenError::Unsupported(
+            "wasmgen: LoadBroadcastElement source_idx out of range".to_string(),
+        )
+    })?;
+
+    // Decompose the result `current` into per-result-dim indices (row-major).
+    let n_result = bc_ctx.result_dims.len();
+    let mut result_indices = vec![0u16; n_result];
+    let mut remaining = bc_ctx.current;
+    for d in (0..n_result).rev() {
+        let dim = bc_ctx.result_dims[d] as usize;
+        result_indices[d] = (remaining % dim) as u16;
+        remaining /= dim;
+    }
+
+    // Scatter into the source's dimension order: ordered[src_dim] =
+    // result_indices[result_dim] for each mapped axis (`vm.rs:2395-2402`).
+    let mut ordered = vec![0u16; source.dims.len()];
+    for (result_dim, mapped) in dim_map.iter().enumerate() {
+        if let Some(src_dim) = mapped {
+            ordered[*src_dim] = result_indices[result_dim];
+        }
+    }
+    let flat = source.flat_offset_for_indices(&ordered);
+    let source = source.clone();
+    emit_view_offset_load(&source, flat, ctx, f)
+}
+
+/// Lower `ViewSubscriptDynamic { dim_idx }` (Task 4): pop the 1-based runtime
+/// index off the wasm stack, bounds-check it against the top view's
+/// `dims[dim_idx]`, and fold `(index-1) * strides[dim_idx]` into the view's
+/// runtime-offset local; an out-of-bounds index clears the view's validity flag.
+/// The *shape* change (dropping `dim_idx`) is compile-time; only the offset
+/// addend and validity are runtime. Mirrors `apply_single_subscript_checked`
+/// (`bytecode.rs:242`) + `apply_single_subscript` (`bytecode.rs:326`).
+fn emit_view_subscript_dynamic(
+    dim_idx: usize,
+    ctx: &EmitCtx,
+    state: &mut EmitState,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    use Instruction as Ins;
+
+    // Read the geometry (stride/bound) before mutating the descriptor's shape.
+    let view = view_top(&state.view_stack)?;
+    let dim_size = view.dim_at(dim_idx).ok_or_else(|| {
+        WasmGenError::Unsupported(format!(
+            "wasmgen: ViewSubscriptDynamic dim {dim_idx} out of range"
+        ))
+    })?;
+    let stride = view.stride_at(dim_idx).ok_or_else(|| {
+        WasmGenError::Unsupported(format!(
+            "wasmgen: ViewSubscriptDynamic dim {dim_idx} out of range"
+        ))
+    })?;
+    // Snapshot the (Copy) runtime-offset/validity locals so the borrow of `view`
+    // ends here, freeing `state` for the mutable re-borrow in the allocate path.
+    let existing_locals = (view.runtime_off_local, view.valid_local);
+
+    // Lazily allocate (and initialize) the view's runtime-offset + validity
+    // locals on its first dynamic subscript: offset 0, valid 1. The two locals
+    // are always set together (below), so once one is present so is the other --
+    // the `else unreachable!` makes that invariant explicit rather than relying
+    // on a bare `.unwrap()` pair.
+    let (off_local, valid_local) = match existing_locals {
+        (Some(off), Some(valid)) => (off, valid),
+        (Some(_), None) | (None, Some(_)) => unreachable!(
+            "wasmgen: a dynamically-subscripted view sets runtime_off_local and \
+             valid_local together; exactly one was present"
+        ),
+        (None, None) => {
+            let off_local = state.alloc_i32_local();
+            let valid_local = state.alloc_i32_local();
+            f.instruction(&Ins::I32Const(0));
+            f.instruction(&Ins::LocalSet(off_local));
+            f.instruction(&Ins::I32Const(1));
+            f.instruction(&Ins::LocalSet(valid_local));
+            let view = view_top_mut(&mut state.view_stack)?;
+            view.runtime_off_local = Some(off_local);
+            view.valid_local = Some(valid_local);
+            (off_local, valid_local)
+        }
+    };
+
+    // Park the popped f64 index in the scratch f64 local (free at an opcode
+    // boundary) so it can be read twice (bounds check + offset).
+    f.instruction(&Ins::LocalSet(ctx.scratch_local));
+
+    // in_bounds = (idx >= 1.0) & (idx <= dim_size). The VM floors the index, but
+    // the bound test is on the popped value; using the value directly (>= 1.0,
+    // <= dim_size) matches `index_1based == 0 || index_1based > dims[dim_idx]`
+    // on the floored u16 for any non-negative index, and a negative index fails
+    // `>= 1.0`. valid &= in_bounds (validity is sticky-false, like the VM).
+    f.instruction(&Ins::LocalGet(valid_local));
+    f.instruction(&Ins::LocalGet(ctx.scratch_local));
+    f.instruction(&Ins::F64Floor);
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Ge); // floor(idx) >= 1
+    f.instruction(&Ins::LocalGet(ctx.scratch_local));
+    f.instruction(&Ins::F64Floor);
+    f.instruction(&f64_const(f64::from(dim_size)));
+    f.instruction(&Ins::F64Le); // floor(idx) <= dim_size
+    f.instruction(&Ins::I32And);
+    f.instruction(&Ins::I32And); // valid & in_bounds
+    f.instruction(&Ins::LocalSet(valid_local));
+
+    // off_local += (floor(idx) as i32 - 1) * stride. Folded unconditionally: when
+    // invalid the read is NaN-gated, so the (possibly bogus) offset is never used.
+    f.instruction(&Ins::LocalGet(off_local));
+    f.instruction(&Ins::LocalGet(ctx.scratch_local));
+    f.instruction(&Ins::F64Floor);
+    f.instruction(&Ins::I32TruncSatF64S);
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Sub); // index - 1 (0-based)
+    f.instruction(&Ins::I32Const(stride));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::LocalSet(off_local));
+
+    // Drop the subscripted dimension from the compile-time shape.
+    let view = view_top_mut(&mut state.view_stack)?;
+    view.apply_single_subscript_dynamic(dim_idx)
+        .ok_or_else(|| {
+            WasmGenError::Unsupported(
+                "wasmgen: ViewSubscriptDynamic on a sparse/out-of-range dimension".to_string(),
+            )
+        })?;
+    Ok(())
+}
+
+/// Lower `PushSubscriptIndex { bounds }` (Task 4, legacy scalar subscript): pop
+/// the 1-based runtime index, range-check it against `bounds`, and accumulate
+/// its 0-based value in a fresh i32 local for the eventual `LoadSubscript` fold.
+/// An out-of-bounds index clears the accumulation's shared validity flag.
+/// Mirrors `vm.rs:1341-1349`.
+fn emit_push_subscript_index(bounds: u16, state: &mut EmitState, f: &mut Function) {
+    use Instruction as Ins;
+
+    // Allocate the shared validity flag on the first index of an accumulation
+    // (init 1 = valid). Subsequent indices reuse it.
+    let valid_local = match state.subscript.valid_local {
+        Some(v) => v,
+        None => {
+            let v = state.alloc_i32_local();
+            f.instruction(&Ins::I32Const(1));
+            f.instruction(&Ins::LocalSet(v));
+            state.subscript.valid_local = Some(v);
+            v
+        }
+    };
+
+    // A fresh i32 local holds this index's 0-based value until LoadSubscript
+    // folds it (several PushSubscriptIndex precede one LoadSubscript).
+    let idx_local = state.alloc_i32_local();
+
+    // idx_i32 = floor(pop) as i32 (the 1-based index).
+    f.instruction(&Ins::F64Floor);
+    f.instruction(&Ins::I32TruncSatF64S);
+    // Keep a copy for the bounds check (LocalTee leaves it on the stack).
+    f.instruction(&Ins::LocalTee(idx_local));
+
+    // in_bounds = (idx >= 1) & (idx <= bounds). The VM's test is
+    // `index == 0 || index > bounds` on a u16 (so a 0 or negative index, which
+    // `floor as i32` yields <= 0, also fails `>= 1`). valid &= in_bounds.
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32GeS); // idx >= 1
+    f.instruction(&Ins::LocalGet(idx_local));
+    f.instruction(&Ins::I32Const(i32::from(bounds)));
+    f.instruction(&Ins::I32LeS); // idx <= bounds
+    f.instruction(&Ins::I32And);
+    f.instruction(&Ins::LocalGet(valid_local));
+    f.instruction(&Ins::I32And);
+    f.instruction(&Ins::LocalSet(valid_local));
+
+    // Store the 0-based index (idx - 1) for the fold.
+    f.instruction(&Ins::LocalGet(idx_local));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::LocalSet(idx_local));
+
+    state.subscript.indices.push((idx_local, bounds));
+}
+
+/// Lower `LoadSubscript { off }` (Task 4, legacy scalar subscript): fold the
+/// accumulated 0-based runtime indices into a row-major flat offset and push
+/// `curr[module_off + off + flat]`, or NaN when the accumulation is invalid.
+/// Mirrors `vm.rs:1351-1366`: `flat = 0; for (i, b) in indices { flat = flat*b
+/// + i }`. Clears the accumulator.
+fn emit_load_subscript(off: u16, ctx: &EmitCtx, state: &mut EmitState, f: &mut Function) {
+    use Instruction as Ins;
+    use wasm_encoder::BlockType;
+
+    let indices = std::mem::take(&mut state.subscript.indices);
+    let valid_local = state.subscript.valid_local.take();
+
+    let emit_load = |f: &mut Function| {
+        // Dynamic address part = (module_off + flat) * 8, where the row-major
+        // fold is `flat = (((i0)*b1 + i1)*b2 + i2)...` (the VM multiplies the
+        // running index by each entry's bound then adds the entry's index).
+        f.instruction(&Ins::LocalGet(ctx.module_off_local));
+        // flat fold:
+        if indices.is_empty() {
+            f.instruction(&Ins::I32Const(0));
+        } else {
+            // Start with i0.
+            f.instruction(&Ins::LocalGet(indices[0].0));
+            for (idx_local, bounds) in &indices[1..] {
+                f.instruction(&Ins::I32Const(i32::from(*bounds)));
+                f.instruction(&Ins::I32Mul);
+                f.instruction(&Ins::LocalGet(*idx_local));
+                f.instruction(&Ins::I32Add);
+            }
+        }
+        f.instruction(&Ins::I32Add); // module_off + flat
+        f.instruction(&Ins::I32Const(SLOT_SIZE as i32));
+        f.instruction(&Ins::I32Mul); // (module_off + flat) * 8
+        f.instruction(&Ins::F64Load(memarg(slot_byte_offset(ctx.curr_base, off))));
+    };
+
+    match valid_local {
+        Some(valid_local) => {
+            // if valid == 0 { NaN } else { load }
+            f.instruction(&Ins::LocalGet(valid_local));
+            f.instruction(&Ins::I32Eqz);
+            f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+            f.instruction(&f64_const(f64::NAN));
+            f.instruction(&Ins::Else);
+            emit_load(f);
+            f.instruction(&Ins::End);
+        }
+        // No PushSubscriptIndex preceded this (a 0-dim subscript): always valid.
+        None => emit_load(f),
+    }
+}
+
+/// Emit a store of the f64 already on the wasm stack into the module-relative
+/// slot `off` of `chunk_base`. `f64.store` wants `[addr_i32, value_f64]`, but
+/// the value is on top, so stash it in the scratch local, push the address,
+/// then reload the value.
+fn emit_assign(chunk_base: u32, off: u16, ctx: &EmitCtx, f: &mut Function) {
+    f.instruction(&Instruction::LocalSet(ctx.scratch_local));
+    push_module_relative_base(ctx, f);
+    f.instruction(&Instruction::LocalGet(ctx.scratch_local));
+    f.instruction(&Instruction::F64Store(memarg(slot_byte_offset(
+        chunk_base, off,
+    ))));
+}
+
+/// Lower a supported binary op. Operands are already on the wasm stack in push
+/// order `[l, r]`; the VM pops `r` then `l` and computes `l op r`, so the
+/// non-commutative wasm ops (`f64.sub`/`f64.div`) are already correct.
+/// Comparisons yield an i32 0/1 which is converted to f64 1.0/0.0 because
+/// downstream opcodes consume booleans as f64 (matching `eval_op2`).
+fn emit_op2(op: Op2, ctx: &EmitCtx, f: &mut Function) -> Result<(), WasmGenError> {
+    match op {
+        Op2::Add => {
+            f.instruction(&Instruction::F64Add);
+        }
+        Op2::Sub => {
+            f.instruction(&Instruction::F64Sub);
+        }
+        Op2::Mul => {
+            f.instruction(&Instruction::F64Mul);
+        }
+        Op2::Div => {
+            f.instruction(&Instruction::F64Div);
+        }
+        Op2::Gt => emit_cmp(f, &Instruction::F64Gt),
+        Op2::Gte => emit_cmp(f, &Instruction::F64Ge),
+        Op2::Lt => emit_cmp(f, &Instruction::F64Lt),
+        Op2::Lte => emit_cmp(f, &Instruction::F64Le),
+        // `Eq` is `approx_eq(l, r) as f64`: the operands `[l, r]` are already in
+        // call order, so `call approx_eq` then widen the i32 1/0 to f64.
+        Op2::Eq => {
+            emit_call_approx_eq(ctx, f);
+            f.instruction(&Instruction::F64ConvertI32U);
+        }
+        // `And`/`Or` are `(is_truthy(l) OP is_truthy(r)) as f64`.
+        Op2::And => emit_logical(ctx, f, Instruction::I32And),
+        Op2::Or => emit_logical(ctx, f, Instruction::I32Or),
+        // `Exp` is `l.powf(r)`: the operands `[l, r]` are already in call
+        // order, so `call pow` directly. Matches `powf` for a positive base
+        // (a negative base diverges -- see `super::math::emit_pow`).
+        Op2::Exp => {
+            f.instruction(&Instruction::Call(ctx.helpers.pow));
+        }
+        // `Mod` is `l.rem_euclid(r)` (result in [0, |r|)), routed through the
+        // `mod_euclid` helper (`[l, r]` already in call order).
+        Op2::Mod => {
+            f.instruction(&Instruction::Call(ctx.helpers.mod_euclid));
+        }
+    }
+    Ok(())
+}
+
+/// Lower `Op2::And`/`Op2::Or`: `(is_truthy(l) OP is_truthy(r)) as f64`, with
+/// `combine` the bitwise `i32.and`/`i32.or` that realizes `OP`.
+///
+/// The operands are on the stack as `[l, r]` (`r` on top), and the wasm operand
+/// stack is strict LIFO, so `l` cannot be reduced while `r` sits above it.
+/// Park `r` in the scratch f64 local (the same local `emit_assign` uses; it is
+/// free here and -- in the `BinOpAssign*` callers -- is overwritten by
+/// `emit_assign` before its next read), reduce `is_truthy(l)`, push `r` back and
+/// reduce `is_truthy(r)`, then combine. Each `is_truthy` yields an i32 that is
+/// exactly 0 or 1, so the bitwise `combine` equals the logical operator; and
+/// because `is_truthy` is pure and total, evaluating both operands is
+/// bit-identical to the VM's short-circuiting `&&`/`||`.
+fn emit_logical(ctx: &EmitCtx, f: &mut Function, combine: Instruction) {
+    // stack: [l, r] -> scratch = r; stack: [l]
+    f.instruction(&Instruction::LocalSet(ctx.scratch_local));
+    // is_truthy(l); stack: [t_l]
+    emit_is_truthy(ctx, f);
+    // bring r back; is_truthy(r); stack: [t_l, t_r]
+    f.instruction(&Instruction::LocalGet(ctx.scratch_local));
+    emit_is_truthy(ctx, f);
+    // combine and widen to f64 1.0/0.0
+    f.instruction(&combine);
+    f.instruction(&Instruction::F64ConvertI32U);
+}
+
+/// Emit an f64 comparison and convert its i32 result to the f64 0.0/1.0 the
+/// VM's `eval_op2` produces for comparisons.
+fn emit_cmp(f: &mut Function, cmp: &Instruction) {
+    f.instruction(cmp);
+    f.instruction(&Instruction::F64ConvertI32U);
+}
+
+/// Lower the `Apply { func }` opcode, mirroring the VM's `apply()`
+/// (`vm.rs:2938`). The three operands are on the wasm stack in push order
+/// `[a, b, c]` (`c` on top, matching the VM popping `c` then `b` then `a`);
+/// they are parked in the dedicated `ctx.apply_locals` so each builtin can read
+/// them any number of times in any order. The result is left on the stack.
+///
+/// `time`/`dt` for the time-driven builtins are read from `curr[TIME_OFF]` /
+/// `curr[DT_OFF]` (absolute global slots, like `LoadGlobalVar`), matching the
+/// VM's `time = curr[TIME_OFF]; dt = curr[DT_OFF]`.
+fn emit_apply(func: BuiltinId, ctx: &EmitCtx, f: &mut Function) {
+    use Instruction as Ins;
+    let [a, b, c] = ctx.apply_locals;
+
+    // Pop the three padded operands. The stack top is `c`, so set c, then b,
+    // then a (the VM pops in the same order).
+    f.instruction(&Ins::LocalSet(c));
+    f.instruction(&Ins::LocalSet(b));
+    f.instruction(&Ins::LocalSet(a));
+
+    let get = |f: &mut Function, l: u32| {
+        f.instruction(&Ins::LocalGet(l));
+    };
+
+    match func {
+        // ── Native f64 instructions on `a` ────────────────────────────────
+        BuiltinId::Abs => {
+            get(f, a);
+            f.instruction(&Ins::F64Abs);
+        }
+        BuiltinId::Sqrt => {
+            get(f, a);
+            f.instruction(&Ins::F64Sqrt);
+        }
+        // `Int = a.floor()` -- floor, NOT trunc (the VM's choice; they differ
+        // for negative arguments).
+        BuiltinId::Int => {
+            get(f, a);
+            f.instruction(&Ins::F64Floor);
+        }
+        // `Max`/`Min` use the wasm instructions per AC7.3. These differ from the
+        // VM's compare form (`if a>b {a} else {b}`) only on NaN/±0; if a corpus
+        // model ever surfaces such a divergence, switch the offending op to the
+        // compare-and-select form.
+        BuiltinId::Max => {
+            get(f, a);
+            get(f, b);
+            f.instruction(&Ins::F64Max);
+        }
+        BuiltinId::Min => {
+            get(f, a);
+            get(f, b);
+            f.instruction(&Ins::F64Min);
+        }
+
+        // ── Compare/arithmetic composed ───────────────────────────────────
+        // `Sign = if a>0 {1} else if a<0 {-1} else {0}`, i.e.
+        // `a>0 ? 1 : (a<0 ? -1 : 0)`, via two selects. wasm `select` yields its
+        // *deeper* operand when the condition is true, so the outer select is
+        // expressed with the inverted test `a<=0` (deeper = inner).
+        BuiltinId::Sign => {
+            // inner = select(-1.0, 0.0, a < 0)  ->  -1 if a<0 else 0
+            f.instruction(&f64_const(-1.0));
+            f.instruction(&f64_const(0.0));
+            get(f, a);
+            f.instruction(&f64_const(0.0));
+            f.instruction(&Ins::F64Lt);
+            f.instruction(&Ins::Select);
+            // result = select(inner, 1.0, a <= 0)  ->  inner if a<=0 else 1
+            f.instruction(&f64_const(1.0));
+            get(f, a);
+            f.instruction(&f64_const(0.0));
+            f.instruction(&Ins::F64Le);
+            f.instruction(&Ins::Select);
+        }
+        // `Quantum = if b==0.0 {a} else {(a/b).trunc()*b}` (exact `==`).
+        BuiltinId::Quantum => {
+            // select(a, (a/b).trunc()*b, b == 0.0)
+            get(f, a);
+            // (a/b).trunc() * b
+            get(f, a);
+            get(f, b);
+            f.instruction(&Ins::F64Div);
+            f.instruction(&Ins::F64Trunc);
+            get(f, b);
+            f.instruction(&Ins::F64Mul);
+            // cond: b == 0.0
+            get(f, b);
+            f.instruction(&f64_const(0.0));
+            f.instruction(&Ins::F64Eq);
+            f.instruction(&Ins::Select);
+        }
+        // `SafeDiv = if b != 0.0 {a/b} else {c}` (exact `!=`, NOT approx).
+        BuiltinId::SafeDiv => {
+            // select(a/b, c, b != 0.0)
+            get(f, a);
+            get(f, b);
+            f.instruction(&Ins::F64Div);
+            get(f, c);
+            get(f, b);
+            f.instruction(&f64_const(0.0));
+            f.instruction(&Ins::F64Ne);
+            f.instruction(&Ins::Select);
+        }
+        // `Sshape = b + (c-b)/(1.0 + exp(-4.0*(2.0*a-1.0)))`.
+        BuiltinId::Sshape => {
+            get(f, b);
+            // (c - b)
+            get(f, c);
+            get(f, b);
+            f.instruction(&Ins::F64Sub);
+            // denom = 1.0 + exp(-4.0 * (2.0*a - 1.0))
+            f.instruction(&f64_const(1.0));
+            // exp arg: -4.0 * (2.0*a - 1.0)
+            f.instruction(&f64_const(-4.0));
+            f.instruction(&f64_const(2.0));
+            get(f, a);
+            f.instruction(&Ins::F64Mul);
+            f.instruction(&f64_const(1.0));
+            f.instruction(&Ins::F64Sub);
+            f.instruction(&Ins::F64Mul);
+            f.instruction(&Ins::Call(ctx.helpers.exp));
+            f.instruction(&Ins::F64Add); // 1.0 + exp(..)
+            f.instruction(&Ins::F64Div); // (c-b) / denom
+            f.instruction(&Ins::F64Add); // b + ..
+        }
+
+        // ── Transcendentals on `a` (Task 2 helpers) ───────────────────────
+        BuiltinId::Exp => emit_call_unary(ctx.helpers.exp, a, ctx, f),
+        BuiltinId::Ln => emit_call_unary(ctx.helpers.ln, a, ctx, f),
+        BuiltinId::Log10 => emit_call_unary(ctx.helpers.log10, a, ctx, f),
+        BuiltinId::Sin => emit_call_unary(ctx.helpers.sin, a, ctx, f),
+        BuiltinId::Cos => emit_call_unary(ctx.helpers.cos, a, ctx, f),
+        BuiltinId::Tan => emit_call_unary(ctx.helpers.tan, a, ctx, f),
+        BuiltinId::Arcsin => emit_call_unary(ctx.helpers.asin, a, ctx, f),
+        BuiltinId::Arccos => emit_call_unary(ctx.helpers.acos, a, ctx, f),
+        BuiltinId::Arctan => emit_call_unary(ctx.helpers.atan, a, ctx, f),
+
+        // ── Time-driven ───────────────────────────────────────────────────
+        // `Step = step(time, dt, a, b) = if time + dt/2 > b {a} else {0.0}`.
+        BuiltinId::Step => {
+            // select(a, 0.0, time + dt/2 > b)
+            get(f, a);
+            f.instruction(&f64_const(0.0));
+            // time + dt/2.0
+            emit_load_global(ctx, f, TIME_OFF);
+            emit_load_global(ctx, f, DT_OFF);
+            f.instruction(&f64_const(2.0));
+            f.instruction(&Ins::F64Div);
+            f.instruction(&Ins::F64Add);
+            get(f, b);
+            f.instruction(&Ins::F64Gt);
+            f.instruction(&Ins::Select);
+        }
+        // `Ramp = ramp(time, slope=a, start=b, end=Some(c))`:
+        //   if time > b { if time >= c { a*(c-b) } else { a*(time-b) } } else 0.
+        // The Apply form always supplies an end time, so `end.is_some()` is true.
+        BuiltinId::Ramp => {
+            // done_value = a * (c - b)
+            get(f, a);
+            get(f, c);
+            get(f, b);
+            f.instruction(&Ins::F64Sub);
+            f.instruction(&Ins::F64Mul);
+            // ramping_value = a * (time - b)
+            get(f, a);
+            emit_load_global(ctx, f, TIME_OFF);
+            get(f, b);
+            f.instruction(&Ins::F64Sub);
+            f.instruction(&Ins::F64Mul);
+            // inner = select(done_value, ramping_value, time >= c)
+            emit_load_global(ctx, f, TIME_OFF);
+            get(f, c);
+            f.instruction(&Ins::F64Ge);
+            f.instruction(&Ins::Select);
+            // result = select(inner, 0.0, time > b)
+            f.instruction(&f64_const(0.0));
+            emit_load_global(ctx, f, TIME_OFF);
+            get(f, b);
+            f.instruction(&Ins::F64Gt);
+            f.instruction(&Ins::Select);
+        }
+        // `Pulse = pulse(time, dt, volume=a, first=b, interval=c)` (helper).
+        BuiltinId::Pulse => {
+            emit_load_global(ctx, f, TIME_OFF);
+            emit_load_global(ctx, f, DT_OFF);
+            get(f, a);
+            get(f, b);
+            get(f, c);
+            f.instruction(&Ins::Call(ctx.helpers.pulse));
+        }
+
+        // ── Constants ─────────────────────────────────────────────────────
+        BuiltinId::Inf => {
+            f.instruction(&f64_const(f64::INFINITY));
+        }
+        BuiltinId::Pi => {
+            f.instruction(&f64_const(std::f64::consts::PI));
+        }
+    }
+}
+
+/// Lower the `Lookup { base_gf, table_count, mode }` opcode, mirroring the VM's
+/// `Lookup` arm (`vm.rs:1710-1731`). The two operands are on the wasm stack as
+/// `[element_offset, index]` (`index` on top, matching the VM popping
+/// `lookup_index` then `element_offset`).
+///
+/// Bounds check: `element_offset < 0.0 || element_offset >= table_count as f64`
+/// pushes NaN (the VM's `*table_count as usize as f64` widens the compile-time
+/// `u16` count to f64). Otherwise the table index is
+/// `base_gf + (element_offset as i32)` (the VM's `as usize` truncation; the
+/// bounds check guarantees `0 <= element_offset < table_count`, so
+/// `i32.trunc_sat` is exact and non-negative); its `(data_off, count)` is read
+/// from the GF directory at `gf_directory_base + table_idx*8`, and the result
+/// comes from a static `call` to the mode's helper (the mode is known at
+/// compile time). The result is left on the stack.
+///
+/// `index`/`element_offset` are parked in [`scratch_local`](EmitCtx::scratch_local)
+/// and `apply_locals[0]` -- both free f64 scratch locals at an opcode boundary
+/// (nothing from a prior opcode is live there; `Lookup` and `Apply` never share
+/// a live operand within one opcode). The i32 directory address carries no
+/// dedicated local (the opcode-program function reserves none), so it is
+/// recomputed for the `count` read; the recompute is a handful of cheap integer
+/// ops.
+fn emit_lookup(
+    base_gf: GraphicalFunctionId,
+    table_count: u16,
+    mode: LookupMode,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) {
+    use Instruction as Ins;
+    use wasm_encoder::BlockType;
+
+    let index_local = ctx.scratch_local;
+    let elem_off_local = ctx.apply_locals[0];
+
+    // Pop the operands. `index` is on top, then `element_offset`.
+    f.instruction(&Ins::LocalSet(index_local));
+    f.instruction(&Ins::LocalSet(elem_off_local));
+
+    // bounds = (element_offset < 0.0) | (element_offset >= table_count as f64)
+    f.instruction(&Ins::LocalGet(elem_off_local));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::LocalGet(elem_off_local));
+    f.instruction(&f64_const(table_count as f64));
+    f.instruction(&Ins::F64Ge);
+    f.instruction(&Ins::I32Or);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    // out of range -> NaN
+    f.instruction(&f64_const(f64::NAN));
+    f.instruction(&Ins::Else);
+
+    let helper_idx = match mode {
+        LookupMode::Interpolate => ctx.helpers.lookup_interp,
+        LookupMode::Forward => ctx.helpers.lookup_forward,
+        LookupMode::Backward => ctx.helpers.lookup_backward,
+    };
+
+    // data_off = i32.load[dir_addr + 0]; count = i32.load[dir_addr + 4], where
+    // dir_addr = gf_directory_base + (base_gf + (element_offset as i32)) * 8.
+    push_gf_directory_addr(ctx, f, base_gf, elem_off_local);
+    f.instruction(&Ins::I32Load(i32_memarg(0)));
+    push_gf_directory_addr(ctx, f, base_gf, elem_off_local);
+    f.instruction(&Ins::I32Load(i32_memarg(4)));
+    // index, then call the mode's helper -> f64 result.
+    f.instruction(&Ins::LocalGet(index_local));
+    f.instruction(&Ins::Call(helper_idx));
+
+    f.instruction(&Ins::End); // end if
+}
+
+/// Push the byte address of table `base_gf + (element_offset as i32)`'s GF
+/// directory entry: `gf_directory_base + (base_gf + elem_off_i32) * 8`.
+/// `element_offset` is in `elem_off_local` (f64); `i32.trunc_sat_f64_s` matches
+/// the VM's `as usize` for the bounds-checked non-negative offset.
+fn push_gf_directory_addr(
+    ctx: &EmitCtx,
+    f: &mut Function,
+    base_gf: GraphicalFunctionId,
+    elem_off_local: u32,
+) {
+    use Instruction as Ins;
+    f.instruction(&Ins::I32Const(ctx.gf_directory_base as i32));
+    f.instruction(&Ins::I32Const(base_gf as i32));
+    f.instruction(&Ins::LocalGet(elem_off_local));
+    f.instruction(&Ins::I32TruncSatF64S);
+    f.instruction(&Ins::I32Add); // table_idx = base_gf + elem_off
+    f.instruction(&Ins::I32Const(GF_DIRECTORY_ENTRY_BYTES));
+    f.instruction(&Ins::I32Mul); // table_idx * 8
+    f.instruction(&Ins::I32Add); // gf_directory_base + table_idx*8
+}
+
+/// A 4-byte (i32) memory access with a static byte `offset` (for reading a GF
+/// directory entry's two i32 fields). The directory is 8-byte aligned, so a
+/// 4-byte access at offset 0 or 4 is naturally aligned.
+pub(crate) fn i32_memarg(offset: u64) -> MemArg {
+    MemArg {
+        offset,
+        align: 2, // log2(4): a 4-byte i32 access
+        memory_index: 0,
+    }
+}
+
+// ============================================================================
+// Array view stack + reducers (Phase 5 Tasks 1-2)
+// ============================================================================
+
+/// Borrow the top view descriptor, erroring (rather than panicking) on an empty
+/// stack -- malformed bytecode rather than a wrong module.
+fn view_top(view_stack: &[ViewDesc]) -> Result<&ViewDesc, WasmGenError> {
+    view_stack.last().ok_or_else(|| {
+        WasmGenError::Unsupported("wasmgen: view opcode on empty view stack".to_string())
+    })
+}
+
+/// Mutably borrow the top view descriptor for a transform opcode.
+fn view_top_mut(view_stack: &mut [ViewDesc]) -> Result<&mut ViewDesc, WasmGenError> {
+    view_stack.last_mut().ok_or_else(|| {
+        WasmGenError::Unsupported("wasmgen: view transform on empty view stack".to_string())
+    })
+}
+
+/// Resolve a dim-list id to `(dim sizes, dim ids)` for `PushVarView`/
+/// `PushTempView`: each entry is a `DimId`, and the size comes from
+/// `ctx.dimensions[DimId].size` (`vm.rs:1745`).
+fn resolve_dim_list_dims(
+    ctx: &EmitCtx,
+    dim_list_id: u16,
+) -> Result<(Vec<u16>, Vec<u16>), WasmGenError> {
+    let (n_dims, dim_ids) = ctx
+        .ctx
+        .dim_lists
+        .get(dim_list_id as usize)
+        .map(|(n, ids)| (*n as usize, *ids))
+        .ok_or_else(|| {
+            WasmGenError::Unsupported(format!("wasmgen: dim_list_id {dim_list_id} out of range"))
+        })?;
+    let mut dims = Vec::with_capacity(n_dims);
+    for &dim_id in dim_ids.iter().take(n_dims) {
+        let size = ctx
+            .ctx
+            .dimensions
+            .get(dim_id as usize)
+            .map(|d| d.size)
+            .ok_or_else(|| {
+                WasmGenError::Unsupported(format!("wasmgen: DimId {dim_id} out of range"))
+            })?;
+        dims.push(size);
+    }
+    let dim_id_vec = dim_ids[..n_dims].to_vec();
+    Ok((dims, dim_id_vec))
+}
+
+/// Resolve a dim-list id to its raw dimension sizes for `PushVarViewDirect`,
+/// where each entry is a literal dimension size, not a `DimId` (`vm.rs:1780`).
+/// The caller supplies the view's `dim_ids` itself (all zero -- this view is the
+/// base for a dynamic subscript, which does not broadcast), so only the sizes
+/// are returned here.
+fn resolve_dim_list_raw(ctx: &EmitCtx, dim_list_id: u16) -> Result<Vec<u16>, WasmGenError> {
+    let (n_dims, sizes) = ctx
+        .ctx
+        .dim_lists
+        .get(dim_list_id as usize)
+        .map(|(n, ids)| (*n as usize, *ids))
+        .ok_or_else(|| {
+            WasmGenError::Unsupported(format!("wasmgen: dim_list_id {dim_list_id} out of range"))
+        })?;
+    Ok(sizes[..n_dims].to_vec())
+}
+
+/// The absolute byte address of temp element `index` of temp `temp_id`:
+/// `temp_storage_base + (temp_offsets[temp_id] + index) * 8`.
+pub(crate) fn temp_element_byte_addr(
+    ctx: &EmitCtx,
+    temp_id: u8,
+    index: u32,
+) -> Result<u64, WasmGenError> {
+    let temp_off = *ctx.ctx.temp_offsets.get(temp_id as usize).ok_or_else(|| {
+        WasmGenError::Unsupported(format!("wasmgen: temp id {temp_id} out of range"))
+    })? as u64;
+    Ok(u64::from(ctx.temp_storage_base) + (temp_off + u64::from(index)) * u64::from(SLOT_SIZE))
+}
+
+/// Emit the wasm analogue of the VM's `fill_temp_nan` (`vm.rs:2866-2881`): store
+/// IEEE `f64::NAN` (NOT the finite `crate::float::NA` sentinel) into every slot
+/// of temp `temp_id`'s region, `temp_storage[temp_offsets[temp_id] ..
+/// temp_offsets[temp_id + 1]]` (or `.. temp_total_size` for the last temp).
+///
+/// The span is compile-time-known and small (one temp's worth of slots), so the
+/// stores are unrolled. Used by the Phase-6 vector ops (`super::vector`) for the
+/// invalid-input-view branch.
+pub(crate) fn emit_fill_temp_nan(
+    ctx: &EmitCtx,
+    temp_id: u8,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let idx = temp_id as usize;
+    let start = *ctx.ctx.temp_offsets.get(idx).ok_or_else(|| {
+        WasmGenError::Unsupported(format!("wasmgen: temp id {temp_id} out of range"))
+    })?;
+    let end = ctx
+        .ctx
+        .temp_offsets
+        .get(idx + 1)
+        .copied()
+        .unwrap_or(ctx.ctx.temp_total_size);
+    for slot in start..end {
+        // f64.store wants [addr_i32, value_f64]; the per-slot byte offset rides
+        // in the constant memarg, so the dynamic address is a constant 0.
+        f.instruction(&Instruction::I32Const(0));
+        f.instruction(&f64_const(f64::NAN));
+        f.instruction(&Instruction::F64Store(memarg(
+            u64::from(ctx.temp_storage_base) + (slot as u64) * u64::from(SLOT_SIZE),
+        )));
+    }
+    Ok(())
+}
+
+/// Lower `LoadTempDynamic { temp_id }`: pop a runtime index (the VM does
+/// `stack.pop().floor() as usize`), compute the temp element address, and load.
+///
+/// The address is `temp_storage_base + temp_offsets[temp_id]*8 + index*8`; the
+/// constant base/offset ride in the `memarg.offset`, so only `index*8` is
+/// computed at runtime. `i32.trunc_sat_f64_s` of `floor(index)` reproduces the
+/// VM's `floor() as usize` for a non-negative in-range index.
+fn emit_load_temp_dynamic(
+    ctx: &EmitCtx,
+    temp_id: u8,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    use Instruction as Ins;
+    let base = temp_element_byte_addr(ctx, temp_id, 0)?;
+    // index (f64, on top) -> floor -> i32 -> *8 (byte stride)
+    f.instruction(&Ins::F64Floor);
+    f.instruction(&Ins::I32TruncSatF64S);
+    f.instruction(&Ins::I32Const(SLOT_SIZE as i32));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::F64Load(memarg(base)));
+    Ok(())
+}
+
+/// Push the f64 value of view element `iter_idx` onto the wasm stack, reading
+/// from the byte address [`ViewDesc::element_addr`] computes. This is the single
+/// element-read primitive the reducers (Task 2) and -- for static/temp/var
+/// views -- the iteration loop (Task 3) build on.
+///
+/// The constant part of the address rides in the `memarg.offset`; the dynamic
+/// part of the wasm address is `module_off * 8` for a module-relative view (0 in
+/// the current single-root scope, but emitted for Phase 7 generality) and a bare
+/// `0` otherwise. A dynamically-subscripted view (Task 4) returns `Unsupported`
+/// here.
+///
+/// Landed with the view machinery (Task 1) as the single element-read primitive;
+/// its first consumer is the array reducer (Task 2), the iteration loop (Task 3)
+/// and the Phase-6 vector ops (`super::vector`).
+pub(crate) fn emit_view_element_load(
+    desc: &ViewDesc,
+    iter_idx: usize,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let addr = desc
+        .element_addr(iter_idx, ctx.curr_base, ctx.temp_storage_base, ctx.ctx)
+        .ok_or_else(bad_temp_view)?;
+    emit_addr_load(addr, ctx, f);
+    Ok(())
+}
+
+/// Push the f64 value of the view element at an *already-computed* flat slot
+/// offset (the broadcast paths -- `LoadIterViewTop` / `LoadBroadcastElement` --
+/// build the flat offset themselves rather than from an iteration index).
+fn emit_view_offset_load(
+    desc: &ViewDesc,
+    flat: usize,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let addr = desc
+        .element_addr_for_flat(flat, ctx.curr_base, ctx.temp_storage_base, ctx.ctx)
+        .ok_or_else(bad_temp_view)?;
+    emit_addr_load(addr, ctx, f);
+    Ok(())
+}
+
+/// Emit the f64 load for a resolved [`ElementAddr`]: the constant part rides in
+/// the `memarg.offset`; the dynamic part is `module_off * 8` for a module-
+/// relative view plus, for a dynamically-subscripted view (Task 4), the
+/// `runtime_off_local * 8` runtime addend (matching the VM's
+/// `curr[module_off + base_off + flat + dynamic]`). When the view carries a
+/// validity flag (`valid_local`), the whole load is wrapped in a guard that
+/// yields NaN when the flag is 0 -- the VM's out-of-bounds-subscript NaN.
+fn emit_addr_load(addr: ElementAddr, ctx: &EmitCtx, f: &mut Function) {
+    use Instruction as Ins;
+    use wasm_encoder::BlockType;
+
+    // Validity gate (dynamic subscript only): `if valid == 0 { NaN } else <load>`.
+    if let Some(valid_local) = addr.valid_local {
+        f.instruction(&Ins::LocalGet(valid_local));
+        f.instruction(&Ins::I32Eqz);
+        f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+        f.instruction(&f64_const(f64::NAN));
+        f.instruction(&Ins::Else);
+        emit_addr_load_unguarded(addr, ctx, f);
+        f.instruction(&Ins::End);
+    } else {
+        emit_addr_load_unguarded(addr, ctx, f);
+    }
+}
+
+/// The bare load half of [`emit_addr_load`] (no validity guard): push the
+/// dynamic address part, then `f64.load` with the constant `memarg.offset`. The
+/// dynamic part sums `module_off * 8` (module-relative views) and
+/// `runtime_off_local * 8` (a dynamic subscript's accumulated offset); if
+/// neither is present it is a bare `0`.
+fn emit_addr_load_unguarded(addr: ElementAddr, ctx: &EmitCtx, f: &mut Function) {
+    use Instruction as Ins;
+    let mut pushed = false;
+    if addr.module_relative {
+        push_module_relative_base(ctx, f);
+        pushed = true;
+    }
+    if let Some(off_local) = addr.runtime_off_local {
+        // runtime_off_local is a slot offset; convert to bytes.
+        f.instruction(&Ins::LocalGet(off_local));
+        f.instruction(&Ins::I32Const(SLOT_SIZE as i32));
+        f.instruction(&Ins::I32Mul);
+        if pushed {
+            f.instruction(&Ins::I32Add);
+        }
+        pushed = true;
+    }
+    if !pushed {
+        f.instruction(&Ins::I32Const(0));
+    }
+    f.instruction(&Ins::F64Load(memarg(addr.const_byte_offset)));
+}
+
+/// The `Unsupported` error for a temp-backed view whose `base_off` is not a
+/// valid temp id (`temp_offsets[base_off]` out of range) -- malformed bytecode
+/// rather than a wrong module.
+fn bad_temp_view() -> WasmGenError {
+    WasmGenError::Unsupported(
+        "wasmgen: array element read references an out-of-range temp id".to_string(),
+    )
+}
+
+/// Lower one array reducer over the top `ViewDesc` (the descriptor stays on the
+/// stack; the production pattern is `PushStaticView; Array<Reduce>; PopView`).
+///
+/// Reproduces `reduce_view` (`vm.rs:2802-2840`) and the per-reducer arms
+/// (`vm.rs:2216-2309`) exactly, including the asymmetry:
+/// - an **invalid** view (`valid_local` present and 0) yields NaN for *every*
+///   reducer, including `ArraySum` (`reduce_view`'s `if !is_valid { NaN }`);
+/// - an **empty-but-valid** view (`size() == 0`) yields `0.0` for `ArraySum`,
+///   `NaN` for Max/Min/Mean/Stddev, and `0` for `ArraySize`.
+///
+/// The fold is fully unrolled over the compile-time `size()`: reducer arrays are
+/// small, and unrolling reads each element at its compile-time-known address via
+/// [`emit_view_element_load`], so no runtime loop or precomputed offset table is
+/// needed for the static/temp views the reducer path produces. `ArrayMax`/
+/// `ArrayMin` use the VM's compare-and-select form (`if v > acc { v } else
+/// { acc }`), not `f64.max`/`f64.min`, matching the reduce path (AC7.3).
+fn emit_array_reduce(
+    op: &Opcode,
+    desc: &ViewDesc,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    use Instruction as Ins;
+
+    // ArraySize is always defined (the size of the view), independent of
+    // validity, and needs no element reads. The VM pushes `view.size() as f64`
+    // unconditionally (`vm.rs:2306`).
+    if matches!(op, Opcode::ArraySize {}) {
+        f.instruction(&f64_const(desc.size() as f64));
+        return Ok(());
+    }
+
+    let size = desc.size();
+    let is_sum = matches!(op, Opcode::ArraySum {});
+
+    // The empty-but-valid result, before accounting for an invalid view: 0.0 for
+    // Sum, NaN for the others.
+    let empty_result = if is_sum { 0.0 } else { f64::NAN };
+
+    if size == 0 {
+        // No element reads. For a static view (always valid) this is the final
+        // answer; a dynamic view's validity is folded in below.
+        f.instruction(&f64_const(empty_result));
+    } else {
+        emit_reduce_fold(op, desc, size, ctx, f)?;
+    }
+
+    // An invalid view (Task 4 dynamic subscript out of bounds) overrides the
+    // computed value with NaN for ALL reducers, mirroring `reduce_view`'s
+    // leading `if !is_valid { return NaN }`. For static views `valid_local` is
+    // `None`, so this is a no-op and the static result stands.
+    if let Some(valid_local) = desc.valid_local {
+        // Build `select(NaN, computed, valid == 0)`. wasm `select` pops
+        // `[a, b, cond]` and yields `a` when `cond != 0`, so `a` must be NaN and
+        // `b` the computed value. The computed value is currently on top, so
+        // park it (the fold has released `scratch_local` by now), push NaN, push
+        // the parked value, then `cond = (valid == 0)`.
+        f.instruction(&Ins::LocalSet(ctx.scratch_local));
+        f.instruction(&f64_const(f64::NAN)); // a = NaN
+        f.instruction(&Ins::LocalGet(ctx.scratch_local)); // b = computed
+        f.instruction(&Ins::LocalGet(valid_local));
+        f.instruction(&Ins::I32Eqz); // cond = 1 when invalid
+        f.instruction(&Ins::Select);
+    }
+
+    Ok(())
+}
+
+/// Emit the unrolled fold body for a non-empty reducer (size >= 1). Leaves the
+/// reduced f64 on the wasm stack. Split out so [`emit_array_reduce`] reads
+/// linearly.
+fn emit_reduce_fold(
+    op: &Opcode,
+    desc: &ViewDesc,
+    size: usize,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    use Instruction as Ins;
+    match op {
+        // Sum / Mean / Stddev all begin with the running sum over the elements.
+        Opcode::ArraySum {} | Opcode::ArrayMean {} | Opcode::ArrayStddev {} => {
+            // sum = e0 + e1 + ... (init 0.0, matching reduce_view's `0.0` init).
+            f.instruction(&f64_const(0.0));
+            for i in 0..size {
+                emit_view_element_load(desc, i, ctx, f)?;
+                f.instruction(&Ins::F64Add);
+            }
+            match op {
+                Opcode::ArraySum {} => {}
+                Opcode::ArrayMean {} => {
+                    // mean = sum / size (size > 0 here).
+                    f.instruction(&f64_const(size as f64));
+                    f.instruction(&Ins::F64Div);
+                }
+                Opcode::ArrayStddev {} => {
+                    // Two-pass population variance: mean = sum/size (computed
+                    // above and on the stack), then variance = mean of
+                    // (v - mean)^2, then sqrt. Park the mean so each squared
+                    // deviation can reference it.
+                    f.instruction(&f64_const(size as f64));
+                    f.instruction(&Ins::F64Div);
+                    f.instruction(&Ins::LocalSet(ctx.scratch_local)); // scratch = mean
+                    // variance_sum = Σ (v - mean)^2
+                    f.instruction(&f64_const(0.0));
+                    for i in 0..size {
+                        emit_view_element_load(desc, i, ctx, f)?;
+                        f.instruction(&Ins::LocalGet(ctx.scratch_local));
+                        f.instruction(&Ins::F64Sub); // v - mean
+                        // (v - mean)^2 via self-multiply. This equals `x * x` on
+                        // the host libm and agrees with the VM's `.powf(2.0)`
+                        // within floating-point tolerance regardless (`f64::powf`
+                        // is libm-dependent, so the two are not guaranteed
+                        // bit-identical on every platform).
+                        f.instruction(&Ins::LocalTee(ctx.apply_locals[0]));
+                        f.instruction(&Ins::LocalGet(ctx.apply_locals[0]));
+                        f.instruction(&Ins::F64Mul);
+                        f.instruction(&Ins::F64Add);
+                    }
+                    // stddev = sqrt(variance_sum / size)
+                    f.instruction(&f64_const(size as f64));
+                    f.instruction(&Ins::F64Div);
+                    f.instruction(&Ins::F64Sqrt);
+                }
+                _ => unreachable!(),
+            }
+        }
+        // Max / Min: fold with the VM's compare-and-select (`if v > acc { v }
+        // else { acc }`), init NEG_INFINITY / INFINITY (`vm.rs:2228`/`2245`).
+        Opcode::ArrayMax {} | Opcode::ArrayMin {} => {
+            let init = if matches!(op, Opcode::ArrayMax {}) {
+                f64::NEG_INFINITY
+            } else {
+                f64::INFINITY
+            };
+            f.instruction(&f64_const(init)); // acc
+            for i in 0..size {
+                // stack: [acc]; load v -> [acc, v]; select(v, acc, cmp).
+                emit_view_element_load(desc, i, ctx, f)?;
+                // Compute the comparison then select. wasm `select` pops
+                // [a, b, cond] and yields a when cond != 0. We want
+                // `if v <cmp> acc { v } else { acc }`, so push v then acc and
+                // test `v <cmp> acc`. Park acc/v in scratch f64 locals so they
+                // can be reused for both the select operands and the compare.
+                f.instruction(&Ins::LocalSet(ctx.apply_locals[1])); // b1 = v
+                f.instruction(&Ins::LocalSet(ctx.apply_locals[0])); // b0 = acc
+                f.instruction(&Ins::LocalGet(ctx.apply_locals[1])); // v   (select arg a)
+                f.instruction(&Ins::LocalGet(ctx.apply_locals[0])); // acc (select arg b)
+                f.instruction(&Ins::LocalGet(ctx.apply_locals[1])); // v
+                f.instruction(&Ins::LocalGet(ctx.apply_locals[0])); // acc
+                if matches!(op, Opcode::ArrayMax {}) {
+                    f.instruction(&Ins::F64Gt); // v > acc
+                } else {
+                    f.instruction(&Ins::F64Lt); // v < acc
+                }
+                f.instruction(&Ins::Select); // v if (cmp) else acc -> new acc
+            }
+        }
+        _ => unreachable!("emit_reduce_fold called with non-reducer opcode"),
+    }
+    Ok(())
+}
+
+/// Push `helper(local)` for a unary `(f64) -> f64` helper: load the f64 local,
+/// then `call`.
+fn emit_call_unary(helper_idx: u32, src: u32, _ctx: &EmitCtx, f: &mut Function) {
+    f.instruction(&Instruction::LocalGet(src));
+    f.instruction(&Instruction::Call(helper_idx));
+}
+
+/// Push the absolute (module-independent) global slot `off` from `curr`,
+/// matching `LoadGlobalVar` (slots 0..4 are reserved globals: TIME/DT/...).
+fn emit_load_global(ctx: &EmitCtx, f: &mut Function, off: u16) {
+    f.instruction(&Instruction::I32Const(0));
+    f.instruction(&Instruction::F64Load(memarg(slot_byte_offset(
+        ctx.curr_base,
+        off,
+    ))));
+}
+
+/// Lower `LoadPrev { off }`, mirroring the VM (`vm.rs:1320-1328`). A fallback
+/// f64 is already on the wasm stack (codegen pushes it immediately before this
+/// opcode). Park it in the scratch local, then build `select(fallback,
+/// prev_values[module_off+off], use_prev_fallback)`: wasm `select` yields its
+/// *deeper* operand when the condition is non-zero, so pushing
+/// `[fallback, prev_value, use_prev_fallback]` yields the fallback while the
+/// flag is set and the snapshot value once it is cleared.
+fn emit_load_prev(off: u16, ctx: &EmitCtx, f: &mut Function) {
+    use Instruction as Ins;
+    // Park the fallback (top of stack) so the module-relative prev_values
+    // address can be pushed beneath it.
+    f.instruction(&Ins::LocalSet(ctx.scratch_local));
+    f.instruction(&Ins::LocalGet(ctx.scratch_local)); // [fallback]
+    // prev_values[module_off + off]
+    push_module_relative_base(ctx, f);
+    f.instruction(&Ins::F64Load(memarg(slot_byte_offset(
+        ctx.prev_values_base,
+        off,
+    )))); // [fallback, prev_value]
+    f.instruction(&Ins::GlobalGet(ctx.use_prev_fallback_global)); // [fallback, prev_value, cond]
+    f.instruction(&Ins::Select);
+}
+
+/// Lower `LoadInitial { off }`, mirroring the VM (`vm.rs:1332-1340`) with the
+/// `part == Initials` branch resolved at compile time from `ctx.step_part`. In
+/// the initials program the snapshot is not yet taken, so read
+/// `curr[module_off+off]` (the value being computed); in the flows/stocks
+/// programs read the post-initials `initial_values[module_off+off]` snapshot.
+fn emit_load_initial(off: u16, ctx: &EmitCtx, f: &mut Function) {
+    let chunk_base = if ctx.step_part == StepPart::Initials {
+        ctx.curr_base
+    } else {
+        ctx.initial_values_base
+    };
+    push_module_relative_base(ctx, f);
+    f.instruction(&Instruction::F64Load(memarg(slot_byte_offset(
+        chunk_base, off,
+    ))));
+}
+
+/// Lower `EvalModule { id, n_inputs }`, mirroring the VM (`vm.rs:1379-1443`).
+///
+/// The `n_inputs` operands are already on the wasm stack (the parent's bytecode
+/// pushed them, top = the last input). The VM pops them in reverse into
+/// `module_inputs[j]` (`for j in (0..n_inputs).rev()`), computes
+/// `child_module_off = module_off + context.modules[id].off`, then evaluates the
+/// child for the current `part`. Here:
+///   1. pop the operands in reverse into the function's `module_input_scratch`
+///      f64 locals (`scratch[j]` for `j` from `n_inputs-1` down to 0), so
+///      `scratch[j]` holds input `j` -- identical to the VM's `module_inputs[j]`;
+///   2. resolve the child instance's function index for `ctx.step_part` (the
+///      `EvalModule` site in the initials/flows/stocks program calls the child's
+///      initials/flows/stocks function -- the `StepPart` is compile-time per
+///      program; the instantiation graph is acyclic, so the index already exists);
+///   3. push `child_module_off` (`module_off + decl.off`) then `scratch[0..k]` in
+///      order -- the child's `(module_off, in_0, .., in_{k-1})` argument list --
+///      and `call` it. The child reads/writes the shared slab at `module_off +
+///      off`, so threading the runtime `child_module_off` is what lets one
+///      `CompiledModule` run at several base offsets.
+fn emit_eval_module(
+    id: u16,
+    n_inputs: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let decl = ctx.ctx.modules.get(id as usize).ok_or_else(|| {
+        WasmGenError::Unsupported(format!("wasmgen: EvalModule module id {id} out of range"))
+    })?;
+    let child_key = make_module_key(&decl.model_name, &decl.input_set);
+    let &fn_index = ctx
+        .module_fn_index
+        .get(&(child_key, ctx.step_part))
+        .ok_or_else(|| {
+            WasmGenError::Unsupported(format!(
+                "wasmgen: EvalModule child instance for module id {id} has no compiled function"
+            ))
+        })?;
+    let decl_off = i32::try_from(decl.off).map_err(|_| {
+        WasmGenError::Unsupported("wasmgen: module offset too large to lower".to_string())
+    })?;
+
+    // Pop the operands in reverse into the reverse-pop scratch f64 locals, so
+    // `scratch[j]` ends holding input `j` (exactly the VM's `module_inputs[j]`).
+    let n = u32::from(n_inputs);
+    for j in (0..n).rev() {
+        f.instruction(&Instruction::LocalSet(ctx.module_input_scratch_base + j));
+    }
+    // Push `child_module_off = module_off + decl.off`.
+    f.instruction(&Instruction::LocalGet(ctx.module_off_local));
+    f.instruction(&Instruction::I32Const(decl_off));
+    f.instruction(&Instruction::I32Add);
+    // Push the inputs back in order, then call the child's `part` function.
+    for j in 0..n {
+        f.instruction(&Instruction::LocalGet(ctx.module_input_scratch_base + j));
+    }
+    f.instruction(&Instruction::Call(fn_index));
+    Ok(())
+}
+
+/// Name an unsupported opcode without depending on `Debug` (feature-gated via
+/// `debug-derive`).
+fn unsupported_opcode(op: &Opcode) -> String {
+    let name = match op {
+        Opcode::PushSubscriptIndex { .. } => "PushSubscriptIndex",
+        Opcode::LoadSubscript { .. } => "LoadSubscript",
+        Opcode::LoadModuleInput { .. } => "LoadModuleInput",
+        Opcode::EvalModule { .. } => "EvalModule",
+        Opcode::Apply { .. } => "Apply",
+        Opcode::Lookup { .. } => "Lookup",
+        // Fused / superinstruction / array opcodes never reach a
+        // CompiledSimulation consumer, but name them defensively.
+        _ => "opcode",
+    };
+    format!("wasmgen: unsupported Opcode::{name}")
+}
+
+#[cfg(test)]
+#[path = "lower_tests.rs"]
+mod tests;
diff --git a/src/simlin-engine/src/wasmgen/lower_tests.rs b/src/simlin-engine/src/wasmgen/lower_tests.rs
new file mode 100644
index 000000000..f671d694d
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/lower_tests.rs
@@ -0,0 +1,5355 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+//! Tests for the bytecode-to-WebAssembly lowering ([`super`]). Split out of
+//! `lower.rs` to keep that file under the project line-count lint; this is the
+//! `#[cfg(test)] mod tests` body, included via `#[path]` so `use super::*`
+//! still resolves the lowering module's private items.
+
+use super::*;
+use checked::Store;
+use wasm::validate;
+use wasm_encoder::{
+    CodeSection, ExportKind, ExportSection, FunctionSection, MemorySection, MemoryType, Module,
+    TypeSection, ValType,
+};
+
+use crate::bytecode::ByteCodeContext;
+use std::sync::OnceLock;
+
+/// Local layout for the test harness function. The function takes `module_off`
+/// as param 0 (no f64 module-input params -- these are root-only lowering tests);
+/// the scratch f64 and condition i32(s) are declared locals, whose indices come
+/// from the production helpers (`scratch_local_for` / `condition_locals_for`).
+const L_MODULE_OFF: u32 = 0;
+
+/// A shared empty `ByteCodeContext` for the scalar-opcode tests, which never
+/// touch the array tables. Array-view tests build their own context (with
+/// `static_views`/`temp_offsets`) and an `EmitCtx` borrowing it locally.
+fn empty_ctx() -> &'static ByteCodeContext {
+    static EMPTY: OnceLock<ByteCodeContext> = OnceLock::new();
+    EMPTY.get_or_init(ByteCodeContext::default)
+}
+
+/// A shared empty `(ModuleKey, StepPart) -> fn index` map. These lowering unit
+/// tests build single root-only functions (0 module inputs) and never emit an
+/// `EvalModule`, so the map is never consulted. The whole-model `EvalModule` /
+/// `LoadModuleInput` parity is exercised end-to-end in `module.rs`'s tests.
+fn empty_module_fn_index()
+-> &'static std::collections::HashMap<(crate::vm::ModuleKey, StepPart), u32> {
+    static EMPTY: OnceLock<std::collections::HashMap<(crate::vm::ModuleKey, StepPart), u32>> =
+        OnceLock::new();
+    EMPTY.get_or_init(std::collections::HashMap::new)
+}
+
+/// A shared empty set of overridable constant offsets. These lowering unit
+/// tests never exercise an `AssignConstCurr` redirect (the set is empty, so
+/// every `AssignConstCurr` emits its immediate literal -- the pre-Task-2
+/// behavior these tests pin); the constants-region redirect is exercised
+/// end-to-end in `module.rs`'s `set_value`/`reset` tests.
+fn empty_const_offsets() -> &'static std::collections::HashSet<u16> {
+    static EMPTY: OnceLock<std::collections::HashSet<u16>> = OnceLock::new();
+    EMPTY.get_or_init(std::collections::HashSet::new)
+}
+
+fn ctx_with_cond_depth(depth: usize) -> EmitCtx<'static> {
+    // These tests build a root-only function: `module_off` is param 0, there are
+    // no f64 module-input params, so `n_inputs == 0` reproduces the historical
+    // (pre-Phase-7) local indices exactly (scratch at 1, conditions at 2..).
+    let n_inputs = 0;
+    EmitCtx {
+        curr_base: 0,
+        next_base: 4096,
+        // The non-Lookup opcode tests place no GF regions; these bases are
+        // unused by the opcodes they exercise. The Lookup-opcode tests
+        // (which do read these) build their own ctx with real GF bases.
+        gf_directory_base: 0,
+        gf_data_base: 0,
+        // The PREVIOUS/INIT opcode tests build their own ctx with real
+        // snapshot bases + flag; the rest never touch these fields.
+        initial_values_base: 0,
+        prev_values_base: 0,
+        use_prev_fallback_global: 0,
+        step_part: StepPart::Flows,
+        dt: 0.5,
+        start_time: 1.0,
+        final_time: 25.0,
+        module_off_local: L_MODULE_OFF,
+        scratch_local: scratch_local_for(n_inputs),
+        condition_locals: condition_locals_for(n_inputs, depth),
+        apply_locals: apply_locals_for(n_inputs, depth),
+        // The helper-function indices are deterministic (helpers occupy the
+        // module's first function slots), and `build_module` emits exactly
+        // these helper bodies ahead of `eval`, so the indices agree.
+        helpers: build_helpers().fns,
+        // The scalar-opcode tests place no temp region; the array-view tests
+        // build their own ctx with a real temp base + context.
+        temp_storage_base: 0,
+        // Dynamic-subscript scratch i32 locals (Task 4) follow the scratch
+        // f64 / condition i32s / Apply f64s / the vector-op scratch blocks;
+        // `build_module` declares exactly `count_extra_i32_locals(bc)` of them
+        // at this base.
+        extra_i32_local_base: extra_i32_local_base(n_inputs, depth),
+        // The fixed Phase-6 vector-op scratch local blocks.
+        vector_f64_locals: vector_f64_locals_for(n_inputs, depth),
+        vector_i32_locals: vector_i32_locals_for(n_inputs, depth),
+        // The vector-op scratch region: well past TEMP_BASE (8192) but within
+        // the harness's single 64 KiB memory page, so the small test views'
+        // sort-pair / collected-value staging never collides with temp_storage.
+        vector_scratch_base: VECTOR_SCRATCH_BASE,
+        // The allocation scratch region: a separate high band, past the vector
+        // scratch and clear of temp_storage, sized for the tiny test views'
+        // request/profile/out staging.
+        alloc_scratch_base: ALLOC_SCRATCH_BASE,
+        // No `EvalModule` in these single-function tests: the reverse-pop scratch
+        // base sits past the extra-i32 block (none declared here), and the child
+        // function map is empty.
+        module_input_scratch_base: module_input_scratch_base(n_inputs, depth, 0),
+        // No overridable constants in these single-function tests: the constants
+        // region is unused and the offset set is empty (every `AssignConstCurr`
+        // emits its immediate literal -- the pre-Task-2 behavior). The Task-2
+        // override redirect is exercised end-to-end in `module.rs`'s tests.
+        const_region_base: 0,
+        flows_const_offsets: empty_const_offsets(),
+        module_fn_index: empty_module_fn_index(),
+        ctx: empty_ctx(),
+    }
+}
+
+/// Byte offset of the vector-op scratch region for the test harness. Past
+/// `TEMP_BASE` (8192) and any small test temp region, with ~6000 f64 slots of
+/// headroom before the 64 KiB page end -- ample for the tiny test views.
+const VECTOR_SCRATCH_BASE: u32 = 16384;
+
+/// Byte offset of the allocation scratch region for the test harness. A high
+/// band (~40 KiB) past `VECTOR_SCRATCH_BASE`, leaving room for both regions'
+/// tiny test stagings within the single 64 KiB page.
+const ALLOC_SCRATCH_BASE: u32 = 40960;
+
+fn bc(literals: Vec<f64>, code: Vec<Opcode>) -> ByteCode {
+    ByteCode { literals, code }
+}
+
+/// Build a module exporting `mem` and an `eval(module_off: i32)` function
+/// whose body is the lowered `bc`. When `with_result`, `eval` returns the
+/// f64 left on the stack. The function declares one scratch f64 local plus
+/// `cond_depth` i32 condition locals.
+///
+/// Mirrors `module.rs`'s production assembly: the emitted helper functions
+/// ([`build_helpers`]) occupy function indices `0..N` so the `call`s
+/// `emit_bytecode` generates resolve, and `eval` follows at index `N`.
+fn build_module(bc: &ByteCode, ctx: &EmitCtx, with_result: bool, cond_depth: usize) -> Vec<u8> {
+    let mut module = Module::new();
+
+    let helpers = build_helpers();
+    let n_helpers = helpers.functions.len() as u32;
+
+    // Type 0 is `eval`'s signature; each helper's signature follows.
+    let mut types = TypeSection::new();
+    if with_result {
+        types.ty().function([ValType::I32], [ValType::F64]);
+    } else {
+        types.ty().function([ValType::I32], []);
+    }
+    for hf in &helpers.functions {
+        types.ty().function(hf.params.clone(), hf.results.clone());
+    }
+    module.section(&types);
+
+    // Function indices follow declaration order: helpers first (0..N), then
+    // `eval` at N. Helper type indices are 1..=N (eval's type is 0).
+    let mut functions = FunctionSection::new();
+    for (i, _) in helpers.functions.iter().enumerate() {
+        functions.function(1 + i as u32);
+    }
+    functions.function(0);
+    module.section(&functions);
+
+    let mut memories = MemorySection::new();
+    memories.memory(MemoryType {
+        minimum: 1,
+        maximum: None,
+        memory64: false,
+        shared: false,
+        page_size_log2: None,
+    });
+    module.section(&memories);
+
+    let mut exports = ExportSection::new();
+    exports.export("eval", ExportKind::Func, n_helpers);
+    exports.export("mem", ExportKind::Memory, 0);
+    module.section(&exports);
+
+    let mut code = CodeSection::new();
+    for hf in helpers.functions {
+        code.function(&hf.body);
+    }
+    // 1 scratch f64 local, `cond_depth` i32 condition locals, the 3 `Apply`
+    // scratch f64 locals, the program's dynamic-subscript i32 scratch locals,
+    // and the `EvalModule` reverse-pop f64 scratch (none here -- root-only) --
+    // the same layout production uses.
+    let mut func = Function::new(opcode_fn_locals(
+        cond_depth,
+        count_extra_i32_locals(bc),
+        count_module_input_scratch(bc),
+    ));
+    emit_bytecode(bc, ctx, &mut func).expect("lowering should succeed");
+    func.instruction(&Instruction::End);
+    code.function(&func);
+    module.section(&code);
+
+    module.finish()
+}
+
+/// Emit, validate, instantiate, seed `curr`/`next` slots, run `eval(0)`,
+/// and either return its f64 result (`read_addr == None`) or the f64 at
+/// `read_addr`.
+fn run(
+    bc: &ByteCode,
+    ctx: &EmitCtx,
+    with_result: bool,
+    cond_depth: usize,
+    seed: &[(u64, f64)],
+    read_addr: Option<u64>,
+) -> f64 {
+    let bytes = build_module(bc, ctx, with_result, cond_depth);
+    let info = validate(&bytes).expect("emitted module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("emitted module must instantiate")
+        .module_addr;
+
+    if !seed.is_empty() {
+        let mem = store
+            .instance_export(module, "mem")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        store.mem_access_mut_slice(mem, |bytes| {
+            for &(addr, v) in seed {
+                let a = addr as usize;
+                bytes[a..a + 8].copy_from_slice(&v.to_le_bytes());
+            }
+        });
+    }
+
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+
+    match read_addr {
+        None => store
+            .invoke_simple_typed(eval, (0_i32,))
+            .expect("invocation must succeed"),
+        Some(addr) => {
+            store
+                .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+                .expect("invocation must succeed");
+            let mem = store
+                .instance_export(module, "mem")
+                .unwrap()
+                .as_mem()
+                .unwrap();
+            store.mem_access_mut_slice(mem, |bytes| {
+                let a = addr as usize;
+                f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
+            })
+        }
+    }
+}
+
+/// Evaluate a value program (with a 0-depth condition stack) and return its
+/// result.
+fn value(code: Vec<Opcode>, literals: Vec<f64>, seed: &[(u64, f64)]) -> f64 {
+    run(
+        &bc(literals, code),
+        &ctx_with_cond_depth(0),
+        true,
+        0,
+        seed,
+        None,
+    )
+}
+
+/// Run an assignment program and read back the stored slot.
+fn stored(code: Vec<Opcode>, literals: Vec<f64>, seed: &[(u64, f64)], read_addr: u64) -> f64 {
+    run(
+        &bc(literals, code),
+        &ctx_with_cond_depth(0),
+        false,
+        0,
+        seed,
+        Some(read_addr),
+    )
+}
+
+fn op2(op: Op2) -> Opcode {
+    Opcode::Op2 { op }
+}
+
+// ── LoadConstant ──────────────────────────────────────────────────────
+
+#[test]
+fn lowers_load_constant() {
+    assert_eq!(
+        value(vec![Opcode::LoadConstant { id: 0 }], vec![3.5], &[]),
+        3.5
+    );
+}
+
+#[test]
+fn lowers_load_constant_selects_right_literal() {
+    let code = vec![Opcode::LoadConstant { id: 2 }];
+    assert_eq!(value(code, vec![1.0, 2.0, 42.0], &[]), 42.0);
+}
+
+// ── LoadVar / LoadGlobalVar ───────────────────────────────────────────
+
+#[test]
+fn lowers_load_var_from_curr() {
+    // slot 4 of curr lives at byte 4*8 = 32; module_off is 0.
+    let code = vec![Opcode::LoadVar { off: 4 }];
+    assert_eq!(value(code, vec![], &[(32, 7.0)]), 7.0);
+}
+
+#[test]
+fn lowers_load_global_var_absolute() {
+    // LoadGlobalVar reads slot `off` ignoring module_off; slot 0 (TIME) at
+    // byte 0.
+    let code = vec![Opcode::LoadGlobalVar { off: 0 }];
+    assert_eq!(value(code, vec![], &[(0, 13.0)]), 13.0);
+}
+
+#[test]
+fn load_var_honors_module_off() {
+    // With a non-zero module_off, LoadVar{off:1} reads curr[module_off+1];
+    // LoadGlobalVar{off:1} reads curr[1] regardless. We verify the dynamic
+    // base path by running eval with module_off=2 directly.
+    let ctx = ctx_with_cond_depth(0);
+    let program = bc(vec![], vec![Opcode::LoadVar { off: 1 }]);
+    let bytes = build_module(&program, &ctx, true, 0);
+    let info = validate(&bytes).expect("module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let mem = store
+        .instance_export(module, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    // curr[3] at byte 24 (module_off=2 + off=1).
+    store.mem_access_mut_slice(mem, |bytes| {
+        bytes[24..32].copy_from_slice(&99.0_f64.to_le_bytes());
+    });
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    let result: f64 = store.invoke_simple_typed(eval, (2_i32,)).expect("invoke");
+    assert_eq!(result, 99.0);
+}
+
+// ── Op2: arithmetic ───────────────────────────────────────────────────
+
+#[test]
+fn lowers_arithmetic_ops() {
+    let lc = |id| Opcode::LoadConstant { id };
+    // 2 + 3 = 5
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Add)], vec![2.0, 3.0], &[]),
+        5.0
+    );
+    // 2 - 3 = -1 (operand order: l=2, r=3)
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Sub)], vec![2.0, 3.0], &[]),
+        -1.0
+    );
+    // 2 * 3 = 6
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Mul)], vec![2.0, 3.0], &[]),
+        6.0
+    );
+    // 3 / 2 = 1.5 (operand order: l=3, r=2)
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Div)], vec![3.0, 2.0], &[]),
+        1.5
+    );
+}
+
+#[test]
+fn op2_operand_order_matches_vm() {
+    // The VM computes `l op r` with l pushed first. births = pop * rate:
+    // pop=slot4 (byte 32), constant rate.
+    let code = vec![
+        Opcode::LoadVar { off: 4 },
+        Opcode::LoadConstant { id: 0 },
+        op2(Op2::Mul),
+    ];
+    assert_eq!(value(code, vec![0.1], &[(32, 100.0)]), 10.0);
+}
+
+// ── Op2: comparisons yield f64 0.0/1.0 ────────────────────────────────
+
+#[test]
+fn lowers_comparisons_to_f64_bool() {
+    let lc = |id| Opcode::LoadConstant { id };
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Gt)], vec![2.0, 1.0], &[]),
+        1.0
+    );
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Gt)], vec![1.0, 2.0], &[]),
+        0.0
+    );
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Gte)], vec![1.0, 1.0], &[]),
+        1.0
+    );
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Lt)], vec![1.0, 2.0], &[]),
+        1.0
+    );
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Lte)], vec![1.0, 1.0], &[]),
+        1.0
+    );
+}
+
+// ── Not ───────────────────────────────────────────────────────────────
+
+#[test]
+fn lowers_not_truthiness() {
+    let lc = |id| Opcode::LoadConstant { id };
+    assert_eq!(value(vec![lc(0), Opcode::Not {}], vec![0.0], &[]), 1.0);
+    assert_eq!(value(vec![lc(0), Opcode::Not {}], vec![5.0], &[]), 0.0);
+}
+
+// ── SetCond + If ──────────────────────────────────────────────────────
+
+/// `if cond then t else f`. Mirrors codegen's emission order: push t, push
+/// f, push cond, SetCond, If. Run with a depth-1 condition stack.
+fn if_program(cond: f64, t: f64, f: f64) -> f64 {
+    let code = vec![
+        Opcode::LoadConstant { id: 1 }, // t
+        Opcode::LoadConstant { id: 2 }, // f
+        Opcode::LoadConstant { id: 0 }, // cond
+        Opcode::SetCond {},
+        Opcode::If {},
+    ];
+    run(
+        &bc(vec![cond, t, f], code),
+        &ctx_with_cond_depth(1),
+        true,
+        1,
+        &[],
+        None,
+    )
+}
+
+#[test]
+fn lowers_if_selects_true_arm() {
+    assert_eq!(if_program(1.0, 10.0, 20.0), 10.0);
+}
+
+#[test]
+fn lowers_if_selects_false_arm_for_zero() {
+    assert_eq!(if_program(0.0, 10.0, 20.0), 20.0);
+}
+
+#[test]
+fn lowers_if_truthy_nonzero_is_true() {
+    // Any non-zero condition is true (matches the VM's is_truthy).
+    assert_eq!(if_program(0.5, 10.0, 20.0), 10.0);
+    assert_eq!(if_program(-3.0, 10.0, 20.0), 10.0);
+}
+
+#[test]
+fn lowers_if_with_comparison_condition() {
+    // if pop > 50 then 1 else 0, pop in slot 4 (byte 32).
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // t = 1
+        Opcode::LoadConstant { id: 1 }, // f = 0
+        Opcode::LoadVar { off: 4 },     // pop
+        Opcode::LoadConstant { id: 2 }, // 50
+        op2(Op2::Gt),
+        Opcode::SetCond {},
+        Opcode::If {},
+    ];
+    let run_with = |seed: &[(u64, f64)]| {
+        run(
+            &bc(vec![1.0, 0.0, 50.0], code.clone()),
+            &ctx_with_cond_depth(1),
+            true,
+            1,
+            seed,
+            None,
+        )
+    };
+    assert_eq!(run_with(&[(32, 100.0)]), 1.0);
+    assert_eq!(run_with(&[(32, 10.0)]), 0.0);
+}
+
+#[test]
+fn lowers_nested_if() {
+    // if (if a then b else c) then d else e.
+    // codegen order: push d, push e, then walk the cond which is the inner
+    // If (push b, push c, push a, SetCond_inner, If_inner), then
+    // SetCond_outer, If_outer. literals: a,b,c,d,e at 0..5.
+    let code = vec![
+        Opcode::LoadConstant { id: 3 }, // d
+        Opcode::LoadConstant { id: 4 }, // e
+        Opcode::LoadConstant { id: 1 }, // b
+        Opcode::LoadConstant { id: 2 }, // c
+        Opcode::LoadConstant { id: 0 }, // a
+        Opcode::SetCond {},             // inner
+        Opcode::If {},                  // inner -> b or c
+        Opcode::SetCond {},             // outer (cond = inner result)
+        Opcode::If {},                  // outer -> d or e
+    ];
+    let eval = |a: f64, b: f64, c: f64, d: f64, e: f64| {
+        run(
+            &bc(vec![a, b, c, d, e], code.clone()),
+            &ctx_with_cond_depth(2),
+            true,
+            2,
+            &[],
+            None,
+        )
+    };
+    // a truthy -> inner = b. b truthy -> outer = d.
+    assert_eq!(eval(1.0, 1.0, 0.0, 100.0, 200.0), 100.0);
+    // a falsey -> inner = c. c falsey -> outer = e.
+    assert_eq!(eval(0.0, 1.0, 0.0, 100.0, 200.0), 200.0);
+    // a truthy -> inner = b=0 (falsey) -> outer = e.
+    assert_eq!(eval(1.0, 0.0, 9.0, 100.0, 200.0), 200.0);
+}
+
+// ── AssignCurr / AssignNext ───────────────────────────────────────────
+
+#[test]
+fn lowers_assign_curr_constant() {
+    // store 42.0 into curr slot 5 (byte 40), read it back.
+    let code = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::AssignCurr { off: 5 },
+    ];
+    assert_eq!(stored(code, vec![42.0], &[], 40), 42.0);
+}
+
+#[test]
+fn lowers_assign_const_curr() {
+    // AssignConstCurr is emitted by base codegen for a constant-RHS
+    // assignment (e.g. a constant initial or aux): curr[off] = literals[id].
+    // Store 7.0 into curr slot 6 (byte 48), read it back.
+    let code = vec![Opcode::AssignConstCurr {
+        off: 6,
+        literal_id: 0,
+    }];
+    assert_eq!(stored(code, vec![7.0], &[], 48), 7.0);
+}
+
+#[test]
+fn assign_const_curr_honors_module_off() {
+    // With module_off=2, AssignConstCurr{off:1} writes curr[3] (byte 24).
+    let ctx = ctx_with_cond_depth(0);
+    let program = bc(
+        vec![3.5],
+        vec![Opcode::AssignConstCurr {
+            off: 1,
+            literal_id: 0,
+        }],
+    );
+    let bytes = build_module(&program, &ctx, false, 0);
+    let info = validate(&bytes).expect("module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (2_i32,))
+        .expect("invoke");
+    let mem = store
+        .instance_export(module, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    let v = store.mem_access_mut_slice(mem, |bytes| {
+        f64::from_le_bytes(bytes[24..32].try_into().unwrap())
+    });
+    assert_eq!(v, 3.5);
+}
+
+#[test]
+fn lowers_bin_op_assign_curr() {
+    // BinOpAssignCurr is the peephole fusion of `Op2; AssignCurr`: pops
+    // [l, r], computes l op r, stores to curr[off]. Mirrors vm.rs:1457.
+    // deaths = pop / 80 -> curr slot 6 (byte 48); pop = slot 4 (byte 32).
+    let code = vec![
+        Opcode::LoadVar { off: 4 },
+        Opcode::LoadConstant { id: 0 },
+        Opcode::BinOpAssignCurr {
+            op: Op2::Div,
+            off: 6,
+        },
+    ];
+    assert_eq!(stored(code, vec![80.0], &[(32, 200.0)], 48), 2.5);
+}
+
+#[test]
+fn lowers_bin_op_assign_next() {
+    // BinOpAssignNext is the peephole fusion of `Op2; AssignNext` (stock
+    // integration): pops [l, r], computes l op r, stores to next[off].
+    // next[pop] = pop + delta, with delta in curr slot 5.
+    // next slot 4 lives at next_base(4096) + 32 = 4128.
+    let code = vec![
+        Opcode::LoadVar { off: 4 }, // pop
+        Opcode::LoadVar { off: 5 }, // delta
+        Opcode::BinOpAssignNext {
+            op: Op2::Add,
+            off: 4,
+        },
+    ];
+    // pop=100, delta=3.75 -> 103.75
+    assert_eq!(
+        stored(code, vec![], &[(32, 100.0), (40, 3.75)], 4128),
+        103.75
+    );
+}
+
+#[test]
+fn bin_op_assign_curr_operand_order_matches_vm() {
+    // Non-commutative op: l - r with l pushed first.
+    // result = a - b -> curr slot 5 (byte 40); a=slot 3 (24), b=slot 4 (32).
+    let code = vec![
+        Opcode::LoadVar { off: 3 },
+        Opcode::LoadVar { off: 4 },
+        Opcode::BinOpAssignCurr {
+            op: Op2::Sub,
+            off: 5,
+        },
+    ];
+    assert_eq!(stored(code, vec![], &[(24, 10.0), (32, 3.0)], 40), 7.0);
+}
+
+// Note: every `Op2` variant is supported as of Phase 2 (Mod/Exp landed in
+// Task 3), so there is no longer an unsupported operator to drive the
+// `BinOpAssign*` error-propagation path. The fused-`Mod` form is exercised
+// for correctness by `bin_op_assign_curr_mod_stores_rem_euclid`; the
+// clean-error-on-unsupported-*opcode* path is covered by
+// `unsupported_lookup_returns_error` / `unsupported_array_opcode_returns_error`.
+
+#[test]
+fn lowers_assign_curr_from_expr() {
+    // deaths = pop / 80 -> curr slot 6 (byte 48); pop = slot 4 (byte 32).
+    let code = vec![
+        Opcode::LoadVar { off: 4 },
+        Opcode::LoadConstant { id: 0 },
+        op2(Op2::Div),
+        Opcode::AssignCurr { off: 6 },
+    ];
+    assert_eq!(stored(code, vec![80.0], &[(32, 200.0)], 48), 2.5);
+}
+
+#[test]
+fn lowers_assign_next_euler_update() {
+    // next[pop] = pop + (births - deaths) * dt, all read from curr.
+    // pop=slot4 (32), births=slot5 (40), deaths=slot6 (48); dt=0.5 literal.
+    // next slot 4 lives at next_base(4096) + 32 = 4128.
+    let code = vec![
+        Opcode::LoadVar { off: 4 },     // pop
+        Opcode::LoadVar { off: 5 },     // births
+        Opcode::LoadVar { off: 6 },     // deaths
+        op2(Op2::Sub),                  // births - deaths
+        Opcode::LoadConstant { id: 0 }, // dt
+        op2(Op2::Mul),                  // (births - deaths) * dt
+        op2(Op2::Add),                  // pop + ...
+        Opcode::AssignNext { off: 4 },
+    ];
+    // pop=100, births=10, deaths=2.5 -> 100 + 7.5*0.5 = 103.75
+    let seed = &[(32, 100.0), (40, 10.0), (48, 2.5)];
+    assert_eq!(stored(code, vec![0.5], seed, 4128), 103.75);
+}
+
+#[test]
+fn assign_next_honors_module_off() {
+    // With module_off=2, AssignNext{off:0} writes next[2]; next_base=4096,
+    // so byte 4096 + 2*8 = 4112.
+    let ctx = ctx_with_cond_depth(0);
+    let program = bc(
+        vec![7.0],
+        vec![
+            Opcode::LoadConstant { id: 0 },
+            Opcode::AssignNext { off: 0 },
+        ],
+    );
+    let bytes = build_module(&program, &ctx, false, 0);
+    let info = validate(&bytes).expect("module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (2_i32,))
+        .expect("invoke");
+    let mem = store
+        .instance_export(module, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    let v = store.mem_access_mut_slice(mem, |bytes| {
+        f64::from_le_bytes(bytes[4112..4120].try_into().unwrap())
+    });
+    assert_eq!(v, 7.0);
+}
+
+// ── Ret is a no-op ────────────────────────────────────────────────────
+
+#[test]
+fn ret_emits_nothing() {
+    // A program that loads a constant then Ret leaves just the constant.
+    let code = vec![Opcode::LoadConstant { id: 0 }, Opcode::Ret];
+    assert_eq!(value(code, vec![5.0], &[]), 5.0);
+}
+
+// ── AC1.5: raw Op2::Div by zero matches IEEE / the VM ─────────────────
+
+#[test]
+fn div_by_zero_matches_vm_ieee() {
+    let lc = |id| Opcode::LoadConstant { id };
+    // x/0 -> +Inf
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Div)], vec![1.0, 0.0], &[]),
+        f64::INFINITY
+    );
+    // -x/0 -> -Inf
+    assert_eq!(
+        value(vec![lc(0), lc(1), op2(Op2::Div)], vec![-1.0, 0.0], &[]),
+        f64::NEG_INFINITY
+    );
+    // 0/0 -> NaN
+    let nan = value(vec![lc(0), lc(1), op2(Op2::Div)], vec![0.0, 0.0], &[]);
+    assert!(nan.is_nan());
+}
+
+// ── AC1.4: unsupported opcodes return a clean error, never a panic ────
+
+#[test]
+fn op2_eq_lowers_without_error() {
+    // Eq is now supported (routed through the approx_eq helper), so lowering
+    // must succeed where Phase 1 returned Unsupported. Numeric parity is
+    // covered by the dedicated approx_eq / Op2::Eq tests below.
+    let mut func = Function::new([]);
+    let program = bc(vec![1.0, 2.0], vec![op2(Op2::Eq)]);
+    let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func);
+    assert!(result.is_ok(), "Op2::Eq should lower without error");
+}
+
+#[test]
+fn op2_mod_lowers_without_error() {
+    // Mod is now supported (rem_euclid via the mod_euclid helper); lowering
+    // must succeed where Phase 1 returned Unsupported.
+    let mut func = Function::new([]);
+    let program = bc(vec![], vec![op2(Op2::Mod)]);
+    let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func);
+    assert!(result.is_ok(), "Op2::Mod should lower without error");
+}
+
+#[test]
+fn op2_exp_lowers_without_error() {
+    // Exp is now supported (powf via the pow helper).
+    let mut func = Function::new([]);
+    let program = bc(vec![], vec![op2(Op2::Exp)]);
+    let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func);
+    assert!(result.is_ok(), "Op2::Exp should lower without error");
+}
+
+// ── Op2::Exp (pow) / Op2::Mod (rem_euclid) numeric parity ─────────────
+
+/// Evaluate `l Op2::Exp r` (push l, push r, Op2::Exp) -> f64.
+fn eval_exp(l: f64, r: f64) -> f64 {
+    value(
+        vec![
+            Opcode::LoadConstant { id: 0 },
+            Opcode::LoadConstant { id: 1 },
+            op2(Op2::Exp),
+        ],
+        vec![l, r],
+        &[],
+    )
+}
+
+#[test]
+fn op2_exp_matches_powf_for_positive_base() {
+    // The VM's `eval_op2` Exp is `l.powf(r)`. The wasm `pow` helper matches
+    // `powf` for a positive base across integer/fractional/negative
+    // exponents; assert within the documented helper tolerance.
+    let bases: [f64; 6] = [0.5, 1.0, 2.0, 3.7, 10.0, 100.0];
+    let exps: [f64; 9] = [-3.0, -1.5, -1.0, 0.0, 0.5, 1.0, 2.0, 2.5, 7.0];
+    for &l in &bases {
+        for &r in &exps {
+            let want = l.powf(r);
+            let got = eval_exp(l, r);
+            let abs = (got - want).abs();
+            let rel = if want != 0.0 { abs / want.abs() } else { abs };
+            assert!(
+                abs <= 1e-9 || rel <= 1e-9,
+                "Exp({l}, {r}): got {got}, want {want} (abs {abs:.3e}, rel {rel:.3e})",
+            );
+        }
+    }
+    // x == 1 and y == 0 are the helper's exact short-circuits.
+    assert_eq!(eval_exp(1.0, 42.0), 1.0);
+    assert_eq!(eval_exp(7.0, 0.0), 1.0);
+}
+
+/// Evaluate `l Op2::Mod r` (push l, push r, Op2::Mod) -> f64.
+fn eval_mod(l: f64, r: f64) -> f64 {
+    value(
+        vec![
+            Opcode::LoadConstant { id: 0 },
+            Opcode::LoadConstant { id: 1 },
+            op2(Op2::Mod),
+        ],
+        vec![l, r],
+        &[],
+    )
+}
+
+#[test]
+fn op2_mod_matches_rem_euclid_all_sign_combos() {
+    // The VM's `eval_op2` Mod is `l.rem_euclid(r)` (result in [0, |r|)),
+    // NOT a truncated remainder. Cover all four sign combinations and
+    // non-integer operands.
+    let cases: &[(f64, f64)] = &[
+        (7.0, 3.0),
+        (-7.0, 3.0),
+        (7.0, -3.0),
+        (-7.0, -3.0),
+        (7.5, 2.5),
+        (-7.5, 2.5),
+        (7.5, -2.5),
+        (-7.5, -2.5),
+        (5.3, 2.1),
+        (-5.3, 2.1),
+        (5.3, -2.1),
+        (-5.3, -2.1),
+        (0.0, 3.0),
+        (3.0, 3.0),
+        (-3.0, 3.0),
+        (2.0, 4.0),
+    ];
+    for &(l, r) in cases {
+        let want = l.rem_euclid(r);
+        let got = eval_mod(l, r);
+        assert!(
+            (got - want).abs() < 1e-12,
+            "Mod({l}, {r}): got {got}, want {want}",
+        );
+        // The euclidean remainder is always in [0, |r|).
+        assert!(
+            (0.0..r.abs()).contains(&got),
+            "Mod({l}, {r}) = {got} not in [0, {})",
+            r.abs(),
+        );
+    }
+}
+
+#[test]
+fn bin_op_assign_curr_mod_stores_rem_euclid() {
+    // The peephole-fused `Op2::Mod; AssignCurr` form must also lower (it was
+    // an Unsupported case in Phase 1). -7 mod 3 = 2 -> curr slot 5 (byte 40).
+    let code = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadConstant { id: 1 },
+        Opcode::BinOpAssignCurr {
+            op: Op2::Mod,
+            off: 5,
+        },
+    ];
+    assert_eq!(stored(code, vec![-7.0, 3.0], &[], 40), 2.0);
+}
+
+#[test]
+fn apply_lowers_without_error() {
+    // Apply is supported as of Phase 2 Task 4; lowering must succeed where
+    // Phase 1 returned Unsupported. (Numeric parity is covered by the
+    // dedicated per-builtin tests below.)
+    let mut func = Function::new([]);
+    let program = bc(
+        vec![],
+        vec![Opcode::Apply {
+            func: BuiltinId::Abs,
+        }],
+    );
+    let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func);
+    assert!(result.is_ok(), "Apply should lower without error");
+}
+
+#[test]
+fn lookup_lowers_without_error() {
+    // Lookup is supported as of Phase 3; lowering must succeed where Phase 2
+    // returned Unsupported. (Numeric parity is covered by the seeded-table
+    // tests below and the end-to-end GF model tests in module.rs.)
+    let mut func = Function::new(opcode_fn_locals(0, 0, 0));
+    let program = bc(
+        vec![0.0, 1.0],
+        vec![
+            Opcode::LoadConstant { id: 0 }, // element_offset
+            Opcode::LoadConstant { id: 1 }, // index
+            Opcode::Lookup {
+                base_gf: 0,
+                table_count: 1,
+                mode: LookupMode::Interpolate,
+            },
+        ],
+    );
+    let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func);
+    assert!(result.is_ok(), "Lookup should lower without error");
+}
+
+#[test]
+fn unsupported_array_opcode_returns_error() {
+    // The reducers, static view ops, and iteration loops are supported as of
+    // Phase 5 Tasks 1-3, so this drives a still-unsupported module opcode
+    // (`EvalModule`, Phase 7) to confirm an unhandled opcode still returns a
+    // clean error rather than a wrong module.
+    let mut func = Function::new([]);
+    let program = bc(vec![], vec![Opcode::EvalModule { id: 0, n_inputs: 0 }]);
+    let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func);
+    assert!(matches!(result, Err(WasmGenError::Unsupported(_))));
+}
+
+#[test]
+fn begin_iter_on_empty_view_stack_errors() {
+    // A `BeginIter` with no view pushed first is malformed bytecode: it must
+    // error cleanly (empty-view-stack), not panic.
+    let mut func = Function::new([]);
+    let program = bc(
+        vec![],
+        vec![Opcode::BeginIter {
+            write_temp_id: 0,
+            has_write_temp: false,
+        }],
+    );
+    let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func);
+    assert!(matches!(result, Err(WasmGenError::Unsupported(_))));
+}
+
+// ── Lookup opcode: seeded-table parity with the VM lookup functions ───
+
+// GF region bases for the Lookup opcode tests, placed well past
+// `next_base` (4096) so they cannot overlap the curr/next chunks. The
+// single test table's directory entry sits at `GF_DIR_BASE`; its data
+// follows at `GF_DATA_BASE`.
+const GF_DIR_BASE: u32 = 8192;
+const GF_DATA_BASE: u32 = 8192 + 8; // one 8-byte directory entry
+
+/// A ctx whose GF region bases point at the hand-seeded test regions, so a
+/// `Lookup` opcode reads the directory at `GF_DIR_BASE`.
+fn ctx_with_gf() -> EmitCtx<'static> {
+    EmitCtx {
+        gf_directory_base: GF_DIR_BASE,
+        gf_data_base: GF_DATA_BASE,
+        ..ctx_with_cond_depth(0)
+    }
+}
+
+/// Pack a GF directory entry `(data_off, count)` into the f64 whose 8 LE
+/// bytes are `data_off` (low i32) then `count` (high i32) -- so seeding it as
+/// one f64 writes exactly the two i32 the `Lookup` opcode reads.
+///
+/// Assumes a little-endian test host: the low 32 bits land at the lower
+/// address, matching production's `to_le_bytes` directory encoding (the
+/// opcode reads `data_off` at offset 0 and `count` at offset 4).
+fn dir_entry_f64(data_off: u32, count: u32) -> f64 {
+    f64::from_bits(((count as u64) << 32) | data_off as u64)
+}
+
+/// Seed a single GF table (`base_gf == 0`, `table_count == 1`) into memory:
+/// the directory entry at `GF_DIR_BASE` and the knots at `GF_DATA_BASE`.
+fn seed_single_table(knots: &[(f64, f64)]) -> Vec<(u64, f64)> {
+    let mut seed = vec![(
+        u64::from(GF_DIR_BASE),
+        dir_entry_f64(GF_DATA_BASE, knots.len() as u32),
+    )];
+    for (k, &(x, y)) in knots.iter().enumerate() {
+        let knot_addr = u64::from(GF_DATA_BASE) + (k as u64) * 16;
+        seed.push((knot_addr, x));
+        seed.push((knot_addr + 8, y));
+    }
+    seed
+}
+
+/// Run a `Lookup` over a single seeded table at `(element_offset, index)`.
+/// `table_count` lets a test push an out-of-range element_offset.
+fn run_lookup_opcode(
+    mode: LookupMode,
+    knots: &[(f64, f64)],
+    table_count: u16,
+    element_offset: f64,
+    index: f64,
+) -> f64 {
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // element_offset (pushed first)
+        Opcode::LoadConstant { id: 1 }, // index (pushed second, on top)
+        Opcode::Lookup {
+            base_gf: 0,
+            table_count,
+            mode,
+        },
+    ];
+    run(
+        &bc(vec![element_offset, index], code),
+        &ctx_with_gf(),
+        true,
+        0,
+        &seed_single_table(knots),
+        None,
+    )
+}
+
+/// The VM oracle for `mode` -- the exact function the opcode dispatches to.
+fn vm_lookup_oracle(mode: LookupMode, knots: &[(f64, f64)], index: f64) -> f64 {
+    match mode {
+        LookupMode::Interpolate => crate::vm::lookup(knots, index),
+        LookupMode::Forward => crate::vm::lookup_forward(knots, index),
+        LookupMode::Backward => crate::vm::lookup_backward(knots, index),
+    }
+}
+
+fn assert_lookup_opcode_matches_vm(mode: LookupMode, knots: &[(f64, f64)], index: f64) {
+    let got = run_lookup_opcode(mode, knots, 1, 0.0, index);
+    let want = vm_lookup_oracle(mode, knots, index);
+    if want.is_nan() {
+        assert!(got.is_nan(), "{mode:?} at {index}: expected NaN, got {got}");
+    } else {
+        assert_eq!(got, want, "{mode:?} at {index}: got {got}, want {want}");
+    }
+}
+
+const LOOKUP_OPCODE_TABLE: &[(f64, f64)] = &[(0.0, 10.0), (1.0, 20.0), (2.5, 5.0), (4.0, 40.0)];
+
+#[test]
+fn lookup_opcode_dispatches_to_each_mode_and_reads_directory() {
+    // The opcode reads (data_off, count) from the directory, then dispatches
+    // to the mode's helper. Probe below/above range, on a knot, and between
+    // knots for all three modes against the VM oracle.
+    let probes = [-1.0, 0.0, 0.5, 1.0, 1.75, 2.5, 3.0, 4.0, 9.0];
+    for mode in [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ] {
+        for &index in &probes {
+            assert_lookup_opcode_matches_vm(mode, LOOKUP_OPCODE_TABLE, index);
+        }
+    }
+}
+
+#[test]
+fn lookup_opcode_out_of_range_element_offset_is_nan() {
+    // The VM pushes NaN when element_offset < 0 or >= table_count, BEFORE
+    // touching the table; the opcode must match (the directory is seeded for
+    // table 0 only, so an OOB offset must short-circuit, never read garbage).
+    for mode in [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ] {
+        // table_count = 1, so offset 1 and -1 are both out of range.
+        assert!(
+            run_lookup_opcode(mode, LOOKUP_OPCODE_TABLE, 1, 1.0, 2.0).is_nan(),
+            "{mode:?}: element_offset == table_count must be NaN"
+        );
+        assert!(
+            run_lookup_opcode(mode, LOOKUP_OPCODE_TABLE, 1, -1.0, 2.0).is_nan(),
+            "{mode:?}: negative element_offset must be NaN"
+        );
+        // In range (offset 0) is NOT NaN for an in-range index.
+        assert!(
+            !run_lookup_opcode(mode, LOOKUP_OPCODE_TABLE, 1, 0.0, 2.0).is_nan(),
+            "{mode:?}: in-range element_offset must not be NaN"
+        );
+    }
+}
+
+#[test]
+fn lookup_opcode_nan_index_is_nan() {
+    for mode in [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ] {
+        assert!(
+            run_lookup_opcode(mode, LOOKUP_OPCODE_TABLE, 1, 0.0, f64::NAN).is_nan(),
+            "{mode:?}: a NaN index must be NaN"
+        );
+    }
+}
+
+// ── Lookup opcode: runtime table selection across TWO tables ──────────
+//
+// The single-table parity tests above always pass `element_offset == 0`, so
+// the directory-indexing arithmetic in `push_gf_directory_addr`
+// (`gf_directory_base + (base_gf + element_offset) * 8`) is only exercised
+// for offset 0 -- the `* 8` stride and the offset add are never tested with
+// a nonzero offset (the out-of-range tests short-circuit to NaN before the
+// directory read). Phase 5/7 lower an arrayed scalar `Lookup` to a runtime
+// per-element `element_offset` that selects a per-element table, so the
+// table-selection path must be pinned here.
+
+// Two-table layout: a 2-entry directory at `GF2_DIR_BASE`, then each
+// table's knots laid out back-to-back past the directory.
+const GF2_DIR_BASE: u32 = 8192;
+const GF2_TABLE0_DATA: u32 = GF2_DIR_BASE + 2 * 8; // past two 8-byte entries
+// Table 0 has two knots (4 f64 = 32 bytes); table 1's data follows.
+const GF2_TABLE1_DATA: u32 = GF2_TABLE0_DATA + 2 * 16;
+
+/// Seed two GF tables so that directory entry `t` (`t ∈ {0,1}`) points at
+/// `table_t`'s knots. Mirrors the production directory layout the opcode
+/// reads via `push_gf_directory_addr`.
+fn seed_two_tables(table0: &[(f64, f64)], table1: &[(f64, f64)]) -> Vec<(u64, f64)> {
+    let mut seed = vec![
+        (
+            u64::from(GF2_DIR_BASE),
+            dir_entry_f64(GF2_TABLE0_DATA, table0.len() as u32),
+        ),
+        (
+            u64::from(GF2_DIR_BASE) + 8,
+            dir_entry_f64(GF2_TABLE1_DATA, table1.len() as u32),
+        ),
+    ];
+    for (base, knots) in [(GF2_TABLE0_DATA, table0), (GF2_TABLE1_DATA, table1)] {
+        for (k, &(x, y)) in knots.iter().enumerate() {
+            let knot_addr = u64::from(base) + (k as u64) * 16;
+            seed.push((knot_addr, x));
+            seed.push((knot_addr + 8, y));
+        }
+    }
+    seed
+}
+
+/// Run a `Lookup` with a compile-time-constant `element_offset` against a
+/// two-table directory (`base_gf == 0`, `table_count == 2`).
+fn run_lookup_two_tables(
+    mode: LookupMode,
+    table0: &[(f64, f64)],
+    table1: &[(f64, f64)],
+    element_offset: f64,
+    index: f64,
+) -> f64 {
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // element_offset (pushed first)
+        Opcode::LoadConstant { id: 1 }, // index (pushed second, on top)
+        Opcode::Lookup {
+            base_gf: 0,
+            table_count: 2,
+            mode,
+        },
+    ];
+    let ctx = EmitCtx {
+        gf_directory_base: GF2_DIR_BASE,
+        // `gf_data_base` is unused at runtime by the opcode (each table's
+        // data offset comes from its directory entry), but set it to the
+        // first table's data so the ctx is internally consistent.
+        gf_data_base: GF2_TABLE0_DATA,
+        ..ctx_with_cond_depth(0)
+    };
+    run(
+        &bc(vec![element_offset, index], code),
+        &ctx,
+        true,
+        0,
+        &seed_two_tables(table0, table1),
+        None,
+    )
+}
+
+#[test]
+fn lookup_opcode_selects_table_by_element_offset() {
+    // Two tables whose values differ at the probe index in ALL three modes,
+    // so selecting the wrong table is observable regardless of mode:
+    //   table 0: y = 10x        index 5 -> interp 50,  fwd 100, bwd 0
+    //   table 1: y = x/10 + 1   index 5 -> interp 1.5, fwd 2,   bwd 1
+    let table0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)];
+    let table1: &[(f64, f64)] = &[(0.0, 1.0), (10.0, 2.0)];
+    let index = 5.0;
+
+    for mode in [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ] {
+        // The two tables must genuinely disagree here, otherwise selecting
+        // the wrong table would silently pass.
+        let want0 = vm_lookup_oracle(mode, table0, index);
+        let want1 = vm_lookup_oracle(mode, table1, index);
+        assert_ne!(
+            want0, want1,
+            "{mode:?}: tables must differ at the probe index to detect mis-selection"
+        );
+
+        // element_offset == 1 selects table 1; the result must match the VM
+        // oracle over table 1 (and therefore differ from table 0).
+        let got = run_lookup_two_tables(mode, table0, table1, 1.0, index);
+        assert_eq!(
+            got, want1,
+            "{mode:?}: element_offset==1 must read table 1: got {got}, want {want1}"
+        );
+
+        // Sanity: element_offset == 0 still selects table 0 (the offset is a
+        // real selector, not a constant remap to table 1).
+        let got0 = run_lookup_two_tables(mode, table0, table1, 0.0, index);
+        assert_eq!(
+            got0, want0,
+            "{mode:?}: element_offset==0 must read table 0: got {got0}, want {want0}"
+        );
+    }
+}
+
+// ── LoadInitial / LoadPrev opcodes (Task 1: snapshot regions) ─────────
+
+// Snapshot region bases for these tests, placed past `next_base` (4096) so
+// they cannot overlap the curr/next chunks.
+const INITIAL_BASE: u32 = 8192;
+const PREV_BASE: u32 = 8192 + 4096;
+
+/// `LoadInitial` in the flows/stocks programs reads `initial_values[off]`
+/// (the post-initials snapshot), NOT `curr`. Seed both regions to distinct
+/// values at the same slot so a wrong-region read is observable.
+#[test]
+fn load_initial_in_flows_reads_initial_values_region() {
+    let ctx = EmitCtx {
+        initial_values_base: INITIAL_BASE,
+        step_part: StepPart::Flows,
+        ..ctx_with_cond_depth(0)
+    };
+    // curr[2] = 111 (byte 16), initial_values[2] = 222 (INITIAL_BASE + 16).
+    let seed = [(16u64, 111.0), (u64::from(INITIAL_BASE) + 16, 222.0)];
+    let got = run(
+        &bc(vec![], vec![Opcode::LoadInitial { off: 2 }]),
+        &ctx,
+        true,
+        0,
+        &seed,
+        None,
+    );
+    assert_eq!(got, 222.0, "LoadInitial in Flows must read initial_values");
+}
+
+/// `LoadInitial` in the initials program reads `curr[off]` (the value being
+/// computed), because the snapshot is not yet taken (`vm.rs:1334`).
+#[test]
+fn load_initial_in_initials_reads_curr() {
+    let ctx = EmitCtx {
+        initial_values_base: INITIAL_BASE,
+        step_part: StepPart::Initials,
+        ..ctx_with_cond_depth(0)
+    };
+    let seed = [(16u64, 111.0), (u64::from(INITIAL_BASE) + 16, 222.0)];
+    let got = run(
+        &bc(vec![], vec![Opcode::LoadInitial { off: 2 }]),
+        &ctx,
+        true,
+        0,
+        &seed,
+        None,
+    );
+    assert_eq!(got, 111.0, "LoadInitial in Initials must read curr");
+}
+
+/// `LoadInitial` honors `module_off`: with a non-zero module base it reads
+/// `initial_values[module_off + off]`.
+#[test]
+fn load_initial_honors_module_off() {
+    let ctx = EmitCtx {
+        initial_values_base: INITIAL_BASE,
+        step_part: StepPart::Stocks,
+        ..ctx_with_cond_depth(0)
+    };
+    // module_off=2, off=1 -> initial_values[3] at INITIAL_BASE + 24.
+    let program = bc(vec![], vec![Opcode::LoadInitial { off: 1 }]);
+    let bytes = build_module(&program, &ctx, true, 0);
+    let info = validate(&bytes).expect("module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let mem = store
+        .instance_export(module, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    store.mem_access_mut_slice(mem, |bytes| {
+        let a = (INITIAL_BASE + 24) as usize;
+        bytes[a..a + 8].copy_from_slice(&77.0_f64.to_le_bytes());
+    });
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    let result: f64 = store.invoke_simple_typed(eval, (2_i32,)).expect("invoke");
+    assert_eq!(
+        result, 77.0,
+        "LoadInitial must read initial_values[module_off+off]"
+    );
+}
+
+/// Build a module exporting `mem`, a mutable i32 global `use_prev_fallback`
+/// (at index 0, the index the test ctx names), and an `eval(module_off: i32)
+/// -> f64` whose body lowers `LoadConstant(fallback); LoadPrev{off}`. The
+/// helper functions lead the function/code sections so any `call` resolves;
+/// `eval` follows. `fallback_flag` is the global's init value (1 = use the
+/// fallback, 0 = read prev_values).
+fn build_load_prev_module(off: u16, fallback: f64, fallback_flag: i32) -> Vec<u8> {
+    let mut module = Module::new();
+    let helpers = build_helpers();
+    let n_helpers = helpers.functions.len() as u32;
+
+    let mut types = TypeSection::new();
+    types.ty().function([ValType::I32], [ValType::F64]); // eval
+    for hf in &helpers.functions {
+        types.ty().function(hf.params.clone(), hf.results.clone());
+    }
+    module.section(&types);
+
+    let mut functions = FunctionSection::new();
+    for (i, _) in helpers.functions.iter().enumerate() {
+        functions.function(1 + i as u32);
+    }
+    functions.function(0); // eval -> type 0
+    module.section(&functions);
+
+    let mut memories = MemorySection::new();
+    memories.memory(MemoryType {
+        minimum: 1,
+        maximum: None,
+        memory64: false,
+        shared: false,
+        page_size_log2: None,
+    });
+    module.section(&memories);
+
+    // The single mutable i32 global the LoadPrev ctx gates on (index 0).
+    let mut globals = wasm_encoder::GlobalSection::new();
+    globals.global(
+        wasm_encoder::GlobalType {
+            val_type: ValType::I32,
+            mutable: true,
+            shared: false,
+        },
+        &wasm_encoder::ConstExpr::i32_const(fallback_flag),
+    );
+    module.section(&globals);
+
+    let mut exports = ExportSection::new();
+    exports.export("eval", ExportKind::Func, n_helpers);
+    exports.export("mem", ExportKind::Memory, 0);
+    module.section(&exports);
+
+    let ctx = EmitCtx {
+        prev_values_base: PREV_BASE,
+        use_prev_fallback_global: 0,
+        ..ctx_with_cond_depth(0)
+    };
+    let program = bc(
+        vec![fallback],
+        vec![Opcode::LoadConstant { id: 0 }, Opcode::LoadPrev { off }],
+    );
+
+    let mut code = CodeSection::new();
+    for hf in helpers.functions {
+        code.function(&hf.body);
+    }
+    let mut func = Function::new(opcode_fn_locals(0, 0, 0));
+    emit_bytecode(&program, &ctx, &mut func).expect("LoadPrev should lower");
+    func.instruction(&Instruction::End);
+    code.function(&func);
+    module.section(&code);
+
+    module.finish()
+}
+
+/// Run `LoadConstant(fallback); LoadPrev{off}` with `prev_values[off]` seeded
+/// to `prev_value` and the gate set to `fallback_flag`.
+fn run_load_prev(off: u16, fallback: f64, prev_value: f64, fallback_flag: i32) -> f64 {
+    let bytes = build_load_prev_module(off, fallback, fallback_flag);
+    let info = validate(&bytes).expect("LoadPrev module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let mem = store
+        .instance_export(module, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    store.mem_access_mut_slice(mem, |bytes| {
+        let a = (PREV_BASE + u32::from(off) * SLOT_SIZE) as usize;
+        bytes[a..a + 8].copy_from_slice(&prev_value.to_le_bytes());
+    });
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store.invoke_simple_typed(eval, (0_i32,)).expect("invoke")
+}
+
+/// `LoadPrev` returns the caller-supplied fallback while `use_prev_fallback`
+/// is set (1), exactly as the VM does before the first snapshot
+/// (`vm.rs:1322`). The seeded `prev_values` value must NOT be read.
+#[test]
+fn load_prev_returns_fallback_when_flag_set() {
+    let got = run_load_prev(2, 3.5, 999.0, 1);
+    assert_eq!(got, 3.5, "with the flag set, LoadPrev yields its fallback");
+}
+
+/// `LoadPrev` reads `prev_values[off]` once `use_prev_fallback` is cleared
+/// (0), exactly as the VM does after the first snapshot (`vm.rs:1325`).
+#[test]
+fn load_prev_reads_prev_values_when_flag_clear() {
+    let got = run_load_prev(2, 3.5, 999.0, 0);
+    assert_eq!(
+        got, 999.0,
+        "with the flag clear, LoadPrev reads prev_values"
+    );
+}
+
+// ── approx_eq helper (AC7.2, AC1.5) ───────────────────────────────────
+
+/// Build a module exporting `eq(a: f64, b: f64) -> i32` whose body is just
+/// `local.get a; local.get b; call approx_eq`, directly exercising the
+/// emitted helper in isolation. The helper functions are placed at indices
+/// `0..N` (so the `call` resolves) and `eq` follows at index `N`.
+fn build_approx_eq_module() -> Vec<u8> {
+    let mut module = Module::new();
+
+    let helpers = build_helpers();
+    let n_helpers = helpers.functions.len() as u32;
+
+    // Type 0 is `eq`'s signature (f64, f64) -> i32; helper types follow.
+    let mut types = TypeSection::new();
+    types
+        .ty()
+        .function([ValType::F64, ValType::F64], [ValType::I32]);
+    for hf in &helpers.functions {
+        types.ty().function(hf.params.clone(), hf.results.clone());
+    }
+    module.section(&types);
+
+    let mut functions = FunctionSection::new();
+    for (i, _) in helpers.functions.iter().enumerate() {
+        functions.function(1 + i as u32);
+    }
+    functions.function(0);
+    module.section(&functions);
+
+    // The GF lookup helpers (`super::lookup`) `f64.load` from memory 0, so
+    // a module that includes every helper body must declare a memory even
+    // though `eq` itself never touches it.
+    let mut memories = MemorySection::new();
+    memories.memory(MemoryType {
+        minimum: 1,
+        maximum: None,
+        memory64: false,
+        shared: false,
+        page_size_log2: None,
+    });
+    module.section(&memories);
+
+    let mut exports = ExportSection::new();
+    exports.export("eq", ExportKind::Func, n_helpers);
+    module.section(&exports);
+
+    let mut code = CodeSection::new();
+    for hf in helpers.functions {
+        code.function(&hf.body);
+    }
+    let mut eq = Function::new([]);
+    eq.instruction(&Instruction::LocalGet(0));
+    eq.instruction(&Instruction::LocalGet(1));
+    eq.instruction(&Instruction::Call(helpers.fns.approx_eq));
+    eq.instruction(&Instruction::End);
+    code.function(&eq);
+    module.section(&code);
+
+    module.finish()
+}
+
+/// Run the emitted `approx_eq` helper on `(a, b)` under the interpreter,
+/// returning its i32 result (1 = approximately equal). Built once per call
+/// (cheap; the sample sizes are small).
+fn run_approx_eq(a: f64, b: f64) -> i32 {
+    let bytes = build_approx_eq_module();
+    let info = validate(&bytes).expect("approx_eq module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("approx_eq module must instantiate")
+        .module_addr;
+    let eq = store
+        .instance_export(module, "eq")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(f64, f64), i32>(eq, (a, b))
+        .expect("eq invocation must succeed")
+}
+
+/// Assert the emitted helper agrees with the Rust `crate::float::approx_eq`
+/// oracle for both argument orders (the function is symmetric).
+fn assert_approx_eq_matches_oracle(a: f64, b: f64) {
+    let oracle = crate::float::approx_eq(a, b) as i32;
+    assert_eq!(
+        run_approx_eq(a, b),
+        oracle,
+        "approx_eq({a:?}, {b:?}) disagreed with oracle {oracle}"
+    );
+    let oracle_swapped = crate::float::approx_eq(b, a) as i32;
+    assert_eq!(
+        run_approx_eq(b, a),
+        oracle_swapped,
+        "approx_eq({b:?}, {a:?}) disagreed with oracle {oracle_swapped}"
+    );
+}
+
+/// Move `x` by `k` ULPs in raw-bit order (the increment the float-cmp ordered
+/// map measures within a sign). For small `|k|` and finite `x` this yields a
+/// value the oracle judges 0..|k| ULPs away.
+fn nudge_ulps(x: f64, k: i64) -> f64 {
+    f64::from_bits(((x.to_bits() as i64).wrapping_add(k)) as u64)
+}
+
+#[test]
+fn approx_eq_matches_oracle_curated() {
+    // The exact edge cases the task enumerates.
+    let na = crate::float::NA; // finite -2^109 sentinel, NOT NaN.
+    let cases: &[(f64, f64)] = &[
+        // exact-equal
+        (1.0, 1.0),
+        (0.0, 0.0),
+        (-3.5, -3.5),
+        (1e300, 1e300),
+        // far apart
+        (1.0, 2.0),
+        (0.0, 1e100),
+        (-1e9, 1e9),
+        // 1-4 ULP apart around 1.0
+        (1.0, nudge_ulps(1.0, 1)),
+        (1.0, nudge_ulps(1.0, 2)),
+        (1.0, nudge_ulps(1.0, 3)),
+        (1.0, nudge_ulps(1.0, 4)),
+        // 5 ULPs apart (just past the threshold) around a larger magnitude
+        (1000.0, nudge_ulps(1000.0, 5)),
+        (1000.0, nudge_ulps(1000.0, 4)),
+        // f64::EPSILON-apart around 1.0 (the absolute-epsilon check)
+        (1.0, 1.0 + f64::EPSILON),
+        (1.0, 1.0 - f64::EPSILON),
+        // around zero (subnormals and tiny values straddling the epsilon)
+        (0.0, f64::from_bits(1)),                // smallest subnormal
+        (0.0, -f64::from_bits(1)),               // negative smallest subnormal
+        (0.0, f64::EPSILON),                     // EPSILON away from zero
+        (0.0, 1e-300),                           // tiny normal, within epsilon
+        (f64::MIN_POSITIVE, -f64::MIN_POSITIVE), // straddle zero by subnormal step
+        // signed zeros
+        (0.0, -0.0),
+        // NaN cases
+        (f64::NAN, f64::NAN),
+        (f64::NAN, 1.0),
+        (f64::NAN, 0.0),
+        // the finite :NA: sentinel
+        (na, na),
+        (na, 0.0),
+        (na, 1.0),
+        (na, -(2.0_f64).powi(110)),
+        // infinities
+        (f64::INFINITY, f64::INFINITY),
+        (f64::NEG_INFINITY, f64::NEG_INFINITY),
+        (f64::INFINITY, f64::NEG_INFINITY),
+        (f64::INFINITY, f64::MAX),
+        (f64::NEG_INFINITY, f64::MIN),
+    ];
+    for &(a, b) in cases {
+        assert_approx_eq_matches_oracle(a, b);
+    }
+}
+
+#[test]
+fn approx_eq_matches_oracle_randomized() {
+    use rand::prelude::*;
+    // Fixed seed: a sampled-but-reproducible parity sweep against the oracle.
+    let mut rng = StdRng::seed_from_u64(0xA222_02EE);
+    for _ in 0..400 {
+        // A diverse magnitude/sign base value.
+        let exp = rng.random_range(-308i32..=308);
+        let mantissa: f64 = rng.random_range(-1.0..1.0);
+        let x = mantissa * 10f64.powi(exp);
+
+        // ULP-adjacent partner (often within the 4-ULP threshold, sometimes
+        // just past it), exercising the ULP path on both sides of the gap.
+        let k = rng.random_range(-8i64..=8);
+        assert_approx_eq_matches_oracle(x, nudge_ulps(x, k));
+
+        // An independent unrelated value (usually far apart -> ULP + epsilon
+        // both fail), exercising the false path.
+        let exp2 = rng.random_range(-308i32..=308);
+        let y: f64 = rng.random_range(-1.0..1.0) * 10f64.powi(exp2);
+        assert_approx_eq_matches_oracle(x, y);
+
+        // Near-zero straddling pairs (the epsilon absolute check region).
+        let tiny_a = rng.random_range(-1.0..1.0) * f64::EPSILON;
+        let tiny_b = rng.random_range(-1.0..1.0) * f64::EPSILON;
+        assert_approx_eq_matches_oracle(tiny_a, tiny_b);
+    }
+}
+
+// ── Op2::Eq / And / Or / Not / SetCond+If route through approx_eq ─────
+
+/// Evaluate `l Op2::Eq r` (push l, push r, Op2::Eq) and return the f64 bool.
+fn eval_eq(l: f64, r: f64) -> f64 {
+    let lit = vec![l, r];
+    value(
+        vec![
+            Opcode::LoadConstant { id: 0 },
+            Opcode::LoadConstant { id: 1 },
+            op2(Op2::Eq),
+        ],
+        lit,
+        &[],
+    )
+}
+
+#[test]
+fn op2_eq_matches_vm_for_ulp_adjacent_operands() {
+    // Raw `==` would call these unequal, but the VM's `approx_eq` (and so the
+    // wasm) calls them equal: 1 ULP and EPSILON-apart around 1.0.
+    let one_ulp = nudge_ulps(1.0, 1);
+    assert_eq!(eval_eq(1.0, one_ulp), 1.0);
+    assert_eq!(eval_eq(1.0, 1.0 + f64::EPSILON), 1.0);
+    // 5 ULPs apart at a larger magnitude: past the threshold -> not equal.
+    assert_eq!(eval_eq(1000.0, nudge_ulps(1000.0, 5)), 0.0);
+    // Exact and far-apart anchors.
+    assert_eq!(eval_eq(2.5, 2.5), 1.0);
+    assert_eq!(eval_eq(1.0, 2.0), 0.0);
+    // NaN == NaN is true under approx_eq (identical bits -> 0 ULPs).
+    assert_eq!(eval_eq(f64::NAN, f64::NAN), 1.0);
+    assert_eq!(eval_eq(f64::NAN, 1.0), 0.0);
+}
+
+#[test]
+fn op2_eq_matches_vm_oracle_over_sample() {
+    // The whole-expression Eq lowering must agree with the VM's eval_op2 Eq
+    // (= approx_eq as f64) across the curated edge values.
+    let na = crate::float::NA;
+    let cases: &[(f64, f64)] = &[
+        (1.0, nudge_ulps(1.0, 3)),
+        (1.0, nudge_ulps(1.0, 4)),
+        (1.0, nudge_ulps(1.0, 5)),
+        (0.0, -0.0),
+        (0.0, f64::EPSILON),
+        (na, na),
+        (na, 0.0),
+        (f64::INFINITY, f64::INFINITY),
+        (f64::INFINITY, f64::NEG_INFINITY),
+    ];
+    for &(l, r) in cases {
+        let expected = crate::float::approx_eq(l, r) as i8 as f64;
+        assert_eq!(eval_eq(l, r), expected, "Eq({l:?}, {r:?})");
+    }
+}
+
+/// Evaluate `l Op2::And r` / `l Op2::Or r`.
+fn eval_logical(op: Op2, l: f64, r: f64) -> f64 {
+    value(
+        vec![
+            Opcode::LoadConstant { id: 0 },
+            Opcode::LoadConstant { id: 1 },
+            op2(op),
+        ],
+        vec![l, r],
+        &[],
+    )
+}
+
+/// The VM's truthiness: `is_truthy(n) = !approx_eq(n, 0.0)`.
+fn vm_is_truthy(n: f64) -> bool {
+    !crate::float::approx_eq(n, 0.0)
+}
+
+#[test]
+fn op2_and_or_match_vm_truthiness() {
+    // EPSILON is falsy (within epsilon of 0); a small-but-not-epsilon value
+    // is truthy. These are exactly where raw `!= 0.0` would diverge from the
+    // VM.
+    let eps = f64::EPSILON;
+    let small = 0.001;
+    let operands = [
+        0.0,
+        -0.0,
+        eps,
+        -eps,
+        small,
+        -small,
+        1.0,
+        f64::NAN,
+        f64::INFINITY,
+    ];
+    for &l in &operands {
+        for &r in &operands {
+            let and_expected = (vm_is_truthy(l) && vm_is_truthy(r)) as i8 as f64;
+            let or_expected = (vm_is_truthy(l) || vm_is_truthy(r)) as i8 as f64;
+            assert_eq!(
+                eval_logical(Op2::And, l, r),
+                and_expected,
+                "And({l:?}, {r:?})"
+            );
+            assert_eq!(eval_logical(Op2::Or, l, r), or_expected, "Or({l:?}, {r:?})");
+        }
+    }
+}
+
+#[test]
+fn op2_and_or_operand_order_preserved() {
+    // And/Or stash the right operand in the scratch local; verify a
+    // non-symmetric truthiness pairing still combines correctly and that the
+    // scratch reuse doesn't corrupt a following assignment.
+    // (truthy AND falsy) = 0; (truthy OR falsy) = 1.
+    assert_eq!(eval_logical(Op2::And, 5.0, 0.0), 0.0);
+    assert_eq!(eval_logical(Op2::And, 0.0, 5.0), 0.0);
+    assert_eq!(eval_logical(Op2::Or, 5.0, 0.0), 1.0);
+    assert_eq!(eval_logical(Op2::Or, 0.0, 5.0), 1.0);
+}
+
+#[test]
+fn bin_op_assign_and_uses_scratch_safely() {
+    // BinOpAssignCurr{And} fuses the And reduction with a store; the And
+    // lowering reuses the scratch local, which emit_assign then overwrites.
+    // Verify the stored result is correct. (truthy AND truthy) = 1 -> slot 5.
+    let code = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadConstant { id: 1 },
+        Opcode::BinOpAssignCurr {
+            op: Op2::And,
+            off: 5,
+        },
+    ];
+    assert_eq!(stored(code, vec![3.0, 7.0], &[], 40), 1.0);
+    // (truthy AND falsy) = 0.
+    let code0 = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadConstant { id: 1 },
+        Opcode::BinOpAssignCurr {
+            op: Op2::And,
+            off: 5,
+        },
+    ];
+    assert_eq!(stored(code0, vec![3.0, 0.0], &[], 40), 0.0);
+}
+
+#[test]
+fn not_matches_vm_approx_eq_truthiness() {
+    // Not(n) = (!is_truthy(n)) as f64 = approx_eq(n, 0.0) as f64.
+    // EPSILON is "false" so Not(EPSILON) = 1.0; small-but-not-epsilon is
+    // "true" so Not(0.001) = 0.0.
+    let operands = [0.0, -0.0, f64::EPSILON, -f64::EPSILON, 0.001, 1.0, f64::NAN];
+    for &n in &operands {
+        let expected = (!vm_is_truthy(n)) as i8 as f64;
+        let got = value(
+            vec![Opcode::LoadConstant { id: 0 }, Opcode::Not {}],
+            vec![n],
+            &[],
+        );
+        assert_eq!(got, expected, "Not({n:?})");
+    }
+}
+
+#[test]
+fn setcond_if_uses_approx_eq_truthiness() {
+    // `if cond then t else f` with the condition routed through approx_eq.
+    // EPSILON is falsy -> selects the else arm; 0.001 is truthy -> then arm.
+    let if_eval = |cond: f64| {
+        let code = vec![
+            Opcode::LoadConstant { id: 1 }, // t
+            Opcode::LoadConstant { id: 2 }, // f
+            Opcode::LoadConstant { id: 0 }, // cond
+            Opcode::SetCond {},
+            Opcode::If {},
+        ];
+        run(
+            &bc(vec![cond, 10.0, 20.0], code),
+            &ctx_with_cond_depth(1),
+            true,
+            1,
+            &[],
+            None,
+        )
+    };
+    // Falsy conditions (within epsilon of 0) -> else (20.0).
+    assert_eq!(if_eval(0.0), 20.0);
+    assert_eq!(if_eval(-0.0), 20.0);
+    assert_eq!(if_eval(f64::EPSILON), 20.0);
+    assert_eq!(if_eval(-f64::EPSILON), 20.0);
+    // Truthy conditions -> then (10.0).
+    assert_eq!(if_eval(0.001), 10.0);
+    assert_eq!(if_eval(1.0), 10.0);
+    assert_eq!(if_eval(f64::NAN), 10.0); // is_truthy(NaN) is true
+    assert_eq!(if_eval(f64::INFINITY), 10.0);
+}
+
+// ── Apply: per-builtin parity with the VM's apply() ───────────────────
+
+/// Run `Apply{func}` over the three operands `(a, b, c)` with `time`/`dt`
+/// seeded into the reserved global slots (TIME at byte 0, DT at byte 8 of
+/// `curr`). The program pushes a, b, c then `Apply`, so `c` is on top --
+/// matching the VM's pop order.
+fn apply_eval(func: BuiltinId, a: f64, b: f64, c: f64, time: f64, dt: f64) -> f64 {
+    let code = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadConstant { id: 1 },
+        Opcode::LoadConstant { id: 2 },
+        Opcode::Apply { func },
+    ];
+    // Seed TIME (slot 0 -> byte 0) and DT (slot 1 -> byte 8) of curr.
+    value(code, vec![a, b, c], &[(0, time), (8, dt)])
+}
+
+/// `step`/`ramp`/`pulse` reproduced verbatim from `vm.rs` so the per-builtin
+/// tests compare the wasm output to the exact formula the VM's `apply()`
+/// uses, not to libm.
+fn vm_step(time: f64, dt: f64, height: f64, step_time: f64) -> f64 {
+    if time + dt / 2.0 > step_time {
+        height
+    } else {
+        0.0
+    }
+}
+fn vm_ramp(time: f64, slope: f64, start: f64, end: f64) -> f64 {
+    if time > start {
+        if time >= end {
+            slope * (end - start)
+        } else {
+            slope * (time - start)
+        }
+    } else {
+        0.0
+    }
+}
+fn vm_pulse(time: f64, dt: f64, volume: f64, first: f64, interval: f64) -> f64 {
+    if time < first {
+        return 0.0;
+    }
+    let mut next = first;
+    while time >= next {
+        if time < next + dt {
+            return volume / dt;
+        } else if interval <= 0.0 {
+            break;
+        } else {
+            next += interval;
+        }
+    }
+    0.0
+}
+
+/// Assert a wasm `Apply` result equals an exact f64 value (for the
+/// non-transcendental builtins, which the wasm reproduces bit-for-bit).
+fn assert_apply_exact(func: BuiltinId, a: f64, b: f64, c: f64, time: f64, dt: f64, want: f64) {
+    let got = apply_eval(func, a, b, c, time, dt);
+    if want.is_nan() {
+        assert!(got.is_nan(), "apply result expected NaN, got {got}");
+    } else {
+        assert_eq!(got, want, "apply({a},{b},{c},t={time},dt={dt})");
+    }
+}
+
+#[test]
+fn apply_abs_sqrt_int() {
+    assert_apply_exact(BuiltinId::Abs, -3.5, 0.0, 0.0, 0.0, 1.0, 3.5);
+    assert_apply_exact(BuiltinId::Abs, 3.5, 0.0, 0.0, 0.0, 1.0, 3.5);
+    assert_apply_exact(BuiltinId::Sqrt, 16.0, 0.0, 0.0, 0.0, 1.0, 4.0);
+    // Int is floor, NOT trunc: floor(-2.5) = -3 (trunc would give -2).
+    assert_apply_exact(BuiltinId::Int, -2.5, 0.0, 0.0, 0.0, 1.0, (-2.5f64).floor());
+    assert_apply_exact(BuiltinId::Int, 2.9, 0.0, 0.0, 0.0, 1.0, 2.0);
+    assert_apply_exact(BuiltinId::Int, -2.9, 0.0, 0.0, 0.0, 1.0, -3.0);
+}
+
+#[test]
+fn apply_min_max() {
+    assert_apply_exact(BuiltinId::Max, 3.0, 7.0, 0.0, 0.0, 1.0, 7.0);
+    assert_apply_exact(BuiltinId::Max, 7.0, 3.0, 0.0, 0.0, 1.0, 7.0);
+    assert_apply_exact(BuiltinId::Min, 3.0, 7.0, 0.0, 0.0, 1.0, 3.0);
+    assert_apply_exact(BuiltinId::Min, 7.0, 3.0, 0.0, 0.0, 1.0, 3.0);
+    assert_apply_exact(BuiltinId::Max, -1.0, -5.0, 0.0, 0.0, 1.0, -1.0);
+    assert_apply_exact(BuiltinId::Min, -1.0, -5.0, 0.0, 0.0, 1.0, -5.0);
+}
+
+#[test]
+fn apply_sign() {
+    assert_apply_exact(BuiltinId::Sign, 5.0, 0.0, 0.0, 0.0, 1.0, 1.0);
+    assert_apply_exact(BuiltinId::Sign, -5.0, 0.0, 0.0, 0.0, 1.0, -1.0);
+    // Sign(0) = 0 exactly (the VM's `else` branch).
+    assert_apply_exact(BuiltinId::Sign, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0);
+    assert_apply_exact(BuiltinId::Sign, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0);
+}
+
+#[test]
+fn apply_quantum() {
+    // q == 0 -> x (exact ==, returns a unchanged).
+    assert_apply_exact(BuiltinId::Quantum, 3.7, 0.0, 0.0, 0.0, 1.0, 3.7);
+    // q != 0 -> (x/q).trunc() * q.
+    assert_apply_exact(
+        BuiltinId::Quantum,
+        7.0,
+        2.0,
+        0.0,
+        0.0,
+        1.0,
+        (7.0f64 / 2.0).trunc() * 2.0,
+    );
+    assert_apply_exact(
+        BuiltinId::Quantum,
+        -7.0,
+        2.0,
+        0.0,
+        0.0,
+        1.0,
+        (-7.0f64 / 2.0).trunc() * 2.0,
+    );
+    assert_apply_exact(
+        BuiltinId::Quantum,
+        5.5,
+        0.5,
+        0.0,
+        0.0,
+        1.0,
+        (5.5f64 / 0.5).trunc() * 0.5,
+    );
+}
+
+#[test]
+fn apply_safe_div() {
+    // b != 0 -> a/b.
+    assert_apply_exact(BuiltinId::SafeDiv, 6.0, 3.0, 99.0, 0.0, 1.0, 2.0);
+    // b == 0 -> c (the default), via exact `!= 0.0`.
+    assert_apply_exact(BuiltinId::SafeDiv, 6.0, 0.0, 99.0, 0.0, 1.0, 99.0);
+    // A subnormal (non-zero) denominator still divides, NOT falls back.
+    let sub = f64::from_bits(1);
+    assert_apply_exact(BuiltinId::SafeDiv, 6.0, sub, 99.0, 0.0, 1.0, 6.0 / sub);
+    // -0.0 is == 0.0, so it falls back to c (matches the VM's `b != 0.0`).
+    assert_apply_exact(BuiltinId::SafeDiv, 6.0, -0.0, 99.0, 0.0, 1.0, 99.0);
+}
+
+#[test]
+fn apply_sshape() {
+    // b + (c-b)/(1 + exp(-4*(2a-1))), within the exp helper's tolerance.
+    for &a in &[0.0f64, 0.25, 0.5, 0.75, 1.0] {
+        let want = 2.0 + (8.0 - 2.0) / (1.0 + (-4.0 * (2.0 * a - 1.0)).exp());
+        let got = apply_eval(BuiltinId::Sshape, a, 2.0, 8.0, 0.0, 1.0);
+        assert!(
+            (got - want).abs() < 1e-9,
+            "Sshape({a}): got {got}, want {want}",
+        );
+    }
+}
+
+#[test]
+fn apply_transcendentals_match_libm() {
+    // Each transcendental Apply arm calls the Task 2 helper on `a`; assert
+    // it lands within the helpers' documented tolerance of Rust f64.
+    let close = |func: BuiltinId, a: f64, want: f64| {
+        let got = apply_eval(func, a, 0.0, 0.0, 0.0, 1.0);
+        assert!(
+            (got - want).abs() < 1e-8 || (got - want).abs() / want.abs() < 1e-8,
+            "{func:?}({a}): got {got}, want {want}",
+        );
+    };
+    close(BuiltinId::Exp, 1.5, 1.5f64.exp());
+    close(BuiltinId::Ln, 7.0, 7.0f64.ln());
+    close(BuiltinId::Log10, 1000.0, 3.0);
+    close(BuiltinId::Sin, 0.7, 0.7f64.sin());
+    close(BuiltinId::Cos, 0.7, 0.7f64.cos());
+    close(BuiltinId::Tan, 0.7, 0.7f64.tan());
+    close(BuiltinId::Arcsin, 0.5, 0.5f64.asin());
+    close(BuiltinId::Arccos, 0.5, 0.5f64.acos());
+    close(BuiltinId::Arctan, 2.0, 2.0f64.atan());
+}
+
+#[test]
+fn apply_step_across_breakpoint() {
+    // step(time, dt, height=a, step_time=b) = if time+dt/2 > b {a} else 0.
+    let dt = 0.5;
+    for &t in &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0] {
+        let want = vm_step(t, dt, 10.0, 3.0);
+        assert_apply_exact(BuiltinId::Step, 10.0, 3.0, 0.0, t, dt, want);
+    }
+}
+
+#[test]
+fn apply_ramp_across_breakpoints() {
+    // ramp(time, slope=a, start=b, end=c) over its three regimes.
+    for &t in &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0] {
+        let want = vm_ramp(t, 2.0, 2.0, 5.0);
+        assert_apply_exact(BuiltinId::Ramp, 2.0, 2.0, 5.0, t, 1.0, want);
+    }
+}
+
+#[test]
+fn apply_pulse_across_intervals() {
+    // pulse(time, dt, volume=a, first=b, interval=c) across several periods,
+    // including the no-repeat (interval == 0) case.
+    let dt = 1.0;
+    for &t in &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0] {
+        // Repeating pulse: volume 4, first at t=2, every 3.
+        assert_apply_exact(
+            BuiltinId::Pulse,
+            4.0,
+            2.0,
+            3.0,
+            t,
+            dt,
+            vm_pulse(t, dt, 4.0, 2.0, 3.0),
+        );
+        // Single pulse: interval 0 -> fires once at t in [first, first+dt).
+        assert_apply_exact(
+            BuiltinId::Pulse,
+            4.0,
+            2.0,
+            0.0,
+            t,
+            dt,
+            vm_pulse(t, dt, 4.0, 2.0, 0.0),
+        );
+    }
+}
+
+#[test]
+fn apply_inf_pi() {
+    assert_apply_exact(BuiltinId::Inf, 0.0, 0.0, 0.0, 0.0, 1.0, f64::INFINITY);
+    assert_apply_exact(BuiltinId::Pi, 0.0, 0.0, 0.0, 0.0, 1.0, std::f64::consts::PI);
+}
+
+#[test]
+fn apply_inside_if_does_not_clobber_condition() {
+    // An `Apply` in an If arm shares the function with the condition local;
+    // the dedicated apply locals must not collide. Build (codegen-padded
+    // Apply operands): `if cond then ABS(a) else f`, cond truthy.
+    let padded = vec![
+        Opcode::LoadConstant { id: 1 }, // a = -4 (the `then` operand)
+        Opcode::LoadConstant { id: 3 }, // pad b = 0
+        Opcode::LoadConstant { id: 3 }, // pad c = 0
+        Opcode::Apply {
+            func: BuiltinId::Abs,
+        }, // ABS(-4) = 4 -> the `then` value
+        Opcode::LoadConstant { id: 2 }, // f = 99
+        Opcode::LoadConstant { id: 0 }, // cond = 1 (truthy)
+        Opcode::SetCond {},
+        Opcode::If {},
+    ];
+    let got = run(
+        &bc(vec![1.0, -4.0, 99.0, 0.0], padded),
+        &ctx_with_cond_depth(1),
+        true,
+        1,
+        &[],
+        None,
+    );
+    assert_eq!(got, 4.0, "Apply in an If-then arm should yield ABS(-4)=4");
+}
+
+// ── max_condition_depth ───────────────────────────────────────────────
+
+#[test]
+fn max_condition_depth_counts_nesting() {
+    // Single If: depth 1.
+    let single = bc(vec![], vec![Opcode::SetCond {}, Opcode::If {}]);
+    assert_eq!(max_condition_depth(&single), 1);
+
+    // Two sequential Ifs: still depth 1 (LIFO, fully popped between).
+    let sequential = bc(
+        vec![],
+        vec![
+            Opcode::SetCond {},
+            Opcode::If {},
+            Opcode::SetCond {},
+            Opcode::If {},
+        ],
+    );
+    assert_eq!(max_condition_depth(&sequential), 1);
+
+    // Interleaved: SetCond, SetCond, If, If -> depth 2. Current codegen
+    // never emits this (it walks a condition to completion before its
+    // SetCond, so nested IFs come out sequentially); this guards the
+    // defensive stack-sizing against a future interleaved emission.
+    let nested = bc(
+        vec![],
+        vec![
+            Opcode::SetCond {},
+            Opcode::SetCond {},
+            Opcode::If {},
+            Opcode::If {},
+        ],
+    );
+    assert_eq!(max_condition_depth(&nested), 2);
+
+    // No conditions: depth 0.
+    let none = bc(vec![], vec![Opcode::LoadConstant { id: 0 }]);
+    assert_eq!(max_condition_depth(&none), 0);
+}
+
+// ════════════════════════════════════════════════════════════════════════
+// Phase 5 Task 1: temp-element reads (LoadTempConst / LoadTempDynamic)
+//
+// The compile-time view-descriptor stack + the static view ops' addressing
+// are pinned directly against the VM's `RuntimeView` in `views.rs`'s unit
+// tests (no wasm or reducer needed); here the LoadTemp opcodes -- which read
+// `temp_storage` and produce a value on the arithmetic stack -- are run under
+// DLR-FT to confirm the emitted reads hit the temp region the VM addresses.
+// ════════════════════════════════════════════════════════════════════════
+
+// Region base for the temp-storage reads: well past `next_base` (4096) so it
+// cannot overlap the curr/next chunks.
+const TEMP_BASE: u32 = 8192;
+
+/// Build an `EmitCtx` over a real `ByteCodeContext` (so the temp opcodes can
+/// resolve `temp_offsets`), with `temp_storage_base` set.
+fn ctx_with_arrays(context: &ByteCodeContext) -> EmitCtx<'_> {
+    EmitCtx {
+        temp_storage_base: TEMP_BASE,
+        ctx: context,
+        ..ctx_with_cond_depth(0)
+    }
+}
+
+#[test]
+fn load_temp_const_reads_temp_storage() {
+    // temp_offsets = [0, 4]; LoadTempConst{temp_id:1, index:2} reads
+    // temp_storage[4 + 2] = temp slot 6 (byte TEMP_BASE + 6*8).
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0, 4], 8);
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![Opcode::LoadTempConst {
+        temp_id: 1,
+        index: 2,
+    }];
+    let seed = vec![(u64::from(TEMP_BASE) + 6 * 8, 42.0)];
+    let got = run(&bc(vec![], code), &ctx, true, 0, &seed, None);
+    assert_eq!(got, 42.0);
+}
+
+#[test]
+fn load_temp_dynamic_reads_temp_storage() {
+    // LoadTempDynamic{temp_id:0} pops a runtime index (floor) and reads
+    // temp_storage[temp_offsets[0] + index]. Push index 3 via a constant.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 5);
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // index = 3.0
+        Opcode::LoadTempDynamic { temp_id: 0 },
+    ];
+    let seed = vec![(u64::from(TEMP_BASE) + 3 * 8, 77.0)];
+    let got = run(&bc(vec![3.0], code), &ctx, true, 0, &seed, None);
+    assert_eq!(got, 77.0);
+}
+
+#[test]
+fn load_temp_dynamic_floors_fractional_index() {
+    // The VM does `stack.pop().floor() as usize`; index 2.9 -> slot 2.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 4);
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadTempDynamic { temp_id: 0 },
+    ];
+    let seed = vec![(u64::from(TEMP_BASE) + 2 * 8, 13.0)];
+    let got = run(&bc(vec![2.9], code), &ctx, true, 0, &seed, None);
+    assert_eq!(got, 13.0);
+}
+
+// ════════════════════════════════════════════════════════════════════════
+// Phase 5 Task 2: array reducers (Sum/Max/Min/Mean/Stddev/Size)
+//
+// These run the emitted reducers under DLR-FT and assert the result matches
+// the VM's own addressing oracle (`RuntimeView::flat_offset`, via
+// `StaticArrayView::to_runtime_view`) folded per the matching VM reducer arm
+// (`vm.rs:2216-2309`). The view transform opcodes the production codegen does
+// not emit directly (it bakes constant subscripts into one `PushStaticView`)
+// are exercised here on a `PushVarView` base so each `apply_*` is reduced
+// over and checked against the VM. Reuses `TEMP_BASE` / `ctx_with_arrays`
+// from the Task 1 section above.
+// ════════════════════════════════════════════════════════════════════════
+
+use crate::bytecode::{
+    DimensionInfo, RuntimeSparseMapping, RuntimeView, StaticArrayView, SubdimensionRelation,
+};
+use smallvec::SmallVec;
+
+fn seed_run(base_byte: u64, values: &[f64]) -> Vec<(u64, f64)> {
+    values
+        .iter()
+        .enumerate()
+        .map(|(i, &v)| (base_byte + (i as u64) * 8, v))
+        .collect()
+}
+
+/// Read element `iter_idx` of `view` from a flat slab `data` indexed by slot,
+/// using the VM's own addressing (`to_runtime_view().flat_offset`). The
+/// addressing oracle for every reducer parity check.
+fn vm_view_element(view: &StaticArrayView, data: &[f64], iter_idx: usize) -> f64 {
+    let rv = view.to_runtime_view();
+    let n = rv.dims.len();
+    let mut indices: SmallVec<[u16; 4]> = smallvec::smallvec![0; n];
+    let mut remaining = iter_idx;
+    for d in (0..n).rev() {
+        let dim = rv.dims[d] as usize;
+        indices[d] = (remaining % dim) as u16;
+        remaining /= dim;
+    }
+    let flat = rv.flat_offset(&indices);
+    data[rv.base_off as usize + flat]
+}
+
+/// The VM's expected `ArraySum` over `view`'s elements drawn from `data`.
+fn vm_sum(view: &StaticArrayView, data: &[f64]) -> f64 {
+    (0..view.to_runtime_view().size())
+        .map(|i| vm_view_element(view, data, i))
+        .sum()
+}
+
+fn dense_view(base_off: u32, dims: &[u16]) -> StaticArrayView {
+    // Row-major strides for a dense contiguous array.
+    let mut strides: SmallVec<[i32; 4]> = SmallVec::new();
+    let mut s = 1i32;
+    for &d in dims.iter().rev() {
+        strides.push(s);
+        s *= d as i32;
+    }
+    strides.reverse();
+    StaticArrayView {
+        base_off,
+        is_temp: false,
+        dims: dims.iter().copied().collect(),
+        strides,
+        offset: 0,
+        sparse: SmallVec::new(),
+        dim_ids: dims.iter().map(|_| 0u16).collect(),
+    }
+}
+
+/// Compile+run `PushStaticView(view); <reduce>; PopView` over a `curr` array
+/// seeded from `data` (slot 0 of curr is byte 0).
+fn run_static_reduce(view: StaticArrayView, reduce: Opcode, data: &[f64]) -> f64 {
+    let mut context = ByteCodeContext::default();
+    let view_id = context.add_static_view(view);
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::PushStaticView { view_id },
+        reduce,
+        Opcode::PopView {},
+    ];
+    run(&bc(vec![], code), &ctx, true, 0, &seed_run(0, data), None)
+}
+// ── Task 1: PushStaticView addressing across geometries ───────────────
+
+#[test]
+fn static_view_sum_contiguous_matches_vm() {
+    // A bare 1-D contiguous view over curr slots 0..4.
+    let data = [10.0, 20.0, 30.0, 40.0];
+    let view = dense_view(0, &[4]);
+    let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data);
+    assert_eq!(got, vm_sum(&view, &data));
+    assert_eq!(got, 100.0);
+}
+
+#[test]
+fn static_view_sum_with_offset_matches_vm() {
+    // A range slice source[3:5] over a 5-element array bakes into `offset=2`
+    // (0-based start), dims=[3]. Elements are data[2], data[3], data[4].
+    let data = [1.0, 2.0, 3.0, 4.0, 5.0];
+    let mut view = dense_view(0, &[3]);
+    view.offset = 2;
+    let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data);
+    assert_eq!(got, vm_sum(&view, &data));
+    assert_eq!(got, 3.0 + 4.0 + 5.0);
+}
+
+#[test]
+fn static_view_sum_transposed_strides_matches_vm() {
+    // A 2x3 matrix stored row-major (strides [3,1]) transposed to dims [3,2]
+    // with strides [1,3] -- non-contiguous, so the strided flat_offset path
+    // is exercised. Data laid out row-major: m[r,c] = data[r*3 + c].
+    let data = [11.0, 12.0, 13.0, 21.0, 22.0, 23.0];
+    let view = StaticArrayView {
+        base_off: 0,
+        is_temp: false,
+        dims: SmallVec::from_slice(&[3, 2]),
+        strides: SmallVec::from_slice(&[1, 3]),
+        offset: 0,
+        sparse: SmallVec::new(),
+        dim_ids: SmallVec::from_slice(&[0, 0]),
+    };
+    assert!(!view.to_runtime_view().is_contiguous());
+    let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data);
+    // Sum is order-independent and covers all six cells regardless.
+    assert_eq!(got, vm_sum(&view, &data));
+    assert_eq!(got, 11.0 + 12.0 + 13.0 + 21.0 + 22.0 + 23.0);
+}
+
+#[test]
+fn static_view_max_transposed_picks_right_cells() {
+    // Max over the transposed view must read the same cells the VM reads.
+    // Make one cell dominate so a mis-addressed read would change the max.
+    let data = [11.0, 12.0, 99.0, 21.0, 22.0, 23.0];
+    let view = StaticArrayView {
+        base_off: 0,
+        is_temp: false,
+        dims: SmallVec::from_slice(&[3, 2]),
+        strides: SmallVec::from_slice(&[1, 3]),
+        offset: 0,
+        sparse: SmallVec::new(),
+        dim_ids: SmallVec::from_slice(&[0, 0]),
+    };
+    let got = run_static_reduce(view, Opcode::ArrayMax {}, &data);
+    assert_eq!(got, 99.0);
+}
+
+#[test]
+fn static_view_sum_sparse_matches_vm() {
+    // A sparse (star-range) view selecting elements at parent offsets [0, 2]
+    // of a 4-element array: dims=[2], a RuntimeSparseMapping mapping view
+    // index 0->parent 0, 1->parent 2. Elements are data[0], data[2].
+    let data = [5.0, 6.0, 7.0, 8.0];
+    let view = StaticArrayView {
+        base_off: 0,
+        is_temp: false,
+        dims: SmallVec::from_slice(&[2]),
+        strides: SmallVec::from_slice(&[1]),
+        offset: 0,
+        sparse: smallvec::smallvec![RuntimeSparseMapping {
+            dim_index: 0,
+            parent_offsets: SmallVec::from_slice(&[0, 2]),
+        }],
+        dim_ids: SmallVec::from_slice(&[0]),
+    };
+    let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data);
+    assert_eq!(got, vm_sum(&view, &data));
+    assert_eq!(got, 5.0 + 7.0);
+}
+
+#[test]
+fn static_temp_view_sum_reads_temp_storage() {
+    // A contiguous temp view (is_temp) reads temp_storage, not curr. temp_id
+    // 0 lives at temp_offsets[0]=0, so its slot 0 is byte TEMP_BASE.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 3);
+    let view = StaticArrayView {
+        base_off: 0, // temp_id 0
+        is_temp: true,
+        dims: SmallVec::from_slice(&[3]),
+        strides: SmallVec::from_slice(&[1]),
+        offset: 0,
+        sparse: SmallVec::new(),
+        dim_ids: SmallVec::from_slice(&[0]),
+    };
+    let view_id = context.add_static_view(view);
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::PushStaticView { view_id },
+        Opcode::ArraySum {},
+        Opcode::PopView {},
+    ];
+    // Seed curr slots 0..3 with decoys and temp_storage with the real data;
+    // a read from the wrong region would pick up the decoys.
+    let mut seed = seed_run(0, &[100.0, 200.0, 300.0]);
+    seed.extend(seed_run(u64::from(TEMP_BASE), &[2.0, 3.0, 4.0]));
+    let got = run(&bc(vec![], code), &ctx, true, 0, &seed, None);
+    assert_eq!(got, 9.0, "temp view must read temp_storage, not curr");
+}
+
+#[test]
+fn static_temp_view_honors_temp_offset() {
+    // temp_id 1 lives at temp_offsets[1]=4, so its slot 0 is byte
+    // TEMP_BASE + 4*8. A reducer over it must skip temp 0's slots.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0, 4], 6);
+    let view = StaticArrayView {
+        base_off: 1, // temp_id 1
+        is_temp: true,
+        dims: SmallVec::from_slice(&[2]),
+        strides: SmallVec::from_slice(&[1]),
+        offset: 0,
+        sparse: SmallVec::new(),
+        dim_ids: SmallVec::from_slice(&[0]),
+    };
+    let view_id = context.add_static_view(view);
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::PushStaticView { view_id },
+        Opcode::ArraySum {},
+        Opcode::PopView {},
+    ];
+    // temp_storage: [t0_0, t0_1, t0_2, t0_3, t1_0, t1_1] = [9,9,9,9, 2, 5].
+    let seed = seed_run(u64::from(TEMP_BASE), &[9.0, 9.0, 9.0, 9.0, 2.0, 5.0]);
+    let got = run(&bc(vec![], code), &ctx, true, 0, &seed, None);
+    assert_eq!(got, 7.0, "temp view must start at temp_offsets[temp_id]");
+}
+
+// ── Task 1: view transform opcodes (mirror RuntimeView::apply_*) ──────
+//
+// Build a full var view with PushVarView, apply one transform, reduce, and
+// compare to the VM's RuntimeView with the same transform applied. These are
+// the opcodes production codegen bakes into a single PushStaticView, so they
+// are exercised here directly to pin each `apply_*` against the VM.
+
+/// A `ByteCodeContext` with a single dimension of `size` (DimId 0) and a
+/// dim-list `[DimId 0]` (DimListId 0) for a 1-D `PushVarView`.
+fn ctx_one_dim(size: u16) -> ByteCodeContext {
+    let mut context = ByteCodeContext::default();
+    let name_id = context.intern_name("D");
+    context.add_dimension(DimensionInfo::indexed(name_id, size));
+    context.add_dim_list(1, [0, 0, 0, 0]);
+    context
+}
+
+/// Run `PushVarView(base 0, dims) ; <transforms> ; <reduce> ; PopView` and
+/// also build the VM `RuntimeView` the same way for the addressing oracle.
+fn run_var_view_reduce(
+    context: &ByteCodeContext,
+    transforms: &[Opcode],
+    reduce: Opcode,
+    data: &[f64],
+) -> f64 {
+    let ctx = ctx_with_arrays(context);
+    let mut code = vec![Opcode::PushVarView {
+        base_off: 0,
+        dim_list_id: 0,
+    }];
+    code.extend_from_slice(transforms);
+    code.push(reduce);
+    code.push(Opcode::PopView {});
+    run(&bc(vec![], code), &ctx, true, 0, &seed_run(0, data), None)
+}
+
+#[test]
+fn view_subscript_const_drops_dim_matches_vm() {
+    // A 2x3 matrix; subscript dim 0 to index 1 (0-based) -> row 1: cells
+    // data[3], data[4], data[5]. Mirror with RuntimeView.
+    let mut context = ByteCodeContext::default();
+    let name_d = context.intern_name("D");
+    context.add_dimension(DimensionInfo::indexed(name_d, 2));
+    let name_e = context.intern_name("E");
+    context.add_dimension(DimensionInfo::indexed(name_e, 3));
+    context.add_dim_list(2, [0, 1, 0, 0]); // [DimId 0 (size2), DimId 1 (size3)]
+    let data = [11.0, 12.0, 13.0, 21.0, 22.0, 23.0];
+
+    let got = run_var_view_reduce(
+        &context,
+        &[Opcode::ViewSubscriptConst {
+            dim_idx: 0,
+            index: 1,
+        }],
+        Opcode::ArraySum {},
+        &data,
+    );
+    // VM oracle: build the same RuntimeView and apply the same subscript.
+    let mut rv = RuntimeView::for_var(
+        0,
+        SmallVec::from_slice(&[2, 3]),
+        SmallVec::from_slice(&[0, 1]),
+    );
+    rv.apply_single_subscript(0, 1);
+    let want: f64 = (0..rv.size())
+        .map(|i| {
+            let n = rv.dims.len();
+            let mut idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; n];
+            let mut rem = i;
+            for d in (0..n).rev() {
+                idx[d] = (rem % rv.dims[d] as usize) as u16;
+                rem /= rv.dims[d] as usize;
+            }
+            data[rv.base_off as usize + rv.flat_offset(&idx)]
+        })
+        .sum();
+    assert_eq!(got, want);
+    assert_eq!(got, 21.0 + 22.0 + 23.0);
+}
+
+#[test]
+fn view_range_matches_vm() {
+    // 1-D dim of 5; ViewRange [1:4) keeps indices 1,2,3 -> data[1..4].
+    let context = ctx_one_dim(5);
+    let data = [1.0, 2.0, 3.0, 4.0, 5.0];
+    let got = run_var_view_reduce(
+        &context,
+        &[Opcode::ViewRange {
+            dim_idx: 0,
+            start: 1,
+            end: 4,
+        }],
+        Opcode::ArraySum {},
+        &data,
+    );
+    assert_eq!(got, 2.0 + 3.0 + 4.0);
+}
+
+#[test]
+fn view_wildcard_is_noop() {
+    // ViewWildcard leaves the dimension as-is: the sum is the full array.
+    let context = ctx_one_dim(4);
+    let data = [1.0, 2.0, 3.0, 4.0];
+    let got = run_var_view_reduce(
+        &context,
+        &[Opcode::ViewWildcard { dim_idx: 0 }],
+        Opcode::ArraySum {},
+        &data,
+    );
+    assert_eq!(got, 10.0);
+}
+
+#[test]
+fn view_transpose_then_reduce_matches_vm() {
+    // 2x3 matrix; transpose to 3x2 then sum (order-independent but exercises
+    // the stride/dim reversal addressing).
+    let mut context = ByteCodeContext::default();
+    let name_d = context.intern_name("D");
+    context.add_dimension(DimensionInfo::indexed(name_d, 2));
+    let name_e = context.intern_name("E");
+    context.add_dimension(DimensionInfo::indexed(name_e, 3));
+    context.add_dim_list(2, [0, 1, 0, 0]);
+    let data = [11.0, 12.0, 13.0, 21.0, 22.0, 23.0];
+    let got = run_var_view_reduce(
+        &context,
+        &[Opcode::ViewTranspose {}],
+        Opcode::ArraySum {},
+        &data,
+    );
+    assert_eq!(got, 11.0 + 12.0 + 13.0 + 21.0 + 22.0 + 23.0);
+}
+
+#[test]
+fn view_star_range_sparse_matches_vm() {
+    // A 1-D parent dim of 4; a star-range via a subdim relation selecting
+    // parent offsets [1, 3] -> sum of data[1] + data[3].
+    let mut context = ByteCodeContext::default();
+    let name_p = context.intern_name("P");
+    context.add_dimension(DimensionInfo::indexed(name_p, 4));
+    let name_s = context.intern_name("S");
+    context.add_dimension(DimensionInfo::indexed(name_s, 2)); // child dim
+    context.add_dim_list(1, [0, 0, 0, 0]); // parent dim list
+    context.add_subdim_relation(SubdimensionRelation::sparse(
+        0,
+        1,
+        SmallVec::from_slice(&[1, 3]),
+    ));
+    let data = [5.0, 6.0, 7.0, 8.0];
+    let got = run_var_view_reduce(
+        &context,
+        &[Opcode::ViewStarRange {
+            dim_idx: 0,
+            subdim_relation_id: 0,
+        }],
+        Opcode::ArraySum {},
+        &data,
+    );
+    assert_eq!(got, 6.0 + 8.0);
+}
+
+#[test]
+fn dup_view_then_reduce_matches_single() {
+    // DupView duplicates the top descriptor; reducing the dup gives the same
+    // result as reducing the original (and the original stays on the stack).
+    let context = ctx_one_dim(3);
+    let data = [2.0, 3.0, 5.0];
+    let got = run_var_view_reduce(&context, &[Opcode::DupView {}], Opcode::ArraySum {}, &data);
+    assert_eq!(got, 10.0);
+    // The duplicate must leave the stack balanced for the trailing PopView;
+    // a second PopView would underflow, so add one more here to drain the
+    // dup and confirm both pops succeed.
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::PushVarView {
+            base_off: 0,
+            dim_list_id: 0,
+        },
+        Opcode::DupView {},
+        Opcode::ArraySum {},
+        Opcode::PopView {}, // pop dup
+        Opcode::PopView {}, // pop original
+    ];
+    let got2 = run(&bc(vec![], code), &ctx, true, 0, &seed_run(0, &data), None);
+    assert_eq!(got2, 10.0);
+}
+
+// ── Task 2: each reducer vs an explicit VM-mirrored oracle ────────────
+
+/// Sum/Max/Min/Mean/Stddev/Size oracle over a contiguous element slice,
+/// mirroring the VM's per-reducer arms (`vm.rs:2216-2309`) exactly.
+fn reducer_oracle(op: &Opcode, elems: &[f64]) -> f64 {
+    let size = elems.len();
+    match op {
+        Opcode::ArraySum {} => elems.iter().sum(),
+        Opcode::ArraySize {} => size as f64,
+        _ if size == 0 => f64::NAN,
+        Opcode::ArrayMax {} => elems
+            .iter()
+            .copied()
+            .fold(f64::NEG_INFINITY, |a, v| if v > a { v } else { a }),
+        Opcode::ArrayMin {} => elems
+            .iter()
+            .copied()
+            .fold(f64::INFINITY, |a, v| if v < a { v } else { a }),
+        Opcode::ArrayMean {} => elems.iter().sum::<f64>() / size as f64,
+        Opcode::ArrayStddev {} => {
+            let mean = elems.iter().sum::<f64>() / size as f64;
+            let var = elems.iter().map(|v| (v - mean).powf(2.0)).sum::<f64>() / size as f64;
+            var.sqrt()
+        }
+        _ => unreachable!(),
+    }
+}
+
+fn assert_reducer_matches(op: Opcode, elems: &[f64]) {
+    // A bare contiguous 1-D static view over the data.
+    let data: Vec<f64> = elems.to_vec();
+    let view = dense_view(0, &[elems.len() as u16]);
+    let got = run_static_reduce(view, op, &data);
+    let want = reducer_oracle(&op, elems);
+    if want.is_nan() {
+        assert!(got.is_nan(), "{}: expected NaN, got {got}", op.name());
+    } else {
+        assert!(
+            (got - want).abs() < 1e-12,
+            "{}: got {got}, want {want}",
+            op.name()
+        );
+    }
+}
+
+#[test]
+fn reducer_sum_matches_vm() {
+    assert_reducer_matches(Opcode::ArraySum {}, &[1.0, 2.0, 3.0, 4.5]);
+}
+
+#[test]
+fn reducer_max_matches_vm() {
+    assert_reducer_matches(Opcode::ArrayMax {}, &[3.0, -1.0, 7.5, 2.0]);
+    // Negative-only set: max stays negative (init NEG_INFINITY never wins).
+    assert_reducer_matches(Opcode::ArrayMax {}, &[-5.0, -2.0, -9.0]);
+}
+
+#[test]
+fn reducer_min_matches_vm() {
+    assert_reducer_matches(Opcode::ArrayMin {}, &[3.0, -1.0, 7.5, 2.0]);
+    assert_reducer_matches(Opcode::ArrayMin {}, &[5.0, 2.0, 9.0]);
+}
+
+#[test]
+fn reducer_mean_matches_vm() {
+    assert_reducer_matches(Opcode::ArrayMean {}, &[2.0, 4.0, 6.0]);
+    assert_reducer_matches(Opcode::ArrayMean {}, &[1.0, 2.0]);
+}
+
+#[test]
+fn reducer_stddev_matches_vm_population_variance() {
+    // Population variance (divisor N): for [2,4,4,4,5,5,7,9] the population
+    // stddev is exactly 2.0 -- a value check, not just parity, pinning the
+    // divisor-N (not N-1) choice that matches `vm.rs::ArrayStddev`.
+    let elems = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
+    assert_reducer_matches(Opcode::ArrayStddev {}, &elems);
+    let view = dense_view(0, &[elems.len() as u16]);
+    let got = run_static_reduce(view, Opcode::ArrayStddev {}, &elems);
+    assert!(
+        (got - 2.0).abs() < 1e-12,
+        "population stddev should be 2.0, got {got}"
+    );
+}
+
+#[test]
+fn reducer_size_matches_vm() {
+    assert_reducer_matches(Opcode::ArraySize {}, &[1.0, 2.0, 3.0]);
+}
+
+#[test]
+fn reducer_size_multidim_is_product() {
+    // SIZE over a 2x3 view is 6, regardless of the data.
+    let data = [0.0; 6];
+    let view = dense_view(0, &[2, 3]);
+    let got = run_static_reduce(view, Opcode::ArraySize {}, &data);
+    assert_eq!(got, 6.0);
+}
+
+// ── Task 2: empty-but-valid view asymmetry (AC1.5) ────────────────────
+
+/// An empty-but-valid view: a `[start:start)` range collapses dim 0 to size
+/// 0 (`apply_range_checked`), valid with zero elements. Built as a static
+/// view with a zero-size dimension.
+fn empty_static_view() -> StaticArrayView {
+    StaticArrayView {
+        base_off: 0,
+        is_temp: false,
+        dims: SmallVec::from_slice(&[0]),
+        strides: SmallVec::from_slice(&[1]),
+        offset: 0,
+        sparse: SmallVec::new(),
+        dim_ids: SmallVec::from_slice(&[0]),
+    }
+}
+
+#[test]
+fn empty_valid_view_sum_is_zero() {
+    // ArraySum over an empty-but-valid view is the additive identity 0.0
+    // (`vm.rs:2216`), NOT NaN.
+    let got = run_static_reduce(empty_static_view(), Opcode::ArraySum {}, &[1.0]);
+    assert_eq!(got, 0.0);
+}
+
+#[test]
+fn empty_valid_view_max_min_mean_stddev_are_nan() {
+    for op in [
+        Opcode::ArrayMax {},
+        Opcode::ArrayMin {},
+        Opcode::ArrayMean {},
+        Opcode::ArrayStddev {},
+    ] {
+        let got = run_static_reduce(empty_static_view(), op, &[1.0]);
+        assert!(
+            got.is_nan(),
+            "{}: empty-but-valid view must be NaN",
+            op.name()
+        );
+    }
+}
+
+#[test]
+fn empty_valid_view_size_is_zero() {
+    let got = run_static_reduce(empty_static_view(), Opcode::ArraySize {}, &[1.0]);
+    assert_eq!(got, 0.0);
+}
+
+// ── Task 2: invalid view -> NaN for ALL reducers (AC1.5) ──────────────
+//
+// A static view is always valid (`valid_local` is None), so an invalid view
+// is modeled by directly setting `valid_local` to a wasm i32 local seeded to
+// 0 -- mirroring what Task 4's out-of-bounds dynamic subscript will produce.
+// Every reducer (including ArraySum) must yield NaN, matching `reduce_view`'s
+// leading `if !is_valid { return NaN }`.
+
+/// Run a reducer over a contiguous static view whose `valid_local` is forced
+/// to an i32 local pre-set to 0 (invalid). The harness function reserves the
+/// three Apply f64 scratch locals; we add one i32 local after them for the
+/// validity flag and initialize it to 0 in the emitted prologue.
+fn run_invalid_view_reduce(reduce: Opcode) -> f64 {
+    let mut context = ByteCodeContext::default();
+    // Contiguous 1-D view over 3 curr slots; geometry is valid, but the
+    // view is flagged invalid.
+    let view = dense_view(0, &[3]);
+    let view_id = context.add_static_view(view);
+
+    // Build a custom module: the opcode function declares an extra i32 local
+    // (index after the standard opcode-fn locals) for the validity flag,
+    // seeded to 0. We mark the descriptor invalid by post-processing is out
+    // of reach here, so instead emit the program through a small shim that
+    // sets `valid_local` on the pushed descriptor.
+    //
+    // Simpler: emit PushStaticView, then a hand-rolled reduce over a desc
+    // with valid_local set, by calling emit_array_reduce directly.
+    let ctx = EmitCtx {
+        temp_storage_base: TEMP_BASE,
+        ctx: &context,
+        ..ctx_with_cond_depth(0)
+    };
+
+    // The validity i32 local index: it is the first index past every standard
+    // opcode-fn local (the scratch f64, the cond i32s, the Apply f64s, and the
+    // Phase-6 vector-op f64/i32 scratch blocks), i.e. exactly where the
+    // dynamic-subscript "extra i32" locals begin. The shim below pushes a single
+    // i32 local at that index for the validity flag.
+    let valid_local = extra_i32_local_base(0, 0);
+
+    let mut module = Module::new();
+    let helpers = build_helpers();
+    let n_helpers = helpers.functions.len() as u32;
+    let mut types = TypeSection::new();
+    types.ty().function([ValType::I32], [ValType::F64]); // eval -> f64
+    for hf in &helpers.functions {
+        types.ty().function(hf.params.clone(), hf.results.clone());
+    }
+    module.section(&types);
+    let mut functions = FunctionSection::new();
+    for (i, _) in helpers.functions.iter().enumerate() {
+        functions.function(1 + i as u32);
+    }
+    functions.function(0);
+    module.section(&functions);
+    let mut memories = MemorySection::new();
+    memories.memory(MemoryType {
+        minimum: 1,
+        maximum: None,
+        memory64: false,
+        shared: false,
+        page_size_log2: None,
+    });
+    module.section(&memories);
+    let mut exports = ExportSection::new();
+    exports.export("eval", ExportKind::Func, n_helpers);
+    exports.export("mem", ExportKind::Memory, 0);
+    module.section(&exports);
+
+    let mut code = CodeSection::new();
+    for hf in helpers.functions {
+        code.function(&hf.body);
+    }
+    // opcode-fn locals plus one extra i32 for the validity flag.
+    let mut locals = opcode_fn_locals(0, 0, 0);
+    locals.push((1, ValType::I32));
+    let mut func = Function::new(locals);
+    // valid_local = 0 (invalid).
+    func.instruction(&Instruction::I32Const(0));
+    func.instruction(&Instruction::LocalSet(valid_local));
+    // Reduce over a desc built from the registered static view, but with its
+    // `valid_local` forced to the (zero-seeded) validity flag -- exactly the
+    // shape Task 4's out-of-bounds dynamic subscript will produce.
+    let mut desc = ViewDesc::from_static(ctx.ctx.get_static_view(view_id).unwrap());
+    desc.valid_local = Some(valid_local);
+    emit_array_reduce(&reduce, &desc, &ctx, &mut func).expect("reduce lowers");
+    func.instruction(&Instruction::End);
+    code.function(&func);
+    module.section(&code);
+
+    let bytes = module.finish();
+    let info = validate(&bytes).expect("invalid-view module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    // Seed the curr slots so a (wrongly) valid read would produce a finite
+    // value -- making the NaN assertion meaningful.
+    let mem = store
+        .instance_export(inst, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    store.mem_access_mut_slice(mem, |b| {
+        for (i, v) in [1.0f64, 2.0, 3.0].iter().enumerate() {
+            let a = i * 8;
+            b[a..a + 8].copy_from_slice(&v.to_le_bytes());
+        }
+    });
+    let eval = store
+        .instance_export(inst, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store.invoke_simple_typed(eval, (0_i32,)).expect("invoke")
+}
+
+#[test]
+fn invalid_view_all_reducers_are_nan() {
+    // Every reducer over an invalid view is NaN -- including ArraySum, whose
+    // empty-but-valid result is 0.0 but whose invalid-view result is NaN.
+    for op in [
+        Opcode::ArraySum {},
+        Opcode::ArrayMax {},
+        Opcode::ArrayMin {},
+        Opcode::ArrayMean {},
+        Opcode::ArrayStddev {},
+    ] {
+        let got = run_invalid_view_reduce(op);
+        assert!(
+            got.is_nan(),
+            "{}: an invalid view must reduce to NaN, got {got}",
+            op.name()
+        );
+    }
+}
+
+#[test]
+fn invalid_view_size_is_still_the_size() {
+    // ArraySize is defined regardless of validity (`vm.rs:2306` reads
+    // `view.size()` with no validity gate), so an invalid 3-element view
+    // still reports size 3.
+    let got = run_invalid_view_reduce(Opcode::ArraySize {});
+    assert_eq!(got, 3.0);
+}
+
+// ════════════════════════════════════════════════════════════════════════
+// Phase 5 Task 3: iteration loops (BeginIter..EndIter) + broadcast
+//
+// The body span between `BeginIter` and `NextIterOrJump` is fully unrolled
+// over the compile-time `size()`, so each iteration's reads/writes are
+// emitted at constant addresses (mirroring the array reducer's unrolled fold
+// and the VM element-for-element). These hand-build the canonical codegen
+// shape (`PushStaticView(out); BeginIter; PushStaticView(src); <body>;
+// NextIterOrJump; EndIter; PopView; ...`) and run it under DLR-FT, reading
+// the written temp slots back and comparing to a VM-mirrored oracle.
+// ════════════════════════════════════════════════════════════════════════
+
+/// A contiguous temp `StaticArrayView` over `dims` at `temp_id`.
+fn temp_view(temp_id: u32, dims: &[u16]) -> StaticArrayView {
+    let mut v = dense_view(temp_id, dims);
+    v.is_temp = true;
+    v
+}
+
+/// A contiguous temp `StaticArrayView` carrying explicit `dim_ids` (for the
+/// broadcast-matching tests).
+fn dense_view_ids(base_off: u32, dims: &[u16], dim_ids: &[u16]) -> StaticArrayView {
+    let mut v = dense_view(base_off, dims);
+    v.dim_ids = dim_ids.iter().copied().collect();
+    v
+}
+
+/// Read `count` temp slots (starting at temp slot 0) back after running a
+/// temp-writing program. The temp region base is `TEMP_BASE`.
+fn run_and_read_temps(
+    context: &ByteCodeContext,
+    code: Vec<Opcode>,
+    literals: Vec<f64>,
+    seed: &[(u64, f64)],
+    count: usize,
+) -> Vec<f64> {
+    let ctx = ctx_with_arrays(context);
+    let bytes = build_module(&bc(literals, code), &ctx, false, 0);
+    let info = validate(&bytes).expect("emitted module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    if !seed.is_empty() {
+        let mem = store
+            .instance_export(inst, "mem")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        store.mem_access_mut_slice(mem, |b| {
+            for &(addr, v) in seed {
+                let a = addr as usize;
+                b[a..a + 8].copy_from_slice(&v.to_le_bytes());
+            }
+        });
+    }
+    let eval = store
+        .instance_export(inst, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+        .expect("invoke");
+    let mem = store
+        .instance_export(inst, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    store.mem_access_mut_slice(mem, |b| {
+        (0..count)
+            .map(|i| {
+                let a = TEMP_BASE as usize + i * 8;
+                f64::from_le_bytes(b[a..a + 8].try_into().unwrap())
+            })
+            .collect()
+    })
+}
+
+#[test]
+fn iter_loop_elementwise_writes_temp_like_vm() {
+    // out_temp[i] = source[i] * 2 over a 4-element source in curr, written to
+    // temp 0. Mirrors the codegen shape: output temp view drives iteration,
+    // the source view is pushed inside, read via LoadIterViewAt{1}.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 4); // temp 0 spans 4 slots
+    let out_view = context.add_static_view(temp_view(0, &[4]));
+    let src_view = context.add_static_view(dense_view(0, &[4]));
+    let code = vec![
+        Opcode::PushStaticView { view_id: out_view },
+        Opcode::BeginIter {
+            write_temp_id: 0,
+            has_write_temp: true,
+        },
+        Opcode::PushStaticView { view_id: src_view },
+        Opcode::LoadIterViewAt { offset: 1 },
+        Opcode::LoadConstant { id: 0 },
+        Opcode::Op2 { op: Op2::Mul },
+        Opcode::StoreIterElement {},
+        Opcode::NextIterOrJump { jump_back: -4 },
+        Opcode::EndIter {},
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    // source = [10, 20, 30, 40] in curr slots 0..4.
+    let seed = seed_run(0, &[10.0, 20.0, 30.0, 40.0]);
+    let temps = run_and_read_temps(&context, code, vec![2.0], &seed, 4);
+    assert_eq!(temps, vec![20.0, 40.0, 60.0, 80.0]);
+}
+
+#[test]
+fn iter_loop_load_iter_element_reads_captured_view() {
+    // out_temp[i] = iter_view[i] (the captured view *is* the iteration view).
+    // Here the captured view is the OUTPUT temp itself, so seed the temp and
+    // copy it to itself -- a degenerate but faithful LoadIterElement check.
+    // Use a separate source temp captured as the iter view instead: push a
+    // source temp view, BeginIter captures it, LoadIterElement reads it, and
+    // StoreIterElement writes the *same* temp's slots (write_temp == source).
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 3);
+    let src = context.add_static_view(temp_view(0, &[3]));
+    let code = vec![
+        Opcode::PushStaticView { view_id: src },
+        Opcode::BeginIter {
+            write_temp_id: 0,
+            has_write_temp: true,
+        },
+        Opcode::LoadIterElement {},
+        Opcode::LoadConstant { id: 0 },
+        Opcode::Op2 { op: Op2::Add },
+        Opcode::StoreIterElement {},
+        Opcode::NextIterOrJump { jump_back: -4 },
+        Opcode::EndIter {},
+        Opcode::PopView {},
+    ];
+    // temp 0 = [1, 2, 3]; each += 5 in place -> [6, 7, 8].
+    let seed = seed_run(u64::from(TEMP_BASE), &[1.0, 2.0, 3.0]);
+    let temps = run_and_read_temps(&context, code, vec![5.0], &seed, 3);
+    assert_eq!(temps, vec![6.0, 7.0, 8.0]);
+}
+
+#[test]
+fn iter_loop_load_iter_temp_element_reads_temp() {
+    // out_temp1[i] = temp0[i] + 100, reading temp0 via LoadIterTempElement and
+    // writing temp1. temp_offsets = [0, 3]: temp0 in slots 0..3, temp1 in 3..6.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0, 3], 6);
+    let out_view = context.add_static_view(temp_view(1, &[3])); // temp 1
+    let code = vec![
+        Opcode::PushStaticView { view_id: out_view },
+        Opcode::BeginIter {
+            write_temp_id: 1,
+            has_write_temp: true,
+        },
+        Opcode::LoadIterTempElement { temp_id: 0 },
+        Opcode::LoadConstant { id: 0 },
+        Opcode::Op2 { op: Op2::Add },
+        Opcode::StoreIterElement {},
+        Opcode::NextIterOrJump { jump_back: -4 },
+        Opcode::EndIter {},
+        Opcode::PopView {},
+    ];
+    // temp0 = [7, 8, 9] in slots 0..3.
+    let seed = seed_run(u64::from(TEMP_BASE), &[7.0, 8.0, 9.0]);
+    // Read 6 temp slots: temp1 is slots 3..6.
+    let temps = run_and_read_temps(&context, code, vec![100.0], &seed, 6);
+    assert_eq!(&temps[3..6], &[107.0, 108.0, 109.0]);
+}
+
+#[test]
+fn iter_loop_broadcast_smaller_source_matches_vm() {
+    // out_temp[A,B] = mat[A,B] + vec[A]: the iteration view is 2-D [A(2),B(3)]
+    // (dim_ids [0,1]); `vec` is 1-D [A(2)] (dim_id 0), broadcast along B. This
+    // exercises the `LoadIterViewAt` broadcast path (source dims != iter
+    // dims), which production codegen does not currently emit but the VM
+    // supports. Cross-checked element-for-element against the VM's broadcast.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 6); // out temp [2,3]
+    // Two indexed dims so match_dimensions_two_pass can resolve is_indexed.
+    let na = context.intern_name("A");
+    context.add_dimension(DimensionInfo::indexed(na, 2)); // id 0
+    let nb = context.intern_name("B");
+    context.add_dimension(DimensionInfo::indexed(nb, 3)); // id 1
+
+    let out_view = context.add_static_view({
+        let mut v = temp_view(0, &[2, 3]);
+        v.dim_ids = SmallVec::from_slice(&[0, 1]);
+        v
+    });
+    // mat in curr slots 0..6 (dims [2,3], dim_ids [0,1]).
+    let mat = context.add_static_view(dense_view_ids(0, &[2, 3], &[0, 1]));
+    // vec in curr slots 6..8 (dims [2], dim_id 0).
+    let vec_v = context.add_static_view(dense_view_ids(6, &[2], &[0]));
+    let code = vec![
+        Opcode::PushStaticView { view_id: out_view },
+        Opcode::BeginIter {
+            write_temp_id: 0,
+            has_write_temp: true,
+        },
+        Opcode::PushStaticView { view_id: mat }, // offset 2 after vec is pushed
+        Opcode::PushStaticView { view_id: vec_v }, // offset 1
+        Opcode::LoadIterViewAt { offset: 2 },    // mat[A,B]
+        Opcode::LoadIterViewAt { offset: 1 },    // vec[A] broadcast over B
+        Opcode::Op2 { op: Op2::Add },
+        Opcode::StoreIterElement {},
+        Opcode::NextIterOrJump { jump_back: -5 },
+        Opcode::EndIter {},
+        Opcode::PopView {},
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    // mat[a,b] = a*10 + b -> [0,1,2, 10,11,12]; vec[a] = a -> [0, 1].
+    let mut seed = seed_run(0, &[0.0, 1.0, 2.0, 10.0, 11.0, 12.0]);
+    seed.extend(seed_run(6 * 8, &[0.0, 1.0]));
+    let temps = run_and_read_temps(&context, code, vec![], &seed, 6);
+    // out[a,b] = mat[a,b] + vec[a].
+    let expected = [
+        0.0 + 0.0,
+        1.0 + 0.0,
+        2.0 + 0.0,
+        10.0 + 1.0,
+        11.0 + 1.0,
+        12.0 + 1.0,
+    ];
+    assert_eq!(temps, expected);
+}
+
+#[test]
+fn iter_loop_smaller_source_same_shape_writes_nan() {
+    // The iteration is over 4 elements but the source view (same dim_ids) has
+    // only 3: the VM's `LoadIterViewTop`/`LoadIterViewAt` fast path returns
+    // NaN past the source size (`vm.rs:1972`). Element 3 must be NaN.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 4);
+    let out_view = context.add_static_view(temp_view(0, &[4]));
+    let src = context.add_static_view(dense_view(0, &[3])); // shorter
+    let code = vec![
+        Opcode::PushStaticView { view_id: out_view },
+        Opcode::BeginIter {
+            write_temp_id: 0,
+            has_write_temp: true,
+        },
+        Opcode::PushStaticView { view_id: src },
+        Opcode::LoadIterViewAt { offset: 1 },
+        Opcode::StoreIterElement {},
+        Opcode::NextIterOrJump { jump_back: -3 },
+        Opcode::EndIter {},
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    let seed = seed_run(0, &[5.0, 6.0, 7.0]);
+    let temps = run_and_read_temps(&context, code, vec![], &seed, 4);
+    assert_eq!(&temps[0..3], &[5.0, 6.0, 7.0]);
+    assert!(
+        temps[3].is_nan(),
+        "element past the source size must be NaN"
+    );
+}
+
+#[test]
+fn iter_loop_then_reduce_dotprod_matches_vm() {
+    // The full SUM(a[*]*b[*]) shape: hoist a[i]*b[i] into a temp via BeginIter,
+    // then ArraySum the temp. a in curr 0..4, b in curr 4..8, temp 0.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 4);
+    let out_view = context.add_static_view(temp_view(0, &[4]));
+    let a = context.add_static_view(dense_view(0, &[4]));
+    let b = context.add_static_view(dense_view(4, &[4]));
+    let temp_read = context.add_static_view(temp_view(0, &[4]));
+    let code = vec![
+        Opcode::PushStaticView { view_id: out_view },
+        Opcode::BeginIter {
+            write_temp_id: 0,
+            has_write_temp: true,
+        },
+        Opcode::PushStaticView { view_id: a }, // offset 2 after b
+        Opcode::PushStaticView { view_id: b }, // offset 1
+        Opcode::LoadIterViewAt { offset: 2 },
+        Opcode::LoadIterViewAt { offset: 1 },
+        Opcode::Op2 { op: Op2::Mul },
+        Opcode::StoreIterElement {},
+        Opcode::NextIterOrJump { jump_back: -5 },
+        Opcode::EndIter {},
+        Opcode::PopView {},
+        Opcode::PopView {},
+        Opcode::PopView {},
+        Opcode::PushStaticView { view_id: temp_read },
+        Opcode::ArraySum {},
+        Opcode::PopView {},
+    ];
+    // a = [1,2,3,4], b = [10,20,30,40] -> dot = 10+40+90+160 = 300.
+    let mut seed = seed_run(0, &[1.0, 2.0, 3.0, 4.0]);
+    seed.extend(seed_run(4 * 8, &[10.0, 20.0, 30.0, 40.0]));
+    let ctx = ctx_with_arrays(&context);
+    let got = run(&bc(vec![], code), &ctx, true, 0, &seed, None);
+    assert_eq!(got, 300.0);
+}
+
+#[test]
+fn iter_loop_zero_size_writes_nothing() {
+    // An empty iteration view (size 0): the unroller emits zero body copies,
+    // so the temp keeps its seeded value (no write). A trailing reducer over
+    // the empty output is 0 for SUM.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 1);
+    let out_view = context.add_static_view({
+        let mut v = temp_view(0, &[0]); // zero-size dim
+        v.dims = SmallVec::from_slice(&[0]);
+        v
+    });
+    let code = vec![
+        Opcode::PushStaticView { view_id: out_view },
+        Opcode::BeginIter {
+            write_temp_id: 0,
+            has_write_temp: true,
+        },
+        Opcode::LoadIterElement {},
+        Opcode::StoreIterElement {},
+        Opcode::NextIterOrJump { jump_back: -2 },
+        Opcode::EndIter {},
+        Opcode::PopView {},
+    ];
+    // Seed temp slot 0 with a sentinel; the empty loop must not touch it.
+    let seed = seed_run(u64::from(TEMP_BASE), &[42.0]);
+    let temps = run_and_read_temps(&context, code, vec![], &seed, 1);
+    assert_eq!(temps, vec![42.0], "an empty iteration writes nothing");
+}
+
+// ── Broadcast iteration family (BeginBroadcastIter..EndBroadcastIter) ──
+//
+// Not emitted by current codegen, but lowered for completeness and pinned
+// against the VM's `BeginBroadcastIter`/`LoadBroadcastElement` arms
+// (`vm.rs:2314-2421`) here. The result geometry is the union of the source
+// dim_ids; a 2-D and a 1-D source broadcast into the 2-D result.
+
+#[test]
+fn broadcast_iter_unions_dims_like_vm() {
+    // dest[A,B] = mat[A,B] * vec[A]: BeginBroadcastIter with two sources
+    // (mat 2-D dim_ids [0,1], vec 1-D dim_id 0). The result unions to
+    // dim_ids [0,1] (dims [2,3]); vec broadcasts along B.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 6);
+    let na = context.intern_name("A");
+    context.add_dimension(DimensionInfo::indexed(na, 2)); // id 0
+    let nb = context.intern_name("B");
+    context.add_dimension(DimensionInfo::indexed(nb, 3)); // id 1
+    let mat = context.add_static_view(dense_view_ids(0, &[2, 3], &[0, 1]));
+    let vec_v = context.add_static_view(dense_view_ids(6, &[2], &[0]));
+    let code = vec![
+        // Push the two sources (deepest-first): mat then vec.
+        Opcode::PushStaticView { view_id: mat },
+        Opcode::PushStaticView { view_id: vec_v },
+        Opcode::BeginBroadcastIter {
+            n_sources: 2,
+            dest_temp_id: 0,
+        },
+        Opcode::LoadBroadcastElement { source_idx: 0 }, // mat
+        Opcode::LoadBroadcastElement { source_idx: 1 }, // vec
+        Opcode::Op2 { op: Op2::Mul },
+        Opcode::StoreBroadcastElement {},
+        Opcode::NextBroadcastOrJump { jump_back: -4 },
+        Opcode::EndBroadcastIter {},
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    // mat[a,b] = a*10 + b -> [0,1,2, 10,11,12]; vec[a] = a+1 -> [1, 2].
+    let mut seed = seed_run(0, &[0.0, 1.0, 2.0, 10.0, 11.0, 12.0]);
+    seed.extend(seed_run(6 * 8, &[1.0, 2.0]));
+    let temps = run_and_read_temps(&context, code, vec![], &seed, 6);
+    // dest[a,b] = mat[a,b] * vec[a].
+    let expected = [
+        0.0 * 1.0,
+        1.0 * 1.0,
+        2.0 * 1.0,
+        10.0 * 2.0,
+        11.0 * 2.0,
+        12.0 * 2.0,
+    ];
+    assert_eq!(temps, expected);
+}
+
+// ════════════════════════════════════════════════════════════════════════
+// Phase 5 Task 4: dynamic subscripts + OOB->NaN
+//
+// The legacy scalar subscript (`PushSubscriptIndex` / `LoadSubscript`) and
+// the view-stack dynamic subscript (`ViewSubscriptDynamic`) both carry a
+// runtime offset + validity flag in fresh i32 wasm locals (reserved by
+// `count_extra_i32_locals`). An out-of-bounds index clears the validity
+// flag, so the read yields NaN -- matching the VM (`vm.rs:1341-1366` for the
+// legacy path; `reduce_view`'s `if !is_valid { NaN }` for the view path).
+// ════════════════════════════════════════════════════════════════════════
+
+/// Run `code` (with `count_extra_i32_locals` reserved) returning the f64
+/// result, with `curr` seeded from `data` (slot 0 = byte 0). The literal pool
+/// holds the runtime index value(s).
+fn run_dyn(code: Vec<Opcode>, literals: Vec<f64>, data: &[f64]) -> f64 {
+    let context = ByteCodeContext::default();
+    let ctx = ctx_with_arrays(&context);
+    run(&bc(literals, code), &ctx, true, 0, &seed_run(0, data), None)
+}
+
+#[test]
+fn legacy_subscript_1d_in_range_matches_vm() {
+    // arr[idx] (idx 1-based) over a 4-element array in curr slots 0..4.
+    // idx = 3 (1-based) -> 0-based 2 -> data[2].
+    let data = [10.0, 20.0, 30.0, 40.0];
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // idx = 3.0
+        Opcode::PushSubscriptIndex { bounds: 4 },
+        Opcode::LoadSubscript { off: 0 },
+    ];
+    assert_eq!(run_dyn(code, vec![3.0], &data), 30.0);
+}
+
+#[test]
+fn legacy_subscript_oob_is_nan() {
+    let data = [10.0, 20.0, 30.0, 40.0];
+    // idx = 5 > bounds 4 -> invalid -> NaN.
+    let high = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::PushSubscriptIndex { bounds: 4 },
+        Opcode::LoadSubscript { off: 0 },
+    ];
+    assert!(
+        run_dyn(high, vec![5.0], &data).is_nan(),
+        "idx > bounds -> NaN"
+    );
+    // idx = 0 is invalid in 1-based indexing -> NaN.
+    let zero = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::PushSubscriptIndex { bounds: 4 },
+        Opcode::LoadSubscript { off: 0 },
+    ];
+    assert!(run_dyn(zero, vec![0.0], &data).is_nan(), "idx 0 -> NaN");
+}
+
+#[test]
+fn legacy_subscript_off_shifts_base_like_vm() {
+    // LoadSubscript reads curr[module_off + off + flat]; with off=2 the base
+    // shifts by 2 slots. arr starts at slot 2; idx=2 (1-based) -> slot 3.
+    let data = [99.0, 99.0, 100.0, 200.0, 300.0];
+    let code = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::PushSubscriptIndex { bounds: 3 },
+        Opcode::LoadSubscript { off: 2 },
+    ];
+    assert_eq!(run_dyn(code, vec![2.0], &data), 200.0);
+}
+
+#[test]
+fn legacy_subscript_2d_fold_matches_vm() {
+    // arr[i, j] over a [2,3] row-major array in curr slots 0..6. The VM folds
+    // index = i0*bounds1 + i1 (the running index times the current bound plus
+    // the current index). i=2 (1-based -> 0-based 1), j=3 (1-based -> 0-based
+    // 2): flat = 1*3 + 2 = 5 -> data[5].
+    let data = [0.0, 1.0, 2.0, 10.0, 11.0, 12.0];
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // i = 2.0
+        Opcode::PushSubscriptIndex { bounds: 2 },
+        Opcode::LoadConstant { id: 1 }, // j = 3.0
+        Opcode::PushSubscriptIndex { bounds: 3 },
+        Opcode::LoadSubscript { off: 0 },
+    ];
+    assert_eq!(run_dyn(code, vec![2.0, 3.0], &data), 12.0);
+}
+
+#[test]
+fn legacy_subscript_2d_oob_in_either_index_is_nan() {
+    let data = [0.0, 1.0, 2.0, 10.0, 11.0, 12.0];
+    // Second index out of bounds (j=4 > 3) -> NaN even though i is valid.
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // i = 1
+        Opcode::PushSubscriptIndex { bounds: 2 },
+        Opcode::LoadConstant { id: 1 }, // j = 4 (oob)
+        Opcode::PushSubscriptIndex { bounds: 3 },
+        Opcode::LoadSubscript { off: 0 },
+    ];
+    assert!(run_dyn(code, vec![1.0, 4.0], &data).is_nan());
+}
+
+#[test]
+fn legacy_subscript_floors_fractional_index() {
+    // The VM does `stack.pop().floor() as u16`; idx 2.9 -> 1-based 2 -> slot 1.
+    let data = [10.0, 20.0, 30.0];
+    let code = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::PushSubscriptIndex { bounds: 3 },
+        Opcode::LoadSubscript { off: 0 },
+    ];
+    assert_eq!(run_dyn(code, vec![2.9], &data), 20.0);
+}
+
+/// Build a 1-D `PushVarViewDirect` over `dim` slots, apply a dynamic subscript
+/// at dim 0 from a constant index, and `ArraySum` the resulting (scalar) view
+/// -- the `ViewSubscriptDynamic` end-to-end shape, runnable in isolation.
+fn run_view_dyn_subscript(dim: u16, index: f64, data: &[f64]) -> f64 {
+    let mut context = ByteCodeContext::default();
+    // PushVarViewDirect resolves dims from a dim-list of raw sizes.
+    context.add_dim_list(1, [dim, 0, 0, 0]);
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::PushVarViewDirect {
+            base_off: 0,
+            dim_list_id: 0,
+        },
+        Opcode::LoadConstant { id: 0 }, // dynamic index
+        Opcode::ViewSubscriptDynamic { dim_idx: 0 },
+        Opcode::ArraySum {}, // sum of the 1-element view (or NaN if invalid)
+        Opcode::PopView {},
+    ];
+    run(
+        &bc(vec![index], code),
+        &ctx,
+        true,
+        0,
+        &seed_run(0, data),
+        None,
+    )
+}
+
+#[test]
+fn view_subscript_dynamic_in_range_reads_element() {
+    // arr[idx] reduced: idx = 3 (1-based) -> data[2]; SUM of the 1-element
+    // view is that element.
+    let data = [10.0, 20.0, 30.0, 40.0];
+    assert_eq!(run_view_dyn_subscript(4, 3.0, &data), 30.0);
+}
+
+#[test]
+fn view_subscript_dynamic_oob_is_nan() {
+    let data = [10.0, 20.0, 30.0, 40.0];
+    // idx = 5 > dim 4 -> view invalid -> reducer (even SUM) yields NaN.
+    assert!(
+        run_view_dyn_subscript(4, 5.0, &data).is_nan(),
+        "idx > dim -> invalid view -> NaN"
+    );
+    // idx = 0 invalid (1-based) -> NaN.
+    assert!(
+        run_view_dyn_subscript(4, 0.0, &data).is_nan(),
+        "idx 0 -> invalid view -> NaN"
+    );
+}
+
+#[test]
+fn view_subscript_dynamic_offset_picks_right_element() {
+    // Sweep the in-range indices: each picks the matching element.
+    let data = [5.0, 6.0, 7.0, 8.0, 9.0];
+    for (idx_1based, expected) in [(1, 5.0), (2, 6.0), (3, 7.0), (4, 8.0), (5, 9.0)] {
+        assert_eq!(
+            run_view_dyn_subscript(5, idx_1based as f64, &data),
+            expected,
+            "arr[{idx_1based}] (1-based)"
+        );
+    }
+}
+
+// ── End-to-end: a runtime-OOB dynamic subscript feeding a real reducer ────
+//
+// The white-box `run_invalid_view_reduce` above hand-forces `valid_local`;
+// this composes the genuine codegen shape -- `mat[oob_row, *]` where `row` is
+// a runtime out-of-range index -- so the invalid-view NaN flows from a real
+// `ViewSubscriptDynamic` through `emit_array_reduce`'s validity gate, over a
+// multi-element (non-degenerate) row, exactly as a model would produce it.
+
+/// Build a 2-D `mat[rows][cols]` view via `PushVarViewDirect`, dynamically
+/// subscript dim 0 with a runtime `row_1based` index (leaving a `cols`-element
+/// row view), and reduce that row. The row is invalid iff `row_1based` is out
+/// of `1..=rows`. `data` seeds the row-major curr slab (rows*cols slots).
+fn run_view_dyn_row_reduce(
+    rows: u16,
+    cols: u16,
+    row_1based: f64,
+    reduce: Opcode,
+    data: &[f64],
+) -> f64 {
+    let mut context = ByteCodeContext::default();
+    context.add_dim_list(2, [rows, cols, 0, 0]);
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::PushVarViewDirect {
+            base_off: 0,
+            dim_list_id: 0,
+        },
+        Opcode::LoadConstant { id: 0 }, // runtime row index (1-based)
+        Opcode::ViewSubscriptDynamic { dim_idx: 0 },
+        reduce,
+        Opcode::PopView {},
+    ];
+    run(
+        &bc(vec![row_1based], code),
+        &ctx,
+        true,
+        0,
+        &seed_run(0, data),
+        None,
+    )
+}
+
+#[test]
+fn view_dyn_oob_row_makes_every_reducer_nan() {
+    // A 3x4 matrix; row index 5 is out of range (rows = 3). The subscripted
+    // view spans a real 4-element row, but its validity flag is 0, so EVERY
+    // reducer -- including ArraySum, whose empty-but-valid result is 0.0 --
+    // must yield NaN, matching `reduce_view`'s leading `if !is_valid`.
+    let data: Vec<f64> = (0..12).map(|i| i as f64).collect();
+    for op in [
+        Opcode::ArraySum {},
+        Opcode::ArrayMax {},
+        Opcode::ArrayMin {},
+        Opcode::ArrayMean {},
+        Opcode::ArrayStddev {},
+    ] {
+        let got = run_view_dyn_row_reduce(3, 4, 5.0, op, &data);
+        assert!(
+            got.is_nan(),
+            "{}: an out-of-range dynamic row subscript must reduce to NaN, got {got}",
+            op.name()
+        );
+    }
+    // ArraySize is defined regardless of validity: a 4-wide row reports 4.
+    assert_eq!(
+        run_view_dyn_row_reduce(3, 4, 5.0, Opcode::ArraySize {}, &data),
+        4.0
+    );
+}
+
+#[test]
+fn view_dyn_in_range_row_reduces_like_vm() {
+    // The same shape with an in-range row index reduces the real row, so the
+    // NaN above is genuinely the validity gate, not a broken reducer. Row 2
+    // (1-based) of a 3x4 row-major matrix is slots 4..8 -> [4,5,6,7].
+    let data: Vec<f64> = (0..12).map(|i| i as f64).collect();
+    let row = [4.0f64, 5.0, 6.0, 7.0];
+    let sum: f64 = row.iter().sum();
+    let mean = sum / row.len() as f64;
+    let var = row.iter().map(|v| (v - mean) * (v - mean)).sum::<f64>() / row.len() as f64;
+    assert_eq!(
+        run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArraySum {}, &data),
+        sum
+    );
+    assert_eq!(
+        run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArrayMax {}, &data),
+        7.0
+    );
+    assert_eq!(
+        run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArrayMin {}, &data),
+        4.0
+    );
+    assert_eq!(
+        run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArrayMean {}, &data),
+        mean
+    );
+    assert!(
+        (run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArrayStddev {}, &data) - var.sqrt()).abs()
+            < 1e-12
+    );
+}
+
+// ════════════════════════════════════════════════════════════════════════
+// IMPORTANT (review feedback): full-unrolling has a documented size cap.
+//
+// Reducers, `BeginIter`, and `BeginBroadcastIter` all unroll fully at compile
+// time. `EmitState::charge_unroll` bounds the cumulative element count per
+// function at `MAX_UNROLL_UNITS`, returning `Unsupported` (so the model falls
+// back to the VM) before any oversized body is emitted. These check the cap
+// directly via `emit_bytecode`, asserting an over-budget program is rejected
+// WITHOUT materializing a giant function, and an under-budget one still emits.
+// ════════════════════════════════════════════════════════════════════════
+
+/// Lower `bc` into a throwaway function, returning the lowering result. Used
+/// to assert that an over-budget program is rejected at emit time without
+/// running (or even finishing building) the module.
+fn lower_only(bc: &ByteCode, ctx: &EmitCtx) -> Result<Function, WasmGenError> {
+    let mut func = Function::new(opcode_fn_locals(
+        0,
+        count_extra_i32_locals(bc),
+        count_module_input_scratch(bc),
+    ));
+    emit_bytecode(bc, ctx, &mut func)?;
+    func.instruction(&Instruction::End);
+    Ok(func)
+}
+
+#[test]
+fn reducer_over_view_exceeding_cap_is_unsupported() {
+    // A single static view whose element count exceeds MAX_UNROLL_UNITS. Two
+    // u16 dims (300 x 300 = 90_000 > 65_536) overflow the budget; the cap is
+    // checked before the fold, so lowering returns Unsupported with no
+    // emitted body. The fixture itself is tiny -- proving we reject rather
+    // than emit a multi-megabyte function.
+    let mut context = ByteCodeContext::default();
+    let view_id = context.add_static_view(dense_view(0, &[300, 300]));
+    assert!(dense_view(0, &[300, 300]).to_runtime_view().size() > MAX_UNROLL_UNITS);
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::PushStaticView { view_id },
+        Opcode::ArraySum {},
+        Opcode::PopView {},
+    ];
+    match lower_only(&bc(vec![], code), &ctx) {
+        Err(WasmGenError::Unsupported(msg)) => assert!(
+            msg.contains("unrolling exceeds"),
+            "expected the unroll-budget message, got: {msg}"
+        ),
+        Ok(_) => panic!("a reducer over a view larger than the cap must be Unsupported"),
+    }
+}
+
+#[test]
+fn iteration_over_view_exceeding_cap_is_unsupported() {
+    // A `BeginIter` whose iteration count exceeds the cap is rejected before
+    // the body is re-emitted even once past the budget. Geometry: a 300x300
+    // temp written elementwise from a same-shaped source.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 300 * 300);
+    let out = context.add_static_view(temp_view(0, &[300, 300]));
+    let src = context.add_static_view(dense_view(0, &[300, 300]));
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::PushStaticView { view_id: out },
+        Opcode::BeginIter {
+            write_temp_id: 0,
+            has_write_temp: true,
+        },
+        Opcode::PushStaticView { view_id: src },
+        Opcode::LoadIterViewAt { offset: 1 },
+        Opcode::StoreIterElement {},
+        Opcode::NextIterOrJump { jump_back: -3 },
+        Opcode::EndIter {},
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    match lower_only(&bc(vec![], code), &ctx) {
+        Err(WasmGenError::Unsupported(msg)) => assert!(
+            msg.contains("unrolling exceeds"),
+            "expected the unroll-budget message, got: {msg}"
+        ),
+        Ok(_) => panic!("an iteration larger than the cap must be Unsupported"),
+    }
+}
+
+#[test]
+fn array_size_over_huge_view_is_free() {
+    // ArraySize emits no element reads (`size() as f64`), so it must NOT be
+    // charged against the unroll budget: a view far larger than the cap still
+    // reports its size and lowers fine.
+    let mut context = ByteCodeContext::default();
+    let view_id = context.add_static_view(dense_view(0, &[300, 300]));
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::PushStaticView { view_id },
+        Opcode::ArraySize {},
+        Opcode::PopView {},
+    ];
+    assert!(
+        lower_only(&bc(vec![], code), &ctx).is_ok(),
+        "ArraySize does no element reads and must not be capped"
+    );
+}
+
+#[test]
+fn reducer_just_under_cap_compiles_and_matches_vm() {
+    // A view sized just under the cap still lowers and runs to VM parity. We
+    // keep the fixture small/fast (a 64-element view) but assert the budget
+    // accounting admits it: 64 << MAX_UNROLL_UNITS. (The full corpus of small
+    // arrayed reducer tests above is the broad just-under-cap parity check;
+    // this pins the boundary intent.)
+    let data: Vec<f64> = (0..64).map(|i| (i as f64) * 0.5).collect();
+    let view = dense_view(0, &[64]);
+    assert!(view.to_runtime_view().size() <= MAX_UNROLL_UNITS);
+    let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data);
+    assert_eq!(got, vm_sum(&view, &data));
+}
+
+#[test]
+fn unroll_cap_has_headroom_over_realistic_arrays() {
+    // The cap must be generous enough for real SD models. The test corpus's
+    // largest single dimension is 9; even a region x sector x cohort nest is
+    // ~10^3 elements. A compile-time assert pins that the cap clears a
+    // deliberately roomy 10^4 with margin, documenting that legitimate models
+    // never trip it.
+    const _: () = assert!(
+        MAX_UNROLL_UNITS >= 10_000,
+        "the unroll cap must leave ample headroom for realistic arrayed models"
+    );
+}
+
+// ════════════════════════════════════════════════════════════════════════
+// Phase 6 Task 1: VECTOR SELECT + VECTOR ELM MAP
+//
+// `VectorSelect` reduces two views (a selector mask + an expression array) to
+// ONE scalar pushed on the stack. `VectorElmMap` maps a source array through a
+// per-element offset array into a `write_temp_id` temp region. Both are run
+// under DLR-FT and cross-checked against the VM: VectorSelect against a faithful
+// oracle of the `vm.rs:2444-2502` arm, VectorElmMap against the sibling
+// `crate::vm_vector_elm_map::vector_elm_map` function directly.
+// ════════════════════════════════════════════════════════════════════════
+
+/// The VM `VectorSelect` oracle (mirroring `vm.rs:2444-2502`): zip the two views
+/// to the shorter size, collect `expr` where `is_truthy(sel)`, then dispatch the
+/// action (1=min, 2=mean, 3=max, 4=product, else sum) with the empty-selection
+/// fallback to `max_value`.
+fn vm_vector_select_oracle(
+    sel_view: &StaticArrayView,
+    expr_view: &StaticArrayView,
+    sel_data: &[f64],
+    expr_data: &[f64],
+    max_value: f64,
+    action: i32,
+) -> f64 {
+    let sel_rv = sel_view.to_runtime_view();
+    let expr_rv = expr_view.to_runtime_view();
+    let size = sel_rv.size().min(expr_rv.size());
+    let mut selected: Vec<f64> = Vec::new();
+    let mut sel_idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; sel_rv.dims.len()];
+    let mut expr_idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; expr_rv.dims.len()];
+    for _ in 0..size {
+        let sel_off = sel_rv.flat_offset(&sel_idx);
+        let sel_val = sel_data[sel_rv.base_off as usize + sel_off];
+        if crate::vm::is_truthy(sel_val) {
+            let expr_off = expr_rv.flat_offset(&expr_idx);
+            selected.push(expr_data[expr_rv.base_off as usize + expr_off]);
+        }
+        crate::vm::increment_indices(&mut sel_idx, &sel_rv.dims);
+        crate::vm::increment_indices(&mut expr_idx, &expr_rv.dims);
+    }
+    if selected.is_empty() {
+        max_value
+    } else {
+        match action {
+            1 => selected.iter().cloned().fold(f64::INFINITY, f64::min),
+            2 => selected.iter().sum::<f64>() / selected.len() as f64,
+            3 => selected.iter().cloned().fold(f64::NEG_INFINITY, f64::max),
+            4 => selected.iter().product(),
+            _ => selected.iter().sum(),
+        }
+    }
+}
+
+/// Run `PushStaticView(sel); PushStaticView(expr); VectorSelect` over a `curr`
+/// slab. The two views are pushed sel-then-expr so `expr_view = top`,
+/// `sel_view = top-1` (matching the VM). `max_value`/`action` are pushed as the
+/// two operands beneath `VectorSelect` (the VM pops `action` then `max_value`).
+#[allow(clippy::too_many_arguments)]
+fn run_vector_select(
+    sel_view: StaticArrayView,
+    expr_view: StaticArrayView,
+    sel_base: u32,
+    expr_base: u32,
+    data: &[f64],
+    max_value: f64,
+    action: f64,
+) -> f64 {
+    let mut context = ByteCodeContext::default();
+    let sel_id = context.add_static_view(sel_view);
+    let expr_id = context.add_static_view(expr_view);
+    let ctx = ctx_with_arrays(&context);
+    let _ = (sel_base, expr_base);
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // max_value (pushed first)
+        Opcode::LoadConstant { id: 1 }, // action (pushed second, on top)
+        Opcode::PushStaticView { view_id: sel_id },
+        Opcode::PushStaticView { view_id: expr_id },
+        Opcode::VectorSelect {},
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    run(
+        &bc(vec![max_value, action], code),
+        &ctx,
+        true,
+        0,
+        &seed_run(0, data),
+        None,
+    )
+}
+
+/// Assert the emitted `VectorSelect` matches the VM oracle for `action`, on the
+/// shared `sel`/`expr` fixture seeded from `data` (sel slots 0..4, expr 4..8).
+fn assert_vector_select_matches(action: f64, max_value: f64) {
+    let sel = dense_view(0, &[4]);
+    let expr = dense_view(4, &[4]);
+    let data = [1.0, 0.0, 1.0, 1.0, 10.0, 20.0, 30.0, 40.0];
+    let got = run_vector_select(sel.clone(), expr.clone(), 0, 4, &data, max_value, action);
+    let want = vm_vector_select_oracle(&sel, &expr, &data, &data, max_value, action.round() as i32);
+    if want.is_nan() {
+        assert!(got.is_nan(), "action {action}: expected NaN, got {got}");
+    } else {
+        assert_eq!(got, want, "action {action}: got {got}, want {want}");
+    }
+}
+
+#[test]
+fn vector_select_sum_matches_vm() {
+    // sel = [1, 0, 1, 1] (mask), expr = [10, 20, 30, 40], action 5 (sum).
+    // Selected = [10, 30, 40] -> 80.
+    assert_vector_select_matches(5.0, -1.0);
+    let sel = dense_view(0, &[4]);
+    let expr = dense_view(4, &[4]);
+    let data = [1.0, 0.0, 1.0, 1.0, 10.0, 20.0, 30.0, 40.0];
+    let got = run_vector_select(sel, expr, 0, 4, &data, -1.0, 5.0);
+    assert_eq!(got, 80.0);
+}
+
+#[test]
+fn vector_select_each_action_matches_vm() {
+    // 1=min, 2=mean, 3=max, 4=product, and a few "else -> sum" actions. The
+    // selected set is [10, 30, 40]: min 10, mean 80/3, max 40, product 12000,
+    // sum 80.
+    for action in [1.0, 2.0, 3.0, 4.0, 0.0, 5.0, 7.0] {
+        assert_vector_select_matches(action, -1.0);
+    }
+}
+
+#[test]
+fn vector_select_empty_selection_returns_max_value() {
+    // An all-false mask selects nothing, so the result is `max_value` for every
+    // action (the VM's `if selected.is_empty() { max_value }`).
+    let sel = dense_view(0, &[4]);
+    let expr = dense_view(4, &[4]);
+    // Mask all zero.
+    let data = [0.0, 0.0, 0.0, 0.0, 10.0, 20.0, 30.0, 40.0];
+    for action in [1.0, 2.0, 3.0, 4.0, 5.0] {
+        let got = run_vector_select(sel.clone(), expr.clone(), 0, 4, &data, 123.5, action);
+        let want = vm_vector_select_oracle(&sel, &expr, &data, &data, 123.5, action.round() as i32);
+        assert_eq!(
+            got, want,
+            "action {action}: empty selection must be max_value"
+        );
+        assert_eq!(got, 123.5);
+    }
+}
+
+#[test]
+fn vector_select_nan_in_mask_is_truthy_like_vm() {
+    // is_truthy(NaN) is true (approx_eq(NaN, 0) is false), so a NaN mask entry
+    // SELECTS its expr value, exactly as the VM does. Mask = [NaN, 0, 1]:
+    // selects expr[0] and expr[2].
+    let sel = dense_view(0, &[3]);
+    let expr = dense_view(3, &[3]);
+    let data = [f64::NAN, 0.0, 1.0, 100.0, 200.0, 300.0];
+    for action in [1.0, 3.0, 5.0] {
+        let got = run_vector_select(sel.clone(), expr.clone(), 3, 3, &data, -1.0, action);
+        let want = vm_vector_select_oracle(&sel, &expr, &data, &data, -1.0, action.round() as i32);
+        assert_eq!(
+            got, want,
+            "action {action}: NaN mask entry must select its expr"
+        );
+    }
+}
+
+#[test]
+fn vector_select_zip_stops_at_shorter_view() {
+    // sel has 4 elements, expr has 2: the VM zips to min(4, 2) = 2, so only the
+    // first two (sel, expr) pairs are considered. Mask [1, 1, ...] selects
+    // expr[0], expr[1]; the trailing sel entries never read a (nonexistent)
+    // expr element.
+    let sel = dense_view(0, &[4]);
+    let expr = dense_view(4, &[2]);
+    let data = [1.0, 1.0, 1.0, 1.0, 7.0, 11.0];
+    let got = run_vector_select(sel.clone(), expr.clone(), 0, 4, &data, -1.0, 5.0);
+    let want = vm_vector_select_oracle(&sel, &expr, &data, &data, -1.0, 5);
+    assert_eq!(got, want);
+    assert_eq!(got, 18.0, "sum of the first two expr values");
+}
+
+#[test]
+fn vector_select_nan_expr_value_ignored_by_minmax_like_vm() {
+    // A selected expr value of NaN is ignored by min/max (the VM folds with
+    // `f64::min`/`f64::max`, which return the non-NaN operand), so wasm `f64.min`/
+    // `f64.max` (NaN-propagating) would diverge -- this pins the faithful
+    // NaN-ignoring fold. Selected = [10, NaN, 40]: min 10, max 40 (NOT NaN);
+    // sum/mean/product DO see the NaN (VM uses `+`/`*`, which propagate).
+    let sel = dense_view(0, &[3]);
+    let expr = dense_view(3, &[3]);
+    let data = [1.0, 1.0, 1.0, 10.0, f64::NAN, 40.0];
+    // min and max must be exactly 10 and 40 (NaN ignored).
+    assert_eq!(
+        run_vector_select(sel.clone(), expr.clone(), 3, 3, &data, -1.0, 1.0),
+        10.0
+    );
+    assert_eq!(
+        run_vector_select(sel.clone(), expr.clone(), 3, 3, &data, -1.0, 3.0),
+        40.0
+    );
+    // sum/product propagate the NaN, matching the VM (cross-checked vs oracle).
+    for action in [2.0, 4.0, 5.0] {
+        assert_vector_select_nan_expr(&sel, &expr, &data, action);
+    }
+}
+
+fn assert_vector_select_nan_expr(
+    sel: &StaticArrayView,
+    expr: &StaticArrayView,
+    data: &[f64],
+    action: f64,
+) {
+    let got = run_vector_select(sel.clone(), expr.clone(), 3, 3, data, -1.0, action);
+    let want = vm_vector_select_oracle(sel, expr, data, data, -1.0, action.round() as i32);
+    if want.is_nan() {
+        assert!(got.is_nan(), "action {action}: expected NaN, got {got}");
+    } else {
+        assert_eq!(got, want, "action {action}");
+    }
+}
+
+// ── VectorElmMap parity vs the sibling VM function ────────────────────────
+
+/// Run `PushStaticView(source); PushStaticView(offset); VectorElmMap` over a
+/// `curr` slab seeded from `data`, writing temp 0, and read back `count` temp
+/// slots. The source view is pushed first (`top-1`), the offset view second
+/// (`top`), matching the VM (`offset_view = top, source_view = top-1`).
+fn run_vector_elm_map(
+    source: StaticArrayView,
+    offset: StaticArrayView,
+    full_source_len: u32,
+    data: &[f64],
+    temp_count: usize,
+    temp_slots: usize,
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], temp_slots);
+    let src_id = context.add_static_view(source);
+    let off_id = context.add_static_view(offset);
+    let code = vec![
+        Opcode::PushStaticView { view_id: src_id },
+        Opcode::PushStaticView { view_id: off_id },
+        Opcode::VectorElmMap {
+            write_temp_id: 0,
+            full_source_len,
+        },
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    run_and_read_temps(&context, code, vec![], &seed_run(0, data), temp_count)
+}
+
+/// The VM oracle for `VectorElmMap`: run the sibling
+/// `crate::vm_vector_elm_map::vector_elm_map` over `RuntimeView`s built from the
+/// same static views, reading `curr` from `data`. Returns the written temp 0
+/// slots (`temp_slots` wide).
+fn vm_elm_map_oracle(
+    source: &StaticArrayView,
+    offset: &StaticArrayView,
+    full_source_len: u32,
+    data: &[f64],
+    temp_slots: usize,
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], temp_slots);
+    let mut temp_storage = vec![0.0f64; temp_slots];
+    crate::vm_vector_elm_map::vector_elm_map(
+        &source.to_runtime_view(),
+        &offset.to_runtime_view(),
+        0,
+        full_source_len,
+        data,
+        &mut temp_storage,
+        &context,
+    );
+    temp_storage
+}
+
+/// Assert the emitted `VectorElmMap` matches the sibling VM function element-for-
+/// element over the `offset_view` size (NaN compares as NaN).
+fn assert_elm_map_matches(
+    source: &StaticArrayView,
+    offset: &StaticArrayView,
+    full_source_len: u32,
+    data: &[f64],
+    temp_slots: usize,
+) {
+    let got = run_vector_elm_map(
+        source.clone(),
+        offset.clone(),
+        full_source_len,
+        data,
+        temp_slots,
+        temp_slots,
+    );
+    let want = vm_elm_map_oracle(source, offset, full_source_len, data, temp_slots);
+    assert_eq!(got.len(), want.len());
+    for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() {
+        if w.is_nan() {
+            assert!(g.is_nan(), "elm_map slot {i}: expected NaN, got {g}");
+        } else {
+            assert_eq!(g, w, "elm_map slot {i}: got {g}, want {w}");
+        }
+    }
+}
+
+#[test]
+fn vector_elm_map_full_array_in_range_matches_vm() {
+    // Full contiguous source [a,b,c,d] in curr slots 0..4; offset [1,3,0,2] in
+    // curr slots 4..8 -> result = source[round(offset[i])] = [b, d, a, c].
+    let source = dense_view(0, &[4]);
+    let offset = dense_view(4, &[4]);
+    let data = [10.0, 20.0, 30.0, 40.0, 1.0, 3.0, 0.0, 2.0];
+    assert_elm_map_matches(&source, &offset, 4, &data, 4);
+    let got = run_vector_elm_map(source, offset, 4, &data, 4, 4);
+    assert_eq!(got, vec![20.0, 40.0, 10.0, 30.0]);
+}
+
+#[test]
+fn vector_elm_map_out_of_range_offset_is_nan() {
+    // An offset that lands outside [0, full_source_len) yields NaN (no modulo).
+    // Source len 3; offsets [0, 5, -1] -> [source[0], NaN, NaN].
+    let source = dense_view(0, &[3]);
+    let offset = dense_view(3, &[3]);
+    let data = [7.0, 8.0, 9.0, 0.0, 5.0, -1.0];
+    assert_elm_map_matches(&source, &offset, 3, &data, 3);
+    let got = run_vector_elm_map(source, offset, 3, &data, 3, 3);
+    assert_eq!(got[0], 7.0);
+    assert!(got[1].is_nan() && got[2].is_nan());
+}
+
+#[test]
+fn vector_elm_map_nan_offset_is_nan() {
+    // A NaN offset yields NaN, regardless of the (would-be) index.
+    let source = dense_view(0, &[3]);
+    let offset = dense_view(3, &[3]);
+    let data = [7.0, 8.0, 9.0, 1.0, f64::NAN, 2.0];
+    assert_elm_map_matches(&source, &offset, 3, &data, 3);
+    let got = run_vector_elm_map(source, offset, 3, &data, 3, 3);
+    assert_eq!(got[0], 8.0);
+    assert!(got[1].is_nan());
+    assert_eq!(got[2], 9.0);
+}
+
+#[test]
+fn vector_elm_map_offset_rounds_half_away_like_vm() {
+    // The VM rounds the offset with `f64::round` (half away from zero), NOT wasm
+    // `f64.nearest` (half to even). Offsets [0.5, 1.5, 2.5] round to [1, 2, 3]
+    // (away from zero), not [0, 2, 2] (to even). Cross-checked vs the sibling.
+    let source = dense_view(0, &[4]);
+    let offset = dense_view(4, &[3]);
+    let data = [10.0, 20.0, 30.0, 40.0, 0.5, 1.5, 2.5];
+    assert_elm_map_matches(&source, &offset, 4, &data, 3);
+    let got = run_vector_elm_map(source, offset, 4, &data, 3, 3);
+    // round(0.5)=1 -> source[1]=20; round(1.5)=2 -> 30; round(2.5)=3 -> 40.
+    assert_eq!(got, vec![20.0, 30.0, 40.0]);
+}
+
+// ── emit_round_half_away parity vs f64::round (the VM's rounding oracle) ───
+
+/// Build a module exporting `mem` and `eval(module_off: i32)` whose body loads
+/// the f64 at memory slot 0 (byte 0), runs [`super::vector::emit_round_half_away`]
+/// directly, and stores the rounded result back to slot 0. Mirrors
+/// [`build_module`]'s helper-prefix assembly so the function declarations match
+/// production; the body is a focused probe of just the round helper.
+fn build_round_probe_module() -> Vec<u8> {
+    let mut module = Module::new();
+
+    let helpers = build_helpers();
+    let n_helpers = helpers.functions.len() as u32;
+
+    let mut types = TypeSection::new();
+    types.ty().function([ValType::I32], []); // eval(module_off) -> ()
+    for hf in &helpers.functions {
+        types.ty().function(hf.params.clone(), hf.results.clone());
+    }
+    module.section(&types);
+
+    let mut functions = FunctionSection::new();
+    for (i, _) in helpers.functions.iter().enumerate() {
+        functions.function(1 + i as u32);
+    }
+    functions.function(0);
+    module.section(&functions);
+
+    let mut memories = MemorySection::new();
+    memories.memory(MemoryType {
+        minimum: 1,
+        maximum: None,
+        memory64: false,
+        shared: false,
+        page_size_log2: None,
+    });
+    module.section(&memories);
+
+    let mut exports = ExportSection::new();
+    exports.export("eval", ExportKind::Func, n_helpers);
+    exports.export("mem", ExportKind::Memory, 0);
+    module.section(&exports);
+
+    let mut code = CodeSection::new();
+    for hf in helpers.functions {
+        code.function(&hf.body);
+    }
+    // Same local layout production uses; the round helper draws its two f64
+    // temps from `scratch_local` (index 1) and `apply_locals[0]` (index 2).
+    let ctx = ctx_with_cond_depth(0);
+    let mut func = Function::new(opcode_fn_locals(0, 0, 0));
+    // result_addr (i32) for the trailing store, then x = mem[0].
+    func.instruction(&Instruction::I32Const(0));
+    func.instruction(&Instruction::I32Const(0));
+    func.instruction(&Instruction::F64Load(memarg(0)));
+    crate::wasmgen::vector::emit_round_half_away(&mut func, ctx.scratch_local, ctx.apply_locals[0]);
+    func.instruction(&Instruction::F64Store(memarg(0)));
+    func.instruction(&Instruction::End);
+    code.function(&func);
+    module.section(&code);
+
+    module.finish()
+}
+
+/// Run the round probe over input `x` and return the f64 the helper produced.
+fn run_round_half_away(x: f64) -> f64 {
+    let bytes = build_round_probe_module();
+    let info = validate(&bytes).expect("round-probe module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("round-probe module must instantiate")
+        .module_addr;
+
+    let mem = store
+        .instance_export(module, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    store.mem_access_mut_slice(mem, |bytes| {
+        bytes[0..8].copy_from_slice(&x.to_le_bytes());
+    });
+
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+        .expect("round-probe invocation must succeed");
+
+    store.mem_access_mut_slice(mem, |bytes| {
+        f64::from_le_bytes(bytes[0..8].try_into().unwrap())
+    })
+}
+
+/// Assert the emitted round helper reproduces `f64::round` (the VM's rounding
+/// oracle) bit-for-bit, including the sign of a zero result. Cross-checking
+/// against the standard-library oracle is the whole point: the prior
+/// `trunc(x + copysign(0.5, x))` form diverged from it for two reachable input
+/// classes (see `emit_round_half_away`'s rustdoc).
+fn assert_round_matches_f64_round(x: f64) {
+    let got = run_round_half_away(x);
+    let want = x.round();
+    if want.is_nan() {
+        assert!(got.is_nan(), "round({x}): expected NaN, got {got}");
+    } else {
+        assert_eq!(
+            got.to_bits(),
+            want.to_bits(),
+            "round({x}): got {got} (bits {:#x}), want {want} (bits {:#x})",
+            got.to_bits(),
+            want.to_bits()
+        );
+    }
+}
+
+#[test]
+fn round_half_away_matches_f64_round_boundary_classes() {
+    // Class (a): the largest f64 strictly below 0.5. `trunc(x + 0.5)` rounds the
+    // sum up to exactly 1.0 and yields 1; `f64::round` yields 0. The sign of the
+    // zero must be preserved (`-0.0` for the negative input).
+    let just_below_half = 0.499_999_999_999_999_94_f64; // == 0.5_f64.next_down()
+    assert_eq!(just_below_half, f64::from_bits(0x3fdf_ffff_ffff_ffff));
+    assert_round_matches_f64_round(just_below_half);
+    assert_round_matches_f64_round(-just_below_half);
+    assert_eq!(run_round_half_away(just_below_half), 0.0);
+    assert!(run_round_half_away(-just_below_half).is_sign_negative());
+
+    // Class (b): an already-integer magnitude in [2^52, 2^53). `x + 0.5` rounds
+    // up to `x + 1`; `f64::round` returns `x` unchanged.
+    let big_odd_int = 4_503_599_627_370_497.0_f64; // 2^52 + 1
+    assert_round_matches_f64_round(big_odd_int);
+    assert_round_matches_f64_round(-big_odd_int);
+    assert_eq!(run_round_half_away(big_odd_int), big_odd_int);
+
+    // Exact-half inputs: round AWAY from zero (the VM's `f64::round`), not the
+    // half-to-even of wasm `f64.nearest`.
+    for &x in &[0.5_f64, -0.5, 1.5, 2.5, -2.5, -0.0, 0.0] {
+        assert_round_matches_f64_round(x);
+    }
+}
+
+#[test]
+fn round_half_away_matches_f64_round_sampled() {
+    // A deterministic sweep of magnitudes/signs/fractions cross-checked against
+    // the `f64::round` oracle, so a future change to the helper that drifts from
+    // the VM's rounding is caught here, not only in the two boundary classes.
+    let mut state = 0x2545_f491_4f6c_dd1d_u64; // xorshift64* seed
+    let mut next = || {
+        state ^= state >> 12;
+        state ^= state << 25;
+        state ^= state >> 27;
+        state.wrapping_mul(0x2545_f491_4f6c_dd1d)
+    };
+    for _ in 0..2000 {
+        let bits = next();
+        // Span small fractions through large integer-grid magnitudes.
+        let scale = match bits % 5 {
+            0 => 1.0,
+            1 => 16.0,
+            2 => 1024.0,
+            3 => 4_503_599_627_370_496.0, // 2^52
+            _ => 9_007_199_254_740_992.0, // 2^53
+        };
+        let frac = (bits >> 8) as f64 / (u64::MAX >> 8) as f64; // [0, 1)
+        let mag = frac * scale * 2.0;
+        let x = if bits & 1 == 0 { mag } else { -mag };
+        assert_round_matches_f64_round(x);
+    }
+}
+
+#[test]
+fn vector_elm_map_sliced_source_base_i_matches_vm() {
+    // A strict-slice source: a 2-D source [DimA(2), DimB(3)] (full storage 6
+    // elements in curr 0..6), sliced... here we exercise the carried-axis base_i
+    // projection via a source whose remaining dim shares its dim_id with the
+    // offset view. Source = matrix[A,B] row-major; offset view is 2-D [A,B] with
+    // matching dim_ids, so element (a,b) reads source[base_i(a) + round(off)].
+    //
+    // Build source as [A(2), B(3)] dim_ids [0,1] over storage [0..6], and offset
+    // as [A(2)] dim_id [0] -- but VECTOR ELM MAP needs offset.size() result
+    // slots, so use a 2-D offset matching the result. We model the genuine
+    // shape: source full storage len 6, source view is the full [2,3], offset
+    // [2,3] with the same dim_ids; base_i is 0 (full array) and offset indexes
+    // the whole storage. To exercise a NON-zero base_i we instead slice the
+    // source to a single row and give the offset that row's dim.
+    //
+    // Simpler faithful base_i case: source view = row 1 of a [2,3] matrix
+    // (offset folds in 3), dim_ids [1] (DimB); offset view [3] dim_id [1]. Then
+    // base_i = source.flat_offset([b]) projects DimB, and the result reads
+    // storage[3 + round(off)]. full_source_len = 6.
+    let mut source = dense_view(0, &[3]); // the sliced row: dims [3]
+    source.offset = 3; // row 1 of a [2,3] matrix starts at flat 3
+    source.dim_ids = SmallVec::from_slice(&[1]); // DimB
+    let mut offset = dense_view(6, &[3]);
+    offset.dim_ids = SmallVec::from_slice(&[1]); // DimB, matching the source
+    // Storage: matrix rows [r0: 100,101,102][r1: 200,201,202]; offsets [0,1,2].
+    let data = [100.0, 101.0, 102.0, 200.0, 201.0, 202.0, 0.0, 1.0, 2.0];
+    assert_elm_map_matches(&source, &offset, 6, &data, 3);
+    let got = run_vector_elm_map(source, offset, 6, &data, 3, 3);
+    // base_i for element b is source.flat_offset([b]) = 3 + b; + round(off[b]):
+    //   b=0: 3 + 0 -> storage[3]=200; b=1: 4 + 1 -> storage[5]=202;
+    //   b=2: 5 + 2 = 7 -> OOB (>=6) -> NaN.
+    assert_eq!(got[0], 200.0);
+    assert_eq!(got[1], 202.0);
+    assert!(got[2].is_nan());
+}
+
+// ── VectorSortOrder / Rank parity vs the VM (stable sort) ─────────────────
+
+/// Run `PushStaticView(input); Vector{SortOrder|Rank}` over a `curr` slab seeded
+/// from `data`, writing temp 0, and read back `temp_count` temp slots. The
+/// `direction` operand is pushed beneath the op.
+fn run_sort_op(
+    input: StaticArrayView,
+    op: Opcode,
+    direction: f64,
+    data: &[f64],
+    temp_count: usize,
+    temp_slots: usize,
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], temp_slots);
+    let in_id = context.add_static_view(input);
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // direction
+        Opcode::PushStaticView { view_id: in_id },
+        op,
+        Opcode::PopView {},
+    ];
+    run_and_read_temps(
+        &context,
+        code,
+        vec![direction],
+        &seed_run(0, data),
+        temp_count,
+    )
+}
+
+/// The VM oracle for `VectorSortOrder`: run the sibling
+/// `crate::vm_vector_sort_order::vector_sort_order` over a `RuntimeView`.
+fn vm_sort_order_oracle(
+    input: &StaticArrayView,
+    direction: i32,
+    data: &[f64],
+    temp_slots: usize,
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], temp_slots);
+    let mut temp_storage = vec![0.0f64; temp_slots];
+    crate::vm_vector_sort_order::vector_sort_order(
+        &input.to_runtime_view(),
+        direction,
+        0,
+        data,
+        &mut temp_storage,
+        &context,
+    );
+    temp_storage
+}
+
+/// A faithful local oracle for `Rank` (mirroring `vm.rs:2540-2584`): over the
+/// whole view, collect `(value, orig_idx)`, stable sort (asc if direction==1
+/// else desc, NaN-as-Equal), write `temp[orig_idx] = rank_0based + 1`.
+fn vm_rank_oracle(
+    input: &StaticArrayView,
+    direction: i32,
+    data: &[f64],
+    temp_slots: usize,
+) -> Vec<f64> {
+    let rv = input.to_runtime_view();
+    let size = rv.size();
+    let mut indexed: Vec<(f64, usize)> = Vec::with_capacity(size);
+    let mut idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; rv.dims.len()];
+    for i in 0..size {
+        let flat = rv.flat_offset(&idx);
+        indexed.push((data[rv.base_off as usize + flat], i));
+        crate::vm::increment_indices(&mut idx, &rv.dims);
+    }
+    if direction == 1 {
+        indexed.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
+    } else {
+        indexed.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
+    }
+    let mut temp = vec![0.0f64; temp_slots];
+    for (rank_0based, &(_, orig_idx)) in indexed.iter().enumerate() {
+        temp[orig_idx] = (rank_0based + 1) as f64;
+    }
+    temp
+}
+
+fn assert_sort_order_matches(input: &StaticArrayView, direction: f64, data: &[f64], slots: usize) {
+    let got = run_sort_op(
+        input.clone(),
+        Opcode::VectorSortOrder { write_temp_id: 0 },
+        direction,
+        data,
+        slots,
+        slots,
+    );
+    let want = vm_sort_order_oracle(input, direction.round() as i32, data, slots);
+    assert_eq!(got, want, "sort_order direction {direction}");
+}
+
+fn assert_rank_matches(input: &StaticArrayView, direction: f64, data: &[f64], slots: usize) {
+    let got = run_sort_op(
+        input.clone(),
+        Opcode::Rank { write_temp_id: 0 },
+        direction,
+        data,
+        slots,
+        slots,
+    );
+    let want = vm_rank_oracle(input, direction.round() as i32, data, slots);
+    assert_eq!(got, want, "rank direction {direction}");
+}
+
+#[test]
+fn vector_sort_order_1d_ascending_matches_vm() {
+    // input [30, 10, 20, 40]; ascending -> the sorted in-row source indices are
+    // [1 (10), 2 (20), 0 (30), 3 (40)].
+    let input = dense_view(0, &[4]);
+    let data = [30.0, 10.0, 20.0, 40.0];
+    assert_sort_order_matches(&input, 1.0, &data, 4);
+    let got = run_sort_op(
+        input,
+        Opcode::VectorSortOrder { write_temp_id: 0 },
+        1.0,
+        &data,
+        4,
+        4,
+    );
+    assert_eq!(got, vec![1.0, 2.0, 0.0, 3.0]);
+}
+
+#[test]
+fn vector_sort_order_1d_descending_matches_vm() {
+    // direction != 1 sorts descending: [30,10,20,40] -> indices of [40,30,20,10]
+    // = [3, 0, 2, 1].
+    let input = dense_view(0, &[4]);
+    let data = [30.0, 10.0, 20.0, 40.0];
+    assert_sort_order_matches(&input, 0.0, &data, 4);
+    let got = run_sort_op(
+        input,
+        Opcode::VectorSortOrder { write_temp_id: 0 },
+        0.0,
+        &data,
+        4,
+        4,
+    );
+    assert_eq!(got, vec![3.0, 0.0, 2.0, 1.0]);
+}
+
+#[test]
+fn vector_sort_order_tie_stability_matches_vm() {
+    // Equal values keep input order (stable). [5, 5, 1, 5]: ascending sorts the
+    // single 1 (index 2) first, then the three 5s in input order [0, 1, 3].
+    let input = dense_view(0, &[4]);
+    let data = [5.0, 5.0, 1.0, 5.0];
+    assert_sort_order_matches(&input, 1.0, &data, 4);
+    let got = run_sort_op(
+        input,
+        Opcode::VectorSortOrder { write_temp_id: 0 },
+        1.0,
+        &data,
+        4,
+        4,
+    );
+    assert_eq!(got, vec![2.0, 0.0, 1.0, 3.0]);
+}
+
+#[test]
+fn vector_sort_order_multi_row_matches_vm() {
+    // A 2x3 source: each ROW is sorted independently (the innermost dim is the
+    // sorted axis), and result indices are 0-based WITHIN the row. Row 0
+    // [30,10,20] asc -> [1,2,0]; row 1 [5,9,7] asc -> [0,2,1]. The output is
+    // row-major, so temp = [1,2,0, 0,2,1].
+    let input = dense_view(0, &[2, 3]);
+    let data = [30.0, 10.0, 20.0, 5.0, 9.0, 7.0];
+    assert_sort_order_matches(&input, 1.0, &data, 6);
+    let got = run_sort_op(
+        input,
+        Opcode::VectorSortOrder { write_temp_id: 0 },
+        1.0,
+        &data,
+        6,
+        6,
+    );
+    assert_eq!(got, vec![1.0, 2.0, 0.0, 0.0, 2.0, 1.0]);
+}
+
+#[test]
+fn vector_sort_order_nan_element_is_stable_like_vm() {
+    // A NaN element compares Equal to everything (the VM's
+    // partial_cmp.unwrap_or(Equal) under a stable sort), so it neither displaces
+    // a non-NaN nor reorders -- it stays in input order. Cross-checked
+    // element-for-element vs the sibling VM function.
+    let input = dense_view(0, &[4]);
+    let data = [3.0, f64::NAN, 1.0, 2.0];
+    assert_sort_order_matches(&input, 1.0, &data, 4);
+    assert_sort_order_matches(&input, 0.0, &data, 4);
+}
+
+#[test]
+fn vector_sort_order_transposed_view_matches_vm() {
+    // A non-contiguous (transposed) view exercises the strided element reads in
+    // the gather. Cross-checked vs the sibling over every element.
+    let view = StaticArrayView {
+        base_off: 0,
+        is_temp: false,
+        dims: SmallVec::from_slice(&[3, 2]),
+        strides: SmallVec::from_slice(&[1, 3]),
+        offset: 0,
+        sparse: SmallVec::new(),
+        dim_ids: SmallVec::from_slice(&[0, 0]),
+    };
+    assert!(!view.to_runtime_view().is_contiguous());
+    let data = [11.0, 12.0, 13.0, 21.0, 22.0, 23.0];
+    assert_sort_order_matches(&view, 1.0, &data, 6);
+    assert_sort_order_matches(&view, 0.0, &data, 6);
+}
+
+#[test]
+fn rank_whole_view_ascending_matches_vm() {
+    // Rank over the WHOLE view, 1-based, indexed by ORIGINAL position. [30,10,20,
+    // 40] ascending: 10 is rank 1, 20 rank 2, 30 rank 3, 40 rank 4, so the result
+    // at the original positions is [3, 1, 2, 4].
+    let input = dense_view(0, &[4]);
+    let data = [30.0, 10.0, 20.0, 40.0];
+    assert_rank_matches(&input, 1.0, &data, 4);
+    let got = run_sort_op(input, Opcode::Rank { write_temp_id: 0 }, 1.0, &data, 4, 4);
+    assert_eq!(got, vec![3.0, 1.0, 2.0, 4.0]);
+}
+
+#[test]
+fn rank_whole_view_descending_matches_vm() {
+    // Descending: 40 rank 1, 30 rank 2, 20 rank 3, 10 rank 4 -> [2, 4, 3, 1].
+    let input = dense_view(0, &[4]);
+    let data = [30.0, 10.0, 20.0, 40.0];
+    assert_rank_matches(&input, 0.0, &data, 4);
+    let got = run_sort_op(input, Opcode::Rank { write_temp_id: 0 }, 0.0, &data, 4, 4);
+    assert_eq!(got, vec![2.0, 4.0, 3.0, 1.0]);
+}
+
+#[test]
+fn rank_multi_dim_ranks_whole_view_not_per_row() {
+    // Unlike VectorSortOrder, Rank ranks the WHOLE view (not per-row). A 2x3
+    // view ranks all 6 cells together. Cross-checked vs the faithful oracle.
+    let input = dense_view(0, &[2, 3]);
+    let data = [30.0, 10.0, 20.0, 5.0, 9.0, 7.0];
+    assert_rank_matches(&input, 1.0, &data, 6);
+    // Sorted ascending: 5(idx3),9(idx4),7(idx5)... actually [5,7,9,10,20,30]
+    // -> ranks at original positions: 30->6, 10->4, 20->5, 5->1, 9->3, 7->2.
+    let got = run_sort_op(input, Opcode::Rank { write_temp_id: 0 }, 1.0, &data, 6, 6);
+    assert_eq!(got, vec![6.0, 4.0, 5.0, 1.0, 3.0, 2.0]);
+}
+
+#[test]
+fn rank_tie_stability_matches_vm() {
+    // Equal values keep input order: [5, 5, 1, 5] ascending. The 1 (idx 2) is
+    // rank 1; the three 5s get ranks 2, 3, 4 in input order (idx 0, 1, 3).
+    let input = dense_view(0, &[4]);
+    let data = [5.0, 5.0, 1.0, 5.0];
+    assert_rank_matches(&input, 1.0, &data, 4);
+    let got = run_sort_op(input, Opcode::Rank { write_temp_id: 0 }, 1.0, &data, 4, 4);
+    assert_eq!(got, vec![2.0, 3.0, 1.0, 4.0]);
+}
+
+#[test]
+fn rank_nan_element_matches_vm() {
+    // A NaN element compares Equal (stable). Cross-checked vs the faithful oracle
+    // (the NaN keeps its input position in the stable sort, so its rank is its
+    // sorted slot among the Equal-treated elements).
+    let input = dense_view(0, &[4]);
+    let data = [3.0, f64::NAN, 1.0, 2.0];
+    assert_rank_matches(&input, 1.0, &data, 4);
+    assert_rank_matches(&input, 0.0, &data, 4);
+}
+
+/// Build `mat[rows][cols]` via `PushVarViewDirect`, dynamically subscript dim 0
+/// with an out-of-range `row_1based` (so the resulting `cols`-element row view's
+/// validity flag is 0), run `op` writing temp 0, and read back the `cols` temp
+/// slots. An invalid input view must fill the whole temp region with NaN.
+fn run_dyn_sort_op(rows: u16, cols: u16, row_1based: f64, op: Opcode, data: &[f64]) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.add_dim_list(2, [rows, cols, 0, 0]);
+    context.set_temp_info(vec![0], cols as usize);
+    let code = vec![
+        Opcode::PushVarViewDirect {
+            base_off: 0,
+            dim_list_id: 0,
+        },
+        Opcode::LoadConstant { id: 0 }, // direction
+        Opcode::LoadConstant { id: 1 }, // runtime row index (1-based)
+        Opcode::ViewSubscriptDynamic { dim_idx: 0 },
+        op,
+        Opcode::PopView {},
+    ];
+    run_and_read_temps(
+        &context,
+        code,
+        vec![1.0, row_1based],
+        &seed_run(0, data),
+        cols as usize,
+    )
+}
+
+#[test]
+fn vector_sort_order_invalid_view_fills_temp_with_nan() {
+    // A 3x4 matrix; row 5 is out of range, so the dynamically-subscripted row
+    // view is invalid and VectorSortOrder must fill the whole temp with NaN
+    // (the VM's `!is_valid -> fill_temp_nan`).
+    let data: Vec<f64> = (0..12).map(|i| i as f64).collect();
+    let got = run_dyn_sort_op(
+        3,
+        4,
+        5.0,
+        Opcode::VectorSortOrder { write_temp_id: 0 },
+        &data,
+    );
+    assert!(
+        got.iter().all(|v| v.is_nan()),
+        "invalid view must fill the temp with NaN, got {got:?}"
+    );
+    // A valid row (row 2) writes real 0-based in-row ranks (no NaN).
+    let ok = run_dyn_sort_op(
+        3,
+        4,
+        2.0,
+        Opcode::VectorSortOrder { write_temp_id: 0 },
+        &data,
+    );
+    assert!(ok.iter().all(|v| !v.is_nan()), "valid row must not be NaN");
+}
+
+#[test]
+fn rank_invalid_view_fills_temp_with_nan() {
+    let data: Vec<f64> = (0..12).map(|i| i as f64).collect();
+    let got = run_dyn_sort_op(3, 4, 5.0, Opcode::Rank { write_temp_id: 0 }, &data);
+    assert!(
+        got.iter().all(|v| v.is_nan()),
+        "invalid view must fill the temp with NaN, got {got:?}"
+    );
+    let ok = run_dyn_sort_op(3, 4, 2.0, Opcode::Rank { write_temp_id: 0 }, &data);
+    assert!(ok.iter().all(|v| !v.is_nan()), "valid row must not be NaN");
+}
+
+// ── LookupArray parity vs the VM (per-element arrayed GF) ─────────────────
+
+// GF region base for the LookupArray tests: past the curr/next chunks
+// (4096..8192), TEMP_BASE (8192), and VECTOR_SCRATCH_BASE (16384), within the
+// harness's single 64 KiB page.
+const LA_GF_BASE: u32 = 24576;
+
+/// Seed `tables` into the GF directory + data regions at `LA_GF_BASE` (the
+/// directory's N 8-byte entries, then each table's knots), matching the
+/// production layout the `LookupArray`/`Lookup` opcodes read.
+fn seed_gf_tables(tables: &[&[(f64, f64)]]) -> Vec<(u64, f64)> {
+    let n = tables.len() as u32;
+    let data_base = LA_GF_BASE + n * 8; // past the N directory entries
+    let mut seed = Vec::new();
+    let mut data_rel = 0u32;
+    for (t, knots) in tables.iter().enumerate() {
+        let abs = data_base + data_rel;
+        seed.push((
+            u64::from(LA_GF_BASE) + (t as u64) * 8,
+            dir_entry_f64(abs, knots.len() as u32),
+        ));
+        for (k, &(x, y)) in knots.iter().enumerate() {
+            let knot = u64::from(abs) + (k as u64) * 16;
+            seed.push((knot, x));
+            seed.push((knot + 8, y));
+        }
+        data_rel += knots.len() as u32 * 16;
+    }
+    seed
+}
+
+/// Run `PushStaticView(input); LookupArray{base_gf, table_count, mode}; PopView`
+/// over the seeded GF tables, writing temp 0, and read back `temp_count` slots.
+/// `index` (the shared scalar lookup index) is pushed beneath the opcode.
+#[allow(clippy::too_many_arguments)]
+fn run_lookup_array(
+    input: StaticArrayView,
+    base_gf: GraphicalFunctionId,
+    table_count: u16,
+    mode: LookupMode,
+    index: f64,
+    tables: &[&[(f64, f64)]],
+    temp_count: usize,
+    temp_slots: usize,
+    input_data: &[f64],
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], temp_slots);
+    let in_id = context.add_static_view(input);
+    let ctx = EmitCtx {
+        gf_directory_base: LA_GF_BASE,
+        gf_data_base: LA_GF_BASE,
+        temp_storage_base: TEMP_BASE,
+        ctx: &context,
+        ..ctx_with_cond_depth(0)
+    };
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // index
+        Opcode::PushStaticView { view_id: in_id },
+        Opcode::LookupArray {
+            base_gf,
+            table_count,
+            mode,
+            write_temp_id: 0,
+        },
+        Opcode::PopView {},
+    ];
+    let mut seed = seed_run(0, input_data);
+    seed.extend(seed_gf_tables(tables));
+    let bytes = build_module(&bc(vec![index], code), &ctx, false, 0);
+    let info = validate(&bytes).expect("emitted module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let mem = store
+        .instance_export(inst, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    store.mem_access_mut_slice(mem, |b| {
+        for &(addr, v) in &seed {
+            let a = addr as usize;
+            b[a..a + 8].copy_from_slice(&v.to_le_bytes());
+        }
+    });
+    let eval = store
+        .instance_export(inst, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+        .expect("invoke");
+    store.mem_access_mut_slice(mem, |b| {
+        (0..temp_count)
+            .map(|i| {
+                let a = TEMP_BASE as usize + i * 8;
+                f64::from_le_bytes(b[a..a + 8].try_into().unwrap())
+            })
+            .collect()
+    })
+}
+
+/// Faithful oracle for `LookupArray` (mirroring `vm.rs:2586-2629`): for each
+/// element `i`, `elem_off = flat_offset(indices)`; NaN if `elem_off >=
+/// table_count`, else the VM lookup over `tables[base_gf + elem_off]` at `index`.
+fn vm_lookup_array_oracle(
+    input: &StaticArrayView,
+    base_gf: GraphicalFunctionId,
+    table_count: u16,
+    mode: LookupMode,
+    index: f64,
+    tables: &[&[(f64, f64)]],
+    temp_slots: usize,
+) -> Vec<f64> {
+    let rv = input.to_runtime_view();
+    let size = rv.size();
+    let mut idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; rv.dims.len()];
+    let mut temp = vec![0.0f64; temp_slots];
+    for slot in temp.iter_mut().take(size) {
+        let elem_off = rv.flat_offset(&idx);
+        *slot = if elem_off >= table_count as usize {
+            f64::NAN
+        } else {
+            let gf = tables[base_gf as usize + elem_off];
+            vm_lookup_oracle(mode, gf, index)
+        };
+        crate::vm::increment_indices(&mut idx, &rv.dims);
+    }
+    temp
+}
+
+#[allow(clippy::too_many_arguments)]
+fn assert_lookup_array_matches(
+    input: &StaticArrayView,
+    base_gf: GraphicalFunctionId,
+    table_count: u16,
+    mode: LookupMode,
+    index: f64,
+    tables: &[&[(f64, f64)]],
+    slots: usize,
+    input_data: &[f64],
+) {
+    let got = run_lookup_array(
+        input.clone(),
+        base_gf,
+        table_count,
+        mode,
+        index,
+        tables,
+        slots,
+        slots,
+        input_data,
+    );
+    let want = vm_lookup_array_oracle(input, base_gf, table_count, mode, index, tables, slots);
+    assert_eq!(got.len(), want.len());
+    for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() {
+        if w.is_nan() {
+            assert!(g.is_nan(), "lookup_array slot {i}: expected NaN, got {g}");
+        } else {
+            assert_eq!(g, w, "lookup_array slot {i}: got {g}, want {w}");
+        }
+    }
+}
+
+#[test]
+fn lookup_array_interp_matches_vm() {
+    // Three per-element tables; a contiguous 3-element input view -> elem_off
+    // [0, 1, 2]. Each element looks up its own table at the shared index.
+    let t0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)]; // y = 10x
+    let t1: &[(f64, f64)] = &[(0.0, 1.0), (10.0, 2.0)]; // y = x/10 + 1
+    let t2: &[(f64, f64)] = &[(0.0, 5.0), (10.0, 5.0)]; // constant 5
+    let tables = [t0, t1, t2];
+    let input = dense_view(0, &[3]);
+    let input_data = [0.0, 0.0, 0.0];
+    assert_lookup_array_matches(
+        &input,
+        0,
+        3,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        3,
+        &input_data,
+    );
+    let got = run_lookup_array(
+        input,
+        0,
+        3,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        3,
+        3,
+        &input_data,
+    );
+    // index 5: t0 interp 50, t1 interp 1.5, t2 constant 5.
+    assert_eq!(got, vec![50.0, 1.5, 5.0]);
+}
+
+/// A monotonic-x table fixture (reused across modes/indices).
+const LA_TABLE_A: &[(f64, f64)] = &[(0.0, 10.0), (1.0, 20.0), (2.5, 5.0), (4.0, 40.0)];
+const LA_TABLE_B: &[(f64, f64)] = &[(0.0, 0.0), (2.0, 8.0), (2.0, 12.0), (5.0, 50.0)];
+
+#[test]
+fn lookup_array_all_modes_over_domain_match_vm() {
+    // Two per-element tables, a 2-element input view (elem_off [0, 1]). For each
+    // mode, probe several indices spanning below/at/between/above the knots; each
+    // element's result must match the corresponding VM lookup over its table.
+    let tables = [LA_TABLE_A, LA_TABLE_B];
+    let input = dense_view(0, &[2]);
+    let input_data = [0.0, 0.0];
+    for mode in [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ] {
+        for &index in &[-1.0, 0.0, 0.5, 1.0, 2.0, 2.001, 3.25, 4.0, 100.0] {
+            assert_lookup_array_matches(&input, 0, 2, mode, index, &tables, 2, &input_data);
+        }
+    }
+}
+
+#[test]
+fn lookup_array_out_of_range_element_offset_is_nan() {
+    // table_count = 2, but the input view has 3 elements -> elem_off [0, 1, 2].
+    // Element 2's offset (2) is >= table_count (2), so its result is NaN
+    // (matching the scalar Lookup bound), while elements 0 and 1 look up tables
+    // 0 and 1.
+    let tables = [LA_TABLE_A, LA_TABLE_B];
+    let input = dense_view(0, &[3]);
+    let input_data = [0.0, 0.0, 0.0];
+    assert_lookup_array_matches(
+        &input,
+        0,
+        2,
+        LookupMode::Interpolate,
+        1.0,
+        &tables,
+        3,
+        &input_data,
+    );
+    let got = run_lookup_array(
+        input,
+        0,
+        2,
+        LookupMode::Interpolate,
+        1.0,
+        &tables,
+        3,
+        3,
+        &input_data,
+    );
+    assert_eq!(got[0], 20.0); // t0 at index 1 (exact knot)
+    assert!(got[2].is_nan(), "element offset 2 >= table_count 2 -> NaN");
+}
+
+#[test]
+fn lookup_array_base_gf_offsets_into_directory() {
+    // base_gf selects a starting table; a 2-element view with base_gf=1 reads
+    // tables 1 and 2 (NOT 0 and 1). Three tables, table_count covers all three.
+    let t0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)];
+    let t1: &[(f64, f64)] = &[(0.0, 1.0), (10.0, 2.0)];
+    let t2: &[(f64, f64)] = &[(0.0, 7.0), (10.0, 7.0)];
+    let tables = [t0, t1, t2];
+    let input = dense_view(0, &[2]);
+    let input_data = [0.0, 0.0];
+    // base_gf=1, table_count=3 (the bound is on elem_off, not base_gf+elem_off,
+    // matching the VM): elem_off [0,1], tables base_gf+elem_off = [1, 2].
+    assert_lookup_array_matches(
+        &input,
+        1,
+        3,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        2,
+        &input_data,
+    );
+    let got = run_lookup_array(
+        input,
+        1,
+        3,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        2,
+        2,
+        &input_data,
+    );
+    // t1 interp at 5 -> 1.5; t2 constant 7.
+    assert_eq!(got, vec![1.5, 7.0]);
+}
+
+#[test]
+fn lookup_array_strided_view_offsets_match_vm() {
+    // A transposed (non-contiguous) input view exercises the per-element
+    // flat_offset projection for elem_off. dim_ids/strides differ from row-major,
+    // so a mis-addressed elem_off would pick the wrong table. Cross-checked vs the
+    // faithful oracle, which uses the same `flat_offset`.
+    let t0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)];
+    let t1: &[(f64, f64)] = &[(0.0, 1.0), (10.0, 2.0)];
+    let t2: &[(f64, f64)] = &[(0.0, 20.0), (10.0, 30.0)];
+    let t3: &[(f64, f64)] = &[(0.0, 5.0), (10.0, 5.0)];
+    let tables = [t0, t1, t2, t3];
+    // 2x2 transposed: dims [2,2], strides [1,2] -> elem_offs visited row-major
+    // are [0, 2, 1, 3].
+    let input = StaticArrayView {
+        base_off: 0,
+        is_temp: false,
+        dims: SmallVec::from_slice(&[2, 2]),
+        strides: SmallVec::from_slice(&[1, 2]),
+        offset: 0,
+        sparse: SmallVec::new(),
+        dim_ids: SmallVec::from_slice(&[0, 0]),
+    };
+    let input_data = [0.0, 0.0, 0.0, 0.0];
+    assert_lookup_array_matches(
+        &input,
+        0,
+        4,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        4,
+        &input_data,
+    );
+}
+
+#[test]
+fn lookup_array_invalid_view_fills_temp_with_nan() {
+    // A dynamically-subscripted-out-of-range input view -> the whole temp region
+    // is filled with NaN (the VM's `!is_valid -> fill_temp_nan`).
+    let t0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)];
+    let tables = [t0, t0, t0, t0];
+    let mut context = ByteCodeContext::default();
+    context.add_dim_list(2, [3, 4, 0, 0]); // mat[3][4]
+    context.set_temp_info(vec![0], 4);
+    let ctx = EmitCtx {
+        gf_directory_base: LA_GF_BASE,
+        gf_data_base: LA_GF_BASE,
+        temp_storage_base: TEMP_BASE,
+        ctx: &context,
+        ..ctx_with_cond_depth(0)
+    };
+    // mat[5, *]: row 5 out of range -> invalid 4-element row view.
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // index
+        Opcode::PushVarViewDirect {
+            base_off: 0,
+            dim_list_id: 0,
+        },
+        Opcode::LoadConstant { id: 1 }, // runtime row index (1-based)
+        Opcode::ViewSubscriptDynamic { dim_idx: 0 },
+        Opcode::LookupArray {
+            base_gf: 0,
+            table_count: 4,
+            mode: LookupMode::Interpolate,
+            write_temp_id: 0,
+        },
+        Opcode::PopView {},
+    ];
+    let mut seed = seed_run(0, &(0..12).map(|i| i as f64).collect::<Vec<_>>());
+    seed.extend(seed_gf_tables(&tables));
+    let bytes = build_module(&bc(vec![5.0, 5.0], code), &ctx, false, 0);
+    let info = validate(&bytes).expect("module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let mem = store
+        .instance_export(inst, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    store.mem_access_mut_slice(mem, |b| {
+        for &(addr, v) in &seed {
+            let a = addr as usize;
+            b[a..a + 8].copy_from_slice(&v.to_le_bytes());
+        }
+    });
+    let eval = store
+        .instance_export(inst, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+        .expect("invoke");
+    let temps: Vec<f64> = store.mem_access_mut_slice(mem, |b| {
+        (0..4)
+            .map(|i| {
+                let a = TEMP_BASE as usize + i * 8;
+                f64::from_le_bytes(b[a..a + 8].try_into().unwrap())
+            })
+            .collect()
+    });
+    assert!(
+        temps.iter().all(|v| v.is_nan()),
+        "invalid input view must fill the LookupArray temp with NaN, got {temps:?}"
+    );
+}
+
+// ════════════════════════════════════════════════════════════════════════
+// Phase 6 Task 4: AllocateAvailable + AllocateByPriority (opcode lowering)
+//
+// These run the emitted opcode programs under DLR-FT and cross-check the
+// written temp region against the VM's own arm logic (`vm.rs:2631-2794`),
+// which gathers requests/profiles from the views and calls
+// `crate::alloc::allocate_available`. The oracle below reproduces that gather
+// (the `pp_cols`/defaults for AllocateAvailable, the rectangular-profile
+// synthesis for AllocateByPriority) and calls the same `allocate_available`,
+// so a passing test proves the wasm opcode == the VM opcode element-for-element.
+// The full `Vm::new(sim).run_to_end()` parity on a real model lives in
+// `module.rs`'s `compile_simulation_allocate_available_matches_vm`.
+// ════════════════════════════════════════════════════════════════════════
+
+/// Run `PushStaticView(requests); PushStaticView(profile); AllocateAvailable;
+/// PopView; PopView` over a `curr` slab seeded from `data`, writing temp 0, and
+/// read back `n` temp slots. The views are pushed requests-then-profile so
+/// `profile_view = top`, `requests_view = top-1` (matching the VM); `avail` is
+/// the single operand pushed beneath the opcode.
+fn run_allocate_available(
+    requests: StaticArrayView,
+    profile: StaticArrayView,
+    avail: f64,
+    data: &[f64],
+    n: usize,
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], n);
+    let req_id = context.add_static_view(requests);
+    let prof_id = context.add_static_view(profile);
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // avail
+        Opcode::PushStaticView { view_id: req_id },
+        Opcode::PushStaticView { view_id: prof_id },
+        Opcode::AllocateAvailable { write_temp_id: 0 },
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    run_and_read_temps(&context, code, vec![avail], &seed_run(0, data), n)
+}
+
+/// Run `PushStaticView(requests); PushStaticView(priority); AllocateByPriority;
+/// PopView; PopView`. The operands are `width` (pushed first) then `supply`
+/// (pushed last, on top) -- matching the VM's `supply = pop`, `width = pop`.
+fn run_allocate_by_priority(
+    requests: StaticArrayView,
+    priority: StaticArrayView,
+    width: f64,
+    supply: f64,
+    data: &[f64],
+    n: usize,
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], n);
+    let req_id = context.add_static_view(requests);
+    let pri_id = context.add_static_view(priority);
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // width (pushed first)
+        Opcode::LoadConstant { id: 1 }, // supply (pushed second, on top)
+        Opcode::PushStaticView { view_id: req_id },
+        Opcode::PushStaticView { view_id: pri_id },
+        Opcode::AllocateByPriority { write_temp_id: 0 },
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    run_and_read_temps(&context, code, vec![width, supply], &seed_run(0, data), n)
+}
+
+/// The VM `AllocateAvailable` oracle (`vm.rs:2631-2721`): gather requests +
+/// the flattened profile array from the views, build the per-requester
+/// `(ptype, ppriority, pwidth, pextra)` tuples via the `pp_cols`/defaults logic,
+/// then call `crate::alloc::allocate_available`.
+fn vm_allocate_available_oracle(
+    requests_view: &StaticArrayView,
+    profile_view: &StaticArrayView,
+    avail: f64,
+    data: &[f64],
+) -> Vec<f64> {
+    let requests: Vec<f64> = (0..requests_view.to_runtime_view().size())
+        .map(|i| vm_view_element(requests_view, data, i))
+        .collect();
+    let n = requests.len();
+    let pp_size = profile_view.to_runtime_view().size();
+    let pp_values: Vec<f64> = (0..pp_size)
+        .map(|i| vm_view_element(profile_view, data, i))
+        .collect();
+    let pp_cols = if !pp_values.is_empty() && n > 0 && pp_size.is_multiple_of(n) {
+        pp_size / n
+    } else {
+        4
+    };
+    let profiles: Vec<(f64, f64, f64, f64)> = (0..n)
+        .map(|i| {
+            let base = i * pp_cols;
+            let g = |k: usize, dflt: f64| pp_values.get(base + k).copied().unwrap_or(dflt);
+            (g(0, 0.0), g(1, 0.0), g(2, 1.0), g(3, 0.0))
+        })
+        .collect();
+    crate::alloc::allocate_available(&requests, &profiles, avail)
+}
+
+/// The VM `AllocateByPriority` oracle (`vm.rs:2723-2794`): gather requests +
+/// priorities, synthesize rectangular profiles `(1, priorities[i] or 0, width,
+/// 0)`, then call `crate::alloc::allocate_available` with `supply`.
+fn vm_allocate_by_priority_oracle(
+    requests_view: &StaticArrayView,
+    priority_view: &StaticArrayView,
+    width: f64,
+    supply: f64,
+    data: &[f64],
+) -> Vec<f64> {
+    let requests: Vec<f64> = (0..requests_view.to_runtime_view().size())
+        .map(|i| vm_view_element(requests_view, data, i))
+        .collect();
+    let n = requests.len();
+    let priorities: Vec<f64> = (0..priority_view.to_runtime_view().size())
+        .map(|i| vm_view_element(priority_view, data, i))
+        .collect();
+    let profiles: Vec<(f64, f64, f64, f64)> = (0..n)
+        .map(|i| (1.0, priorities.get(i).copied().unwrap_or(0.0), width, 0.0))
+        .collect();
+    crate::alloc::allocate_available(&requests, &profiles, supply)
+}
+
+/// Assert the emitted `AllocateAvailable` matches the VM oracle (NaN as NaN,
+/// else exact -- the wasm helpers are bit-faithful ports, so the only drift is
+/// the leaf `exp`/`pow` approximations; use a tight tolerance).
+fn assert_allocate_available_matches(
+    requests_view: &StaticArrayView,
+    profile_view: &StaticArrayView,
+    avail: f64,
+    data: &[f64],
+    n: usize,
+) {
+    let got = run_allocate_available(requests_view.clone(), profile_view.clone(), avail, data, n);
+    let want = vm_allocate_available_oracle(requests_view, profile_view, avail, data);
+    assert_eq!(got.len(), want.len());
+    for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() {
+        if w.is_nan() {
+            assert!(
+                g.is_nan(),
+                "allocate_available slot {i}: expected NaN, got {g}"
+            );
+        } else {
+            let diff = (g - w).abs();
+            let rel = if w != 0.0 { diff / w.abs() } else { diff };
+            assert!(
+                diff <= 1e-9 || rel <= 1e-9,
+                "allocate_available slot {i}: got {g}, want {w} (diff {diff:.3e})"
+            );
+        }
+    }
+}
+
+#[test]
+fn allocate_available_full_grant_matches_vm() {
+    // avail >= total_demand: each requester gets request.max(0). requests in
+    // curr slots 0..3, the flat profile [3 requesters x 4 fields] in slots 3..15.
+    // Rectangular (ptype 1) profiles. total_demand = 3+2+4 = 9 < avail 100.
+    let requests = dense_view(0, &[3]);
+    let profile = dense_view(3, &[3, 4]);
+    let mut data = vec![3.0, 2.0, 4.0];
+    // Profile rows (region-major): (ptype, ppriority, pwidth, pextra).
+    data.extend_from_slice(&[1.0, 1.0, 1.0, 0.0]);
+    data.extend_from_slice(&[1.0, 2.0, 1.0, 0.0]);
+    data.extend_from_slice(&[1.0, 3.0, 1.0, 0.0]);
+    assert_allocate_available_matches(&requests, &profile, 100.0, &data, 3);
+    // Full grant returns the requests verbatim.
+    let got = run_allocate_available(requests, profile, 100.0, &data, 3);
+    assert_eq!(got, vec![3.0, 2.0, 4.0]);
+}
+
+#[test]
+fn allocate_available_zeros_when_supply_nonpositive_matches_vm() {
+    let requests = dense_view(0, &[3]);
+    let profile = dense_view(3, &[3, 4]);
+    let mut data = vec![3.0, 2.0, 4.0];
+    data.extend_from_slice(&[1.0, 1.0, 1.0, 0.0]);
+    data.extend_from_slice(&[1.0, 2.0, 1.0, 0.0]);
+    data.extend_from_slice(&[1.0, 3.0, 1.0, 0.0]);
+    assert_allocate_available_matches(&requests, &profile, 0.0, &data, 3);
+    let got = run_allocate_available(requests, profile, -5.0, &data, 3);
+    assert_eq!(got, vec![0.0, 0.0, 0.0]);
+}
+
+#[test]
+fn allocate_available_partial_bisection_rectangular_matches_vm() {
+    // 0 < avail < total_demand forces the bisection. Rectangular profiles.
+    let requests = dense_view(0, &[3]);
+    let profile = dense_view(3, &[3, 4]);
+    let mut data = vec![3.0, 2.0, 4.0];
+    data.extend_from_slice(&[1.0, 1.0, 1.0, 0.0]);
+    data.extend_from_slice(&[1.0, 2.0, 1.0, 0.0]);
+    data.extend_from_slice(&[1.0, 3.0, 1.0, 0.0]);
+    for avail in [1.0, 3.0, 5.0, 7.0, 8.5] {
+        assert_allocate_available_matches(&requests, &profile, avail, &data, 3);
+    }
+}
+
+#[test]
+fn allocate_available_partial_bisection_across_profile_types_matches_vm() {
+    // A mix of profile types (fixed/triangular/normal/exponential/CES), so the
+    // search-range `spread` per type and each curve at the converged price are
+    // exercised. 5 requesters x 4 profile fields.
+    let requests = dense_view(0, &[5]);
+    let profile = dense_view(5, &[5, 4]);
+    let mut data = vec![4.0, 3.0, 5.0, 2.0, 6.0];
+    data.extend_from_slice(&[0.0, 2.0, 1.0, 0.0]); // fixed
+    data.extend_from_slice(&[2.0, 3.0, 1.5, 0.0]); // triangular
+    data.extend_from_slice(&[3.0, 2.5, 1.0, 0.0]); // normal
+    data.extend_from_slice(&[4.0, 2.0, 1.2, 0.0]); // exponential
+    data.extend_from_slice(&[5.0, 3.0, 1.0, 2.0]); // CES
+    for avail in [2.0, 6.0, 10.0, 15.0, 19.0] {
+        assert_allocate_available_matches(&requests, &profile, avail, &data, 5);
+    }
+}
+
+#[test]
+fn allocate_available_pp_cols_defaults_when_not_divisible_matches_vm() {
+    // When pp_size is not a multiple of n, pp_cols falls back to 4 and the
+    // out-of-range profile fields take the defaults (0,0,1,0). Here n=3 but the
+    // profile view is 1-D of size 5 (not a multiple of 3), so pp_cols=4 and
+    // every requester reads past the end -> all-default profiles.
+    let requests = dense_view(0, &[3]);
+    let profile = dense_view(3, &[5]);
+    let data = vec![3.0, 2.0, 4.0, 9.0, 9.0, 9.0, 9.0, 9.0];
+    for avail in [0.5, 4.0, 100.0] {
+        assert_allocate_available_matches(&requests, &profile, avail, &data, 3);
+    }
+}
+
+#[test]
+fn allocate_available_invalid_view_fills_temp_with_nan() {
+    // A dynamically-subscripted requests view made invalid at runtime (row index
+    // out of bounds) takes the VM's `fill_temp_nan` short-circuit. Build the
+    // requests view via PushVarViewDirect + an out-of-bounds ViewSubscriptDynamic.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 3);
+    context.add_dim_list(2, [3, 3, 0, 0]); // a [3,3] base for the dynamic subscript
+    let prof_id = context.add_static_view(dense_view(20, &[3, 4]));
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // avail
+        // requests view: PushVarViewDirect over a [3,3] base, then subscript row
+        // 9 (out of bounds) -> invalid view.
+        Opcode::PushVarViewDirect {
+            base_off: 0,
+            dim_list_id: 0,
+        },
+        Opcode::LoadConstant { id: 1 }, // runtime row index (1-based, OOB)
+        Opcode::ViewSubscriptDynamic { dim_idx: 0 },
+        Opcode::PushStaticView { view_id: prof_id },
+        Opcode::AllocateAvailable { write_temp_id: 0 },
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    let bytes = build_module(&bc(vec![5.0, 9.0], code), &ctx, false, 0);
+    let info = validate(&bytes).expect("module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let eval = store
+        .instance_export(inst, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+        .expect("invoke");
+    let mem = store
+        .instance_export(inst, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    let temps: Vec<f64> = store.mem_access_mut_slice(mem, |b| {
+        (0..3)
+            .map(|i| {
+                let a = TEMP_BASE as usize + i * 8;
+                f64::from_le_bytes(b[a..a + 8].try_into().unwrap())
+            })
+            .collect()
+    });
+    assert!(
+        temps.iter().all(|v| v.is_nan()),
+        "invalid input view must fill the AllocateAvailable temp with NaN, got {temps:?}"
+    );
+}
+
+#[test]
+fn allocate_by_priority_full_grant_matches_vm() {
+    // avail >= total_demand: full grant. requests in slots 0..3, priorities in
+    // slots 3..6. width=1, supply=100, total_demand=9.
+    let requests = dense_view(0, &[3]);
+    let priority = dense_view(3, &[3]);
+    let data = vec![3.0, 2.0, 4.0, 1.0, 2.0, 3.0];
+    assert_allocate_by_priority_matches(&requests, &priority, 1.0, 100.0, &data, 3);
+    let got = run_allocate_by_priority(requests, priority, 1.0, 100.0, &data, 3);
+    assert_eq!(got, vec![3.0, 2.0, 4.0]);
+}
+
+#[test]
+fn allocate_by_priority_zeros_when_supply_nonpositive_matches_vm() {
+    let requests = dense_view(0, &[3]);
+    let priority = dense_view(3, &[3]);
+    let data = vec![3.0, 2.0, 4.0, 1.0, 2.0, 3.0];
+    assert_allocate_by_priority_matches(&requests, &priority, 1.0, 0.0, &data, 3);
+}
+
+#[test]
+fn allocate_by_priority_partial_bisection_matches_vm() {
+    // 0 < supply < total_demand forces the bisection over the synthesized
+    // rectangular (ptype 1) profiles. Sweep several partial supplies and widths.
+    let requests = dense_view(0, &[3]);
+    let priority = dense_view(3, &[3]);
+    let data = vec![3.0, 2.0, 4.0, 1.0, 2.0, 3.0];
+    for &(width, supply) in &[(1.0, 1.0), (1.0, 5.0), (2.0, 4.0), (0.5, 7.0), (3.0, 8.5)] {
+        assert_allocate_by_priority_matches(&requests, &priority, width, supply, &data, 3);
+    }
+}
+
+/// Assert the emitted `AllocateByPriority` matches the VM oracle.
+fn assert_allocate_by_priority_matches(
+    requests_view: &StaticArrayView,
+    priority_view: &StaticArrayView,
+    width: f64,
+    supply: f64,
+    data: &[f64],
+    n: usize,
+) {
+    let got = run_allocate_by_priority(
+        requests_view.clone(),
+        priority_view.clone(),
+        width,
+        supply,
+        data,
+        n,
+    );
+    let want = vm_allocate_by_priority_oracle(requests_view, priority_view, width, supply, data);
+    assert_eq!(got.len(), want.len());
+    for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() {
+        if w.is_nan() {
+            assert!(
+                g.is_nan(),
+                "allocate_by_priority slot {i}: expected NaN, got {g}"
+            );
+        } else {
+            let diff = (g - w).abs();
+            let rel = if w != 0.0 { diff / w.abs() } else { diff };
+            assert!(
+                diff <= 1e-9 || rel <= 1e-9,
+                "allocate_by_priority slot {i}: got {g}, want {w} (diff {diff:.3e})"
+            );
+        }
+    }
+}
diff --git a/src/simlin-engine/src/wasmgen/math.rs b/src/simlin-engine/src/wasmgen/math.rs
new file mode 100644
index 000000000..ff42f8403
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/math.rs
@@ -0,0 +1,1319 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+// pattern: Functional Core
+// Pure transformation: each public function emits a self-contained wasm helper
+// `Function` (instruction sequence) for one transcendental. No I/O; the only
+// side effect is in `#[cfg(test)]`, which executes the emitted helpers under the
+// DLR-FT interpreter and compares against Rust `f64`.
+
+//! Open-coded transcendental helpers for the wasm simulation backend.
+//!
+//! WebAssembly's MVP numeric instruction set provides `f64.sqrt`/`abs`/`floor`/
+//! `ceil`/`trunc`/`nearest`/`min`/`max` and the arithmetic/compare ops, but *no*
+//! transcendental instructions (`sin`/`cos`/`exp`/`ln`/...). The bytecode VM
+//! reaches those through libm (`f64::sin` etc., `vm.rs::apply`). To stay a
+//! self-contained module that imports no host math, this backend emits one wasm
+//! helper function per transcendental, each built from range reduction plus a
+//! polynomial/rational kernel over only the natively-available ops (plus
+//! `i64.reinterpret_f64`/`f64.reinterpret_i64` for the exponent/mantissa bit
+//! tricks `exp`/`ln` need).
+//!
+//! ## Accuracy bar
+//!
+//! These need not be bit-identical to libm. The bar is the `simulate.rs`
+//! corpus tolerances (abs `2e-3` / rel `5e-6`, VDF `1%`): a model run through
+//! this backend must clear the same comparison the VM clears. The kernels here
+//! are chosen so each helper's worst-case error over its domain sits *far*
+//! inside that bar (each emitter's rustdoc records the measured worst-case error
+//! and the test that pins it); the slack absorbs any DLR-FT-vs-native rounding
+//! drift. The per-helper unit tests assert against Rust `f64` with a documented
+//! tolerance comfortably tighter than the corpus bar.
+//!
+//! ## Composition
+//!
+//! `tan = sin/cos`, `log10 = ln * (1/ln10)`, `asin = atan(x/sqrt(1-x^2))`,
+//! `acos = pi/2 - asin`, and `pow(x, y) = exp(y * ln x)`. `pow` therefore
+//! matches `f64::powf` only for a positive base; a negative base diverges
+//! (`ln` of a negative is NaN). That is a documented limitation -- no corpus
+//! model raises a negative base to a power -- so it is not chased here.
+//!
+//! ## Wiring
+//!
+//! Each emitter is pushed once by [`super::lower::build_helpers`], which records
+//! the resulting function index in [`super::lower::HelperFns`]; the `Apply`
+//! lowering (`lower.rs`, Phase 2 Task 4) and `Op2::Exp` (Task 3) reference a
+//! helper by that index via `call`. No index is hard-coded.
+
+use wasm_encoder::{Function, Instruction as Ins, ValType};
+
+use super::lower::f64_const;
+
+// ── Shared numeric constants (the kernels' magic numbers) ──────────────────
+
+/// `ln(2)` (the exp/ln exponent <-> natural-log conversion).
+const LN2: f64 = std::f64::consts::LN_2;
+/// `1/ln(2) = log2(e)` (scales `x` to a base-2 exponent count in `exp`).
+const LOG2E: f64 = std::f64::consts::LOG2_E;
+/// `2/pi` (scales `x` to a count of `pi/2` quadrants in `sin`/`cos`).
+const FRAC_2_PI: f64 = std::f64::consts::FRAC_2_PI;
+/// `1/ln(10)` (converts a natural log to a base-10 log).
+const INV_LN10: f64 = 1.0 / std::f64::consts::LN_10;
+
+// IEEE-754 binary64 field geometry, used by the exp/ln bit tricks.
+const EXP_BIAS: i64 = 1023;
+const EXP_MASK: i64 = 0x7ff; // 11 exponent bits
+const MANTISSA_BITS: i64 = 52;
+const MANTISSA_MASK: i64 = 0x000f_ffff_ffff_ffff;
+/// The exponent field of `1.0` (bias), pre-shifted into place: makes a raw
+/// mantissa into a value in `[1, 2)`.
+const ONE_EXP_FIELD: i64 = EXP_BIAS << MANTISSA_BITS;
+
+// `exp` overflow/underflow thresholds (matching `f64::exp`): just past these,
+// `exp(x)` rounds to `+inf` / `0`. Guarding here keeps the `2^k` exponent
+// assembly inside the representable exponent range.
+const EXP_OVERFLOW: f64 = 709.782_712_893_384;
+const EXP_UNDERFLOW: f64 = -745.133_219_101_941_2;
+
+// Cody-Waite three-part split of `pi/2` (the canonical fdlibm constants, each
+// exactly representable in f64; `PIO2_1`'s low mantissa bits are zero so
+// `x - k*PIO2_1` is exact). This keeps `r = x - k*(pi/2)` full-precision for
+// `|k|` up to ~2^20 (sin/cos argument up to ~1e6).
+const PIO2_1: f64 = 1.570_796_251_296_997; // pi/2, high ~33 bits
+const PIO2_2: f64 = 7.549_789_415_861_596e-8; // next chunk
+const PIO2_3: f64 = 5.390_302_529_957_765e-15; // remaining chunk
+
+// atan reduction constants.
+const SQRT3: f64 = 1.732_050_807_568_877_2;
+const TAN_PI_12: f64 = 0.267_949_192_431_122_7; // 2 - sqrt(3) = tan(pi/12)
+
+// ── Horner polynomial evaluation ────────────────────────────────────────────
+
+/// Emit a Horner evaluation of `sum(coeffs[i] * v^i)` where `v` is the f64 in
+/// `var_local`. Coefficients are given low-order-first; the emitter folds them
+/// high-order-first (`acc = acc*v + c`), leaving the result on the stack.
+///
+/// `v` must already be materialized in `var_local` (a plain f64 local) because
+/// Horner reads it once per term and the wasm operand stack is strict LIFO.
+///
+/// Shared with `super::alloc` (the `erfc_approx` Abramowitz-Stegun polynomial
+/// folds with the identical `acc = acc*v + c` order, so reusing this keeps the
+/// emitted op sequence bit-faithful to the Rust reference).
+pub(crate) fn emit_horner(f: &mut Function, var_local: u32, coeffs: &[f64]) {
+    // Start from the highest-order coefficient.
+    let mut it = coeffs.iter().rev();
+    let first = *it
+        .next()
+        .expect("polynomial needs at least one coefficient");
+    f.instruction(&f64_const(first));
+    for &c in it {
+        // acc = acc * v + c
+        f.instruction(&Ins::LocalGet(var_local));
+        f.instruction(&Ins::F64Mul);
+        f.instruction(&f64_const(c));
+        f.instruction(&Ins::F64Add);
+    }
+}
+
+// ── exp ─────────────────────────────────────────────────────────────────────
+
+// `exp` local layout. Param 0 is `x`; the rest are scratch.
+const EXP_X: u32 = 0;
+const EXP_K: u32 = 1; // f64 k = round(x * log2e)
+const EXP_R: u32 = 2; // f64 reduced argument r = x - k*ln2
+const EXP_KI: u32 = 3; // i64 k as integer (the power of two to apply)
+
+/// Taylor coefficients of `exp(r)` (`1/n!`, n = 0..=11). On `|r| <= ln2/2 ~=
+/// 0.347` the degree-11 truncation is ~5e-15 relative -- far inside the bar.
+const EXP_COEFFS: [f64; 12] = [
+    1.0,
+    1.0,
+    1.0 / 2.0,
+    1.0 / 6.0,
+    1.0 / 24.0,
+    1.0 / 120.0,
+    1.0 / 720.0,
+    1.0 / 5040.0,
+    1.0 / 40320.0,
+    1.0 / 362880.0,
+    1.0 / 3628800.0,
+    1.0 / 39916800.0,
+];
+
+/// Emit `exp(x: f64) -> f64`.
+///
+/// Range reduction `x = k*ln2 + r`, `|r| <= ln2/2`, then `exp(x) = 2^k *
+/// exp(r)`: `exp(r)` is the Taylor poly ([`EXP_COEFFS`]), and `2^k` is applied
+/// by adding `k` to the result's IEEE exponent field (`f64.reinterpret_i64`).
+/// Guards: `NaN -> NaN`, `x > EXP_OVERFLOW -> +inf`, `x < EXP_UNDERFLOW -> 0`.
+/// Because the post-guard `exp(r)` is always a normal number in `[0.70, 1.42]`
+/// (exponent field `EXP_BIAS-1` or `EXP_BIAS`) and `k` is bounded by the
+/// guards, the exponent-assembly path needs no subnormal special-case; an
+/// out-of-range assembled exponent still saturates to `+inf`/`0` to be safe.
+///
+/// Worst-case error vs `f64::exp` over `[-700, 700]`: rel `~8e-14`. Pinned by
+/// `exp_matches_f64`.
+pub(crate) fn emit_exp() -> Function {
+    // Locals (param 0 = x): f64 EXP_K(1)/EXP_R(2), i64 EXP_KI(3), then the
+    // `emit_ldexp_exp_field` scratch f64 LDEXP_VAL(4) + i64 LDEXP_BITS(5)/
+    // LDEXP_NEWEXP(6). Declaration order fixes these indices.
+    let mut f = Function::new([
+        (2, ValType::F64),
+        (1, ValType::I64),
+        (1, ValType::F64),
+        (2, ValType::I64),
+    ]);
+
+    // NaN guard: x != x. If NaN, return x (which is NaN).
+    f.instruction(&Ins::LocalGet(EXP_X));
+    f.instruction(&Ins::LocalGet(EXP_X));
+    f.instruction(&Ins::F64Ne);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Empty));
+    f.instruction(&Ins::LocalGet(EXP_X));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // Overflow guard: x > EXP_OVERFLOW -> +inf.
+    f.instruction(&Ins::LocalGet(EXP_X));
+    f.instruction(&f64_const(EXP_OVERFLOW));
+    f.instruction(&Ins::F64Gt);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Empty));
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // Underflow guard: x < EXP_UNDERFLOW -> 0.
+    f.instruction(&Ins::LocalGet(EXP_X));
+    f.instruction(&f64_const(EXP_UNDERFLOW));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Empty));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // k = nearest(x * log2e)
+    f.instruction(&Ins::LocalGet(EXP_X));
+    f.instruction(&f64_const(LOG2E));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::F64Nearest);
+    f.instruction(&Ins::LocalTee(EXP_K));
+    // ki = trunc(k) as i64 (k is integer-valued; saturating is safe).
+    f.instruction(&Ins::I64TruncSatF64S);
+    f.instruction(&Ins::LocalSet(EXP_KI));
+
+    // r = x - k*ln2
+    f.instruction(&Ins::LocalGet(EXP_X));
+    f.instruction(&Ins::LocalGet(EXP_K));
+    f.instruction(&f64_const(LN2));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalSet(EXP_R));
+
+    // poly = exp(r) via Horner; leaves exp(r) on the stack.
+    emit_horner(&mut f, EXP_R, &EXP_COEFFS);
+
+    // Apply 2^k by adding ki to exp(r)'s exponent field.
+    // bits = reinterpret(exp(r)); exp_field = (bits >> 52) & 0x7ff;
+    // new_exp = exp_field + ki.
+    emit_ldexp_exp_field(&mut f, EXP_KI);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+// `emit_ldexp_exp_field` scratch (declared at the END of exp's locals so it does
+// not collide with EXP_X/K/R/KI). The f64 `exp(r)` value is consumed off the
+// stack into a fresh local.
+const LDEXP_VAL: u32 = 4; // f64 exp(r)
+const LDEXP_BITS: u32 = 5; // i64 raw bits of exp(r)
+const LDEXP_NEWEXP: u32 = 6; // i64 candidate new exponent field
+
+/// Consume the f64 on the stack (a *normal* value `e`, here always `exp(r) in
+/// [0.70, 1.42]`) and push `e * 2^ki`, by adding `ki` (in `ki_local`) to `e`'s
+/// IEEE exponent field. If the resulting exponent field is `>= EXP_MASK` push
+/// `+inf` (e is positive here); if `<= 0` push `0`. Both saturations are
+/// defensive: the `exp` over/underflow guards already bound `ki` so the in-range
+/// branch is the one taken across the supported domain.
+///
+/// Requires three scratch locals declared by the caller: a f64 (`LDEXP_VAL`)
+/// and two i64 (`LDEXP_BITS`, `LDEXP_NEWEXP`).
+fn emit_ldexp_exp_field(f: &mut Function, ki_local: u32) {
+    f.instruction(&Ins::LocalSet(LDEXP_VAL));
+
+    // bits = reinterpret(val)
+    f.instruction(&Ins::LocalGet(LDEXP_VAL));
+    f.instruction(&Ins::I64ReinterpretF64);
+    f.instruction(&Ins::LocalSet(LDEXP_BITS));
+
+    // new_exp = ((bits >> 52) & 0x7ff) + ki
+    f.instruction(&Ins::LocalGet(LDEXP_BITS));
+    f.instruction(&Ins::I64Const(MANTISSA_BITS));
+    f.instruction(&Ins::I64ShrU);
+    f.instruction(&Ins::I64Const(EXP_MASK));
+    f.instruction(&Ins::I64And);
+    f.instruction(&Ins::LocalGet(ki_local));
+    f.instruction(&Ins::I64Add);
+    f.instruction(&Ins::LocalSet(LDEXP_NEWEXP));
+
+    // if new_exp >= 0x7ff -> +inf
+    f.instruction(&Ins::LocalGet(LDEXP_NEWEXP));
+    f.instruction(&Ins::I64Const(EXP_MASK));
+    f.instruction(&Ins::I64GeS);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::Else);
+    // if new_exp <= 0 -> 0
+    f.instruction(&Ins::LocalGet(LDEXP_NEWEXP));
+    f.instruction(&Ins::I64Const(0));
+    f.instruction(&Ins::I64LeS);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::Else);
+    // in range: rebuild bits with the new exponent field.
+    // new_bits = (bits & ~(0x7ff << 52)) | (new_exp << 52)
+    f.instruction(&Ins::LocalGet(LDEXP_BITS));
+    f.instruction(&Ins::I64Const(!(EXP_MASK << MANTISSA_BITS)));
+    f.instruction(&Ins::I64And);
+    f.instruction(&Ins::LocalGet(LDEXP_NEWEXP));
+    f.instruction(&Ins::I64Const(MANTISSA_BITS));
+    f.instruction(&Ins::I64Shl);
+    f.instruction(&Ins::I64Or);
+    f.instruction(&Ins::F64ReinterpretI64);
+    f.instruction(&Ins::End); // end inner if
+    f.instruction(&Ins::End); // end outer if
+}
+
+// ── ln ─────────────────────────────────────────────────────────────────────
+
+// `ln` local layout. Param 0 is `x`.
+const LN_X: u32 = 0;
+const LN_E: u32 = 1; // f64 exponent (after centering)
+const LN_M: u32 = 2; // f64 mantissa in [sqrt(2)/2, sqrt(2))
+const LN_S: u32 = 3; // f64 s = (m-1)/(m+1)
+const LN_S2: u32 = 4; // f64 s^2
+const LN_BITS: u32 = 5; // i64 raw bits of x
+
+/// atanh-series coefficients `1/(2k+1)`, k = 0..=6, in `s^2`. On `|s| <= 0.1716`
+/// (`m in [sqrt(2)/2, sqrt(2))`) the degree-13 truncation is ~1e-15 relative.
+const LN_COEFFS: [f64; 7] = [
+    1.0,
+    1.0 / 3.0,
+    1.0 / 5.0,
+    1.0 / 7.0,
+    1.0 / 9.0,
+    1.0 / 11.0,
+    1.0 / 13.0,
+];
+
+/// Emit `ln(x: f64) -> f64`.
+///
+/// Decompose `x = m * 2^e` with `m in [1, 2)` by reading the IEEE exponent and
+/// mantissa fields; center `m` to `[sqrt(2)/2, sqrt(2))` (halve `m` and bump
+/// `e` when `m > sqrt(2)`) so the atanh series in `s = (m-1)/(m+1)` converges
+/// fast; `ln(x) = e*ln2 + 2*(s + s^3/3 + ...)`. Guards: `NaN or x < 0 -> NaN`,
+/// `x == 0 -> -inf`, `+inf -> +inf`. Subnormal `x` (exponent field 0) is
+/// normalized by scaling with `2^54` and subtracting 54 from `e`.
+///
+/// Worst-case error vs `f64::ln` over `[1e-10, 1e10]`: abs `~5e-13`. Pinned by
+/// `ln_matches_f64`.
+pub(crate) fn emit_ln() -> Function {
+    // Locals (param 0 = x): f64 LN_E(1)/LN_M(2)/LN_S(3)/LN_S2(4), i64 LN_BITS(5).
+    let mut f = Function::new([(4, ValType::F64), (1, ValType::I64)]);
+
+    // NaN-or-negative guard: !(x >= 0) (true for NaN and x<0) -> NaN.
+    // x < 0 -> NaN; NaN handled by the same (x != x) check folded in below.
+    // Use: if (x < 0) | (x != x) -> return NaN.
+    f.instruction(&Ins::LocalGet(LN_X));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::LocalGet(LN_X));
+    f.instruction(&Ins::LocalGet(LN_X));
+    f.instruction(&Ins::F64Ne);
+    f.instruction(&Ins::I32Or);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Empty));
+    f.instruction(&f64_const(f64::NAN));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // x == 0 -> -inf.
+    f.instruction(&Ins::LocalGet(LN_X));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Eq);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Empty));
+    f.instruction(&f64_const(f64::NEG_INFINITY));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // +inf -> +inf.
+    f.instruction(&Ins::LocalGet(LN_X));
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::F64Eq);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Empty));
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // Decompose. Handle subnormal (exponent field == 0) by scaling up first.
+    // if ((reinterpret(x) >> 52) & 0x7ff) == 0 { x *= 2^54; e_adjust = -54 }
+    // We fold the adjust into LN_E after extracting the (now-normal) fields.
+    f.instruction(&Ins::LocalGet(LN_X));
+    f.instruction(&Ins::I64ReinterpretF64);
+    f.instruction(&Ins::I64Const(MANTISSA_BITS));
+    f.instruction(&Ins::I64ShrU);
+    f.instruction(&Ins::I64Const(EXP_MASK));
+    f.instruction(&Ins::I64And);
+    f.instruction(&Ins::I64Eqz); // exponent field == 0 (subnormal/zero; zero already handled)
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Result(ValType::F64)));
+    // subnormal: x_scaled = x * 2^54, and remember -54 in LN_E.
+    f.instruction(&Ins::LocalGet(LN_X));
+    f.instruction(&f64_const(f64::from_bits(((EXP_BIAS + 54) as u64) << 52)));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&f64_const(-54.0));
+    f.instruction(&Ins::LocalSet(LN_E));
+    f.instruction(&Ins::Else);
+    // normal: x unchanged, e adjust 0.
+    f.instruction(&Ins::LocalGet(LN_X));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::LocalSet(LN_E));
+    f.instruction(&Ins::End);
+    // stack: [x_norm]. bits = reinterpret(x_norm).
+    f.instruction(&Ins::I64ReinterpretF64);
+    f.instruction(&Ins::LocalSet(LN_BITS));
+
+    // m = mantissa-with-exponent-of-1.0 (value in [1,2)).
+    f.instruction(&Ins::LocalGet(LN_BITS));
+    f.instruction(&Ins::I64Const(MANTISSA_MASK));
+    f.instruction(&Ins::I64And);
+    f.instruction(&Ins::I64Const(ONE_EXP_FIELD));
+    f.instruction(&Ins::I64Or);
+    f.instruction(&Ins::F64ReinterpretI64);
+    f.instruction(&Ins::LocalSet(LN_M));
+
+    // e += (exponent_field - bias).
+    f.instruction(&Ins::LocalGet(LN_E));
+    f.instruction(&Ins::LocalGet(LN_BITS));
+    f.instruction(&Ins::I64Const(MANTISSA_BITS));
+    f.instruction(&Ins::I64ShrU);
+    f.instruction(&Ins::I64Const(EXP_MASK));
+    f.instruction(&Ins::I64And);
+    f.instruction(&Ins::I64Const(EXP_BIAS));
+    f.instruction(&Ins::I64Sub);
+    f.instruction(&Ins::F64ConvertI64S);
+    f.instruction(&Ins::F64Add);
+    f.instruction(&Ins::LocalSet(LN_E));
+
+    // Center: if m > sqrt(2) { m *= 0.5; e += 1 }.
+    f.instruction(&Ins::LocalGet(LN_M));
+    f.instruction(&f64_const(std::f64::consts::SQRT_2));
+    f.instruction(&Ins::F64Gt);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Empty));
+    f.instruction(&Ins::LocalGet(LN_M));
+    f.instruction(&f64_const(0.5));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::LocalSet(LN_M));
+    f.instruction(&Ins::LocalGet(LN_E));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Add);
+    f.instruction(&Ins::LocalSet(LN_E));
+    f.instruction(&Ins::End);
+
+    // s = (m - 1) / (m + 1); s2 = s*s.
+    f.instruction(&Ins::LocalGet(LN_M));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalGet(LN_M));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Add);
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::LocalTee(LN_S));
+    f.instruction(&Ins::LocalGet(LN_S));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::LocalSet(LN_S2));
+
+    // ln(m) = 2 * s * poly(s2); result = e*ln2 + ln(m).
+    f.instruction(&Ins::LocalGet(LN_E));
+    f.instruction(&f64_const(LN2));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&f64_const(2.0));
+    f.instruction(&Ins::LocalGet(LN_S));
+    f.instruction(&Ins::F64Mul);
+    emit_horner(&mut f, LN_S2, &LN_COEFFS);
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::F64Add);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+// ── sin / cos (shared kernel) ────────────────────────────────────────────────
+
+// sin/cos local layout. Param 0 is `x`.
+const SC_X: u32 = 0;
+const SC_K: u32 = 1; // f64 quadrant count
+const SC_R: u32 = 2; // f64 reduced argument in [-pi/4, pi/4]
+const SC_R2: u32 = 3; // f64 r^2
+const SC_SR: u32 = 4; // f64 sin(r)
+const SC_CR: u32 = 5; // f64 cos(r)
+const SC_KQ: u32 = 6; // i64 quadrant index k mod 4
+
+/// `sin(r)/r` Taylor coefficients in `r^2` (so the series is `r * poly(r^2)`):
+/// `(-1)^n / (2n+1)!`, n = 0..=5 (through `r^11`).
+const SIN_COEFFS: [f64; 6] = [
+    1.0,
+    -1.0 / 6.0,
+    1.0 / 120.0,
+    -1.0 / 5040.0,
+    1.0 / 362880.0,
+    -1.0 / 39916800.0,
+];
+
+/// `cos(r)` Taylor coefficients in `r^2`: `(-1)^n / (2n)!`, n = 0..=5 (through
+/// `r^10`).
+const COS_COEFFS: [f64; 6] = [
+    1.0,
+    -1.0 / 2.0,
+    1.0 / 24.0,
+    -1.0 / 720.0,
+    1.0 / 40320.0,
+    -1.0 / 3628800.0,
+];
+
+/// Emit the shared sin/cos body. `want_sin` selects which result the function
+/// returns; both `sin(r)` and `cos(r)` are computed (cheap) and the quadrant
+/// `k mod 4` selects/sign-flips the right one, exactly mirroring the kernel
+/// the prototype validated.
+fn emit_sincos(want_sin: bool) -> Function {
+    // Locals (param 0 = x): f64 SC_K(1)/SC_R(2)/SC_R2(3)/SC_SR(4)/SC_CR(5),
+    // i64 SC_KQ(6).
+    let mut f = Function::new([(5, ValType::F64), (1, ValType::I64)]);
+
+    // NaN/inf guard: if !(|x| < +inf) return NaN. (|x| < inf is false for NaN
+    // and for +-inf.)
+    f.instruction(&Ins::LocalGet(SC_X));
+    f.instruction(&Ins::F64Abs);
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::If(wasm_encoder::BlockType::Empty));
+    f.instruction(&f64_const(f64::NAN));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // k = nearest(x * 2/pi); kq = k mod 4 (normalized to 0..=3).
+    f.instruction(&Ins::LocalGet(SC_X));
+    f.instruction(&f64_const(FRAC_2_PI));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::F64Nearest);
+    f.instruction(&Ins::LocalTee(SC_K));
+    // kq = ((k as i64) % 4 + 4) % 4
+    f.instruction(&Ins::I64TruncSatF64S);
+    f.instruction(&Ins::I64Const(4));
+    f.instruction(&Ins::I64RemS);
+    f.instruction(&Ins::I64Const(4));
+    f.instruction(&Ins::I64Add);
+    f.instruction(&Ins::I64Const(4));
+    f.instruction(&Ins::I64RemS);
+    f.instruction(&Ins::LocalSet(SC_KQ));
+
+    // r = ((x - k*PIO2_1) - k*PIO2_2) - k*PIO2_3.
+    f.instruction(&Ins::LocalGet(SC_X));
+    f.instruction(&Ins::LocalGet(SC_K));
+    f.instruction(&f64_const(PIO2_1));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalGet(SC_K));
+    f.instruction(&f64_const(PIO2_2));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalGet(SC_K));
+    f.instruction(&f64_const(PIO2_3));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalTee(SC_R));
+    // r2 = r*r
+    f.instruction(&Ins::LocalGet(SC_R));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::LocalSet(SC_R2));
+
+    // sr = r * poly_sin(r2)
+    f.instruction(&Ins::LocalGet(SC_R));
+    emit_horner(&mut f, SC_R2, &SIN_COEFFS);
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::LocalSet(SC_SR));
+    // cr = poly_cos(r2)
+    emit_horner(&mut f, SC_R2, &COS_COEFFS);
+    f.instruction(&Ins::LocalSet(SC_CR));
+
+    // Quadrant select. For sin: kq 0->sr, 1->cr, 2->-sr, 3->-cr.
+    // For cos: kq 0->cr, 1->-sr, 2->-cr, 3->sr.
+    // Emit a 4-way nested select keyed on kq.
+    emit_quadrant_select(&mut f, want_sin);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+/// Push the quadrant-selected result for sin (`want_sin`) or cos. Reads
+/// `SC_SR`/`SC_CR`/`SC_KQ`. Implemented as three chained `select`s, keyed on
+/// `kq != n`, avoiding branches.
+///
+/// wasm `select` pops `[a, b, cond]` and yields the *deeper* operand `a` when
+/// `cond != 0`, else the shallower `b`. The running result (the default for
+/// `kq == 0`, refined by earlier iterations) is already on the stack as the
+/// deeper operand; pushing the override `q_n` above it and selecting on
+/// `kq != n` keeps the running value when `kq != n` and switches to `q_n`
+/// otherwise.
+fn emit_quadrant_select(f: &mut Function, want_sin: bool) {
+    // The four results per quadrant (one `push_*` emitter each).
+    let [q0, q1, q2, q3]: [PushFn; 4] = if want_sin {
+        [push_sr, push_cr, push_neg_sr, push_neg_cr]
+    } else {
+        [push_cr, push_neg_sr, push_neg_cr, push_sr]
+    };
+
+    q0(f); // running result, default for kq == 0
+    for (n, push_q) in [(1i64, q1), (2, q2), (3, q3)] {
+        push_q(f); // override candidate (shallower)
+        push_kq_ne(f, n); // cond: keep the running (deeper) value when kq != n
+        f.instruction(&Ins::Select);
+    }
+}
+
+/// An emitter that pushes one quadrant result (`sr`/`cr`/`-sr`/`-cr`) onto the
+/// stack from the precomputed `SC_SR`/`SC_CR` locals.
+type PushFn = fn(&mut Function);
+
+fn push_sr(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(SC_SR));
+}
+fn push_cr(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(SC_CR));
+}
+fn push_neg_sr(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(SC_SR));
+    f.instruction(&Ins::F64Neg);
+}
+fn push_neg_cr(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(SC_CR));
+    f.instruction(&Ins::F64Neg);
+}
+/// Push i32 `1` when `SC_KQ != n`, else `0`. Used as the `select` condition so
+/// the deeper (running) operand is kept when `kq != n`.
+fn push_kq_ne(f: &mut Function, n: i64) {
+    f.instruction(&Ins::LocalGet(SC_KQ));
+    f.instruction(&Ins::I64Const(n));
+    f.instruction(&Ins::I64Ne);
+}
+
+/// Emit `sin(x: f64) -> f64`. Worst-case error vs `f64::sin` over `[-1e6, 1e6]`:
+/// abs `~1.2e-10`. Pinned by `sin_matches_f64`.
+pub(crate) fn emit_sin() -> Function {
+    emit_sincos(true)
+}
+
+/// Emit `cos(x: f64) -> f64`. Worst-case error vs `f64::cos` over `[-1e6, 1e6]`:
+/// abs `~1.2e-10`. Pinned by `cos_matches_f64`.
+pub(crate) fn emit_cos() -> Function {
+    emit_sincos(false)
+}
+
+// ── tan = sin / cos ──────────────────────────────────────────────────────────
+
+const TAN_X: u32 = 0;
+
+/// Emit `tan(x: f64) -> f64` as `sin(x) / cos(x)` by `call`ing the sin/cos
+/// helpers. Worst-case relative error over `[-1.5, 1.5]` (away from the poles):
+/// `~1.5e-10`. Pinned by `tan_matches_f64`.
+///
+/// `sin_idx`/`cos_idx` are the module function indices of [`emit_sin`] /
+/// [`emit_cos`].
+pub(crate) fn emit_tan(sin_idx: u32, cos_idx: u32) -> Function {
+    let mut f = Function::new([]);
+    f.instruction(&Ins::LocalGet(TAN_X));
+    f.instruction(&Ins::Call(sin_idx));
+    f.instruction(&Ins::LocalGet(TAN_X));
+    f.instruction(&Ins::Call(cos_idx));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::End);
+    f
+}
+
+// ── atan ─────────────────────────────────────────────────────────────────────
+
+// atan local layout. Param 0 is `x`.
+const AT_X: u32 = 0;
+const AT_AX: u32 = 1; // f64 |x|
+const AT_Z: u32 = 2; // f64 reduced argument
+const AT_Z2: u32 = 3; // f64 z^2
+const AT_RECIP: u32 = 4; // i32 1 if |x| > 1
+const AT_SHIFT: u32 = 5; // i32 1 if the pi/6 shift was applied
+const AT_SIGN: u32 = 6; // f64 sign of x (+-1)
+
+/// `atan(z)/z` Taylor coefficients in `z^2`: `(-1)^n / (2n+1)`, n = 0..=6
+/// (through `z^13`). On `|z| <= tan(pi/12) ~= 0.268` the truncation is
+/// ~1e-10 relative.
+const ATAN_COEFFS: [f64; 7] = [
+    1.0,
+    -1.0 / 3.0,
+    1.0 / 5.0,
+    -1.0 / 7.0,
+    1.0 / 9.0,
+    -1.0 / 11.0,
+    1.0 / 13.0,
+];
+
+/// Emit `atan(x: f64) -> f64`.
+///
+/// Two-stage range reduction to a small argument:
+/// 1. `|x| > 1` -> `atan(|x|) = pi/2 - atan(1/|x|)` (so `z0 in [0, 1]`).
+/// 2. `z0 > tan(pi/12)` -> `atan(z0) = pi/6 + atan((z0*sqrt3 - 1)/(sqrt3 + z0))`
+///    (so the poly argument `z in [-(2-sqrt3), 2-sqrt3]`).
+///
+/// then `atan(z) = z * poly(z^2)`, undoing the shifts and applying the sign.
+/// `+-inf -> +-pi/2`, `NaN -> NaN` (the poly of a NaN is NaN, and the
+/// reductions preserve it). Worst-case error vs `f64::atan` over `[-1000,
+/// 1000]`: rel `~6e-10`. Pinned by `atan_matches_f64`.
+pub(crate) fn emit_atan() -> Function {
+    use wasm_encoder::BlockType;
+    // Locals (param 0 = x): f64 AT_AX(1)/AT_Z(2)/AT_Z2(3), i32 AT_RECIP(4)/
+    // AT_SHIFT(5), f64 AT_SIGN(6).
+    let mut f = Function::new([(3, ValType::F64), (2, ValType::I32), (1, ValType::F64)]);
+
+    // +inf -> pi/2, -inf -> -pi/2 (handled first so the reciprocal 1/inf = 0
+    // path is not relied upon).
+    f.instruction(&Ins::LocalGet(AT_X));
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::F64Eq);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(std::f64::consts::FRAC_PI_2));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+    f.instruction(&Ins::LocalGet(AT_X));
+    f.instruction(&f64_const(f64::NEG_INFINITY));
+    f.instruction(&Ins::F64Eq);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(-std::f64::consts::FRAC_PI_2));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // sign = x < 0 ? -1 : 1 ; ax = |x|.
+    f.instruction(&f64_const(-1.0));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalGet(AT_X));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::Select);
+    f.instruction(&Ins::LocalSet(AT_SIGN));
+    f.instruction(&Ins::LocalGet(AT_X));
+    f.instruction(&Ins::F64Abs);
+    f.instruction(&Ins::LocalSet(AT_AX));
+
+    // recip = ax > 1 ; z0 = recip ? 1/ax : ax.
+    f.instruction(&Ins::LocalGet(AT_AX));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Gt);
+    f.instruction(&Ins::LocalSet(AT_RECIP));
+    // z0 = select(1/ax, ax, recip)
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalGet(AT_AX));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::LocalGet(AT_AX));
+    f.instruction(&Ins::LocalGet(AT_RECIP));
+    f.instruction(&Ins::Select);
+    f.instruction(&Ins::LocalSet(AT_Z));
+
+    // shift = z0 > tan(pi/12) ; z = shift ? (z0*sqrt3 - 1)/(sqrt3 + z0) : z0.
+    f.instruction(&Ins::LocalGet(AT_Z));
+    f.instruction(&f64_const(TAN_PI_12));
+    f.instruction(&Ins::F64Gt);
+    f.instruction(&Ins::LocalSet(AT_SHIFT));
+    // shifted = (z0*sqrt3 - 1)/(sqrt3 + z0)
+    f.instruction(&Ins::LocalGet(AT_Z));
+    f.instruction(&f64_const(SQRT3));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&f64_const(SQRT3));
+    f.instruction(&Ins::LocalGet(AT_Z));
+    f.instruction(&Ins::F64Add);
+    f.instruction(&Ins::F64Div);
+    // select(shifted, z0, shift)
+    f.instruction(&Ins::LocalGet(AT_Z));
+    f.instruction(&Ins::LocalGet(AT_SHIFT));
+    f.instruction(&Ins::Select);
+    f.instruction(&Ins::LocalTee(AT_Z));
+    // z2 = z*z
+    f.instruction(&Ins::LocalGet(AT_Z));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::LocalSet(AT_Z2));
+
+    // at = z * poly(z2)
+    f.instruction(&Ins::LocalGet(AT_Z));
+    emit_horner(&mut f, AT_Z2, &ATAN_COEFFS);
+    f.instruction(&Ins::F64Mul);
+    // at += shift ? pi/6 : 0
+    f.instruction(&f64_const(std::f64::consts::FRAC_PI_6));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::LocalGet(AT_SHIFT));
+    f.instruction(&Ins::Select);
+    f.instruction(&Ins::F64Add);
+    // at = recip ? pi/2 - at : at
+    // compute (pi/2 - at) and select.
+    f.instruction(&Ins::LocalSet(AT_Z)); // reuse AT_Z to hold the running atan value
+    f.instruction(&f64_const(std::f64::consts::FRAC_PI_2));
+    f.instruction(&Ins::LocalGet(AT_Z));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalGet(AT_Z));
+    f.instruction(&Ins::LocalGet(AT_RECIP));
+    f.instruction(&Ins::Select);
+    // * sign
+    f.instruction(&Ins::LocalGet(AT_SIGN));
+    f.instruction(&Ins::F64Mul);
+
+    f.instruction(&Ins::End);
+    f
+}
+
+// ── asin / acos ───────────────────────────────────────────────────────────────
+
+const AS_X: u32 = 0;
+
+/// Emit `asin(x: f64) -> f64` as `atan(x / sqrt(1 - x^2))` with endpoint and
+/// domain handling: `|x| > 1 -> NaN`, `x == 1 -> pi/2`, `x == -1 -> -pi/2`
+/// (at the endpoints `sqrt(1-x^2)=0` would divide by zero). `NaN -> NaN`.
+/// Worst-case error vs `f64::asin` over `[-1, 1]`: abs `~1.6e-10`. Pinned by
+/// `asin_matches_f64`. `atan_idx` is [`emit_atan`]'s module function index.
+pub(crate) fn emit_asin(atan_idx: u32) -> Function {
+    use wasm_encoder::BlockType;
+    let mut f = Function::new([]);
+
+    // |x| > 1 -> NaN (also catches nothing for NaN; NaN handled by falling
+    // through to the poly which yields NaN, but be explicit:)
+    // if (x > 1) | (x < -1) -> NaN
+    f.instruction(&Ins::LocalGet(AS_X));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Gt);
+    f.instruction(&Ins::LocalGet(AS_X));
+    f.instruction(&f64_const(-1.0));
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::I32Or);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(f64::NAN));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // x == 1 -> pi/2.
+    f.instruction(&Ins::LocalGet(AS_X));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Eq);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(std::f64::consts::FRAC_PI_2));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+    // x == -1 -> -pi/2.
+    f.instruction(&Ins::LocalGet(AS_X));
+    f.instruction(&f64_const(-1.0));
+    f.instruction(&Ins::F64Eq);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(-std::f64::consts::FRAC_PI_2));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // atan(x / sqrt(1 - x*x))
+    f.instruction(&Ins::LocalGet(AS_X));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalGet(AS_X));
+    f.instruction(&Ins::LocalGet(AS_X));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::F64Sqrt);
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::Call(atan_idx));
+    f.instruction(&Ins::End);
+    f
+}
+
+const AC_X: u32 = 0;
+
+/// Emit `acos(x: f64) -> f64` as `pi/2 - asin(x)`. Domain `|x| > 1 -> NaN`
+/// (inherited from asin), `NaN -> NaN`. Worst-case error vs `f64::acos` over
+/// `[-1, 1]`: abs `~1.6e-10`. Pinned by `acos_matches_f64`. `asin_idx` is
+/// [`emit_asin`]'s module function index.
+pub(crate) fn emit_acos(asin_idx: u32) -> Function {
+    let mut f = Function::new([]);
+    f.instruction(&f64_const(std::f64::consts::FRAC_PI_2));
+    f.instruction(&Ins::LocalGet(AC_X));
+    f.instruction(&Ins::Call(asin_idx));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::End);
+    f
+}
+
+// ── log10 = ln * (1/ln10) ─────────────────────────────────────────────────────
+
+const LOG10_X: u32 = 0;
+
+/// Emit `log10(x: f64) -> f64` as `ln(x) * (1/ln10)`. Inherits `ln`'s domain
+/// handling (`x < 0 -> NaN`, `x == 0 -> -inf`). Worst-case error vs
+/// `f64::log10` over `[1e-10, 1e10]`: abs `~2e-13`. Pinned by
+/// `log10_matches_f64`. `ln_idx` is [`emit_ln`]'s module function index.
+pub(crate) fn emit_log10(ln_idx: u32) -> Function {
+    let mut f = Function::new([]);
+    f.instruction(&Ins::LocalGet(LOG10_X));
+    f.instruction(&Ins::Call(ln_idx));
+    f.instruction(&f64_const(INV_LN10));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::End);
+    f
+}
+
+// ── pow = exp(y * ln x) ────────────────────────────────────────────────────────
+
+const POW_X: u32 = 0;
+const POW_Y: u32 = 1;
+
+/// Emit `pow(x: f64, y: f64) -> f64` as `exp(y * ln x)`.
+///
+/// Matches `f64::powf` for a positive base `x`. Special cases mirrored from
+/// `powf`: `y == 0 -> 1` (including `pow(anything, 0) == 1`), `x == 1 -> 1`.
+/// A negative base yields NaN (`ln` of a negative is NaN) -- this is the
+/// documented limitation; no corpus model raises a negative base to a power.
+/// Worst-case relative error over `x in [0.01, 100]`, `y in [-5, 5]`:
+/// `~2.3e-12`. Pinned by `pow_matches_f64`. `exp_idx`/`ln_idx` are the module
+/// function indices of [`emit_exp`] / [`emit_ln`].
+pub(crate) fn emit_pow(exp_idx: u32, ln_idx: u32) -> Function {
+    use wasm_encoder::BlockType;
+    let mut f = Function::new([]);
+
+    // y == 0 -> 1.
+    f.instruction(&Ins::LocalGet(POW_Y));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Eq);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+    // x == 1 -> 1.
+    f.instruction(&Ins::LocalGet(POW_X));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::F64Eq);
+    f.instruction(&Ins::If(BlockType::Empty));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // exp(y * ln(x))
+    f.instruction(&Ins::LocalGet(POW_Y));
+    f.instruction(&Ins::LocalGet(POW_X));
+    f.instruction(&Ins::Call(ln_idx));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::Call(exp_idx));
+    f.instruction(&Ins::End);
+    f
+}
+
+#[cfg(test)]
+mod tests {
+    use super::super::lower::build_helpers;
+    use checked::Store;
+    use wasm::validate;
+    use wasm_encoder::{
+        CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction,
+        MemorySection, MemoryType, Module, TypeSection, ValType,
+    };
+
+    /// Which transcendental helper a test module exports as `f`.
+    #[derive(Clone, Copy)]
+    enum Which {
+        Exp,
+        Ln,
+        Sin,
+        Cos,
+        Tan,
+        Atan,
+        Asin,
+        Acos,
+        Log10,
+        Pow,
+    }
+
+    /// Resolve a [`Which`] to its function index in the assembled helper table.
+    fn helper_index(which: Which) -> u32 {
+        let h = build_helpers().fns;
+        match which {
+            Which::Exp => h.exp,
+            Which::Ln => h.ln,
+            Which::Sin => h.sin,
+            Which::Cos => h.cos,
+            Which::Tan => h.tan,
+            Which::Atan => h.atan,
+            Which::Asin => h.asin,
+            Which::Acos => h.acos,
+            Which::Log10 => h.log10,
+            Which::Pow => h.pow,
+        }
+    }
+
+    /// Build a module containing *every* helper body (so inter-helper `call`s
+    /// resolve) plus a thin exported wrapper `f` that forwards to the
+    /// helper-under-test. Unary helpers export `f(x: f64) -> f64`; `pow` exports
+    /// `f(x: f64, y: f64) -> f64`. Mirrors `lower.rs`'s production assembly:
+    /// helpers occupy function indices `0..N`, the wrapper follows at `N`.
+    fn build_helper_module(which: Which) -> Vec<u8> {
+        let helpers = build_helpers();
+        let n_helpers = helpers.functions.len() as u32;
+        let target = helper_index(which);
+        let binary = matches!(which, Which::Pow);
+
+        let mut module = Module::new();
+
+        // Type 0 is the wrapper's signature; each helper's signature follows.
+        let mut types = TypeSection::new();
+        if binary {
+            types
+                .ty()
+                .function([ValType::F64, ValType::F64], [ValType::F64]);
+        } else {
+            types.ty().function([ValType::F64], [ValType::F64]);
+        }
+        for hf in &helpers.functions {
+            types.ty().function(hf.params.clone(), hf.results.clone());
+        }
+        module.section(&types);
+
+        let mut functions = FunctionSection::new();
+        for (i, _) in helpers.functions.iter().enumerate() {
+            functions.function(1 + i as u32);
+        }
+        functions.function(0);
+        module.section(&functions);
+
+        // The GF lookup helpers (`super::lookup`) `f64.load` from memory 0, so
+        // a module that includes every helper body must declare a memory even
+        // though the transcendental wrappers here never touch it.
+        let mut memories = MemorySection::new();
+        memories.memory(MemoryType {
+            minimum: 1,
+            maximum: None,
+            memory64: false,
+            shared: false,
+            page_size_log2: None,
+        });
+        module.section(&memories);
+
+        let mut exports = ExportSection::new();
+        exports.export("f", ExportKind::Func, n_helpers);
+        module.section(&exports);
+
+        let mut code = CodeSection::new();
+        for hf in &helpers.functions {
+            code.function(&hf.body);
+        }
+        let mut wrapper = Function::new([]);
+        wrapper.instruction(&Instruction::LocalGet(0));
+        if binary {
+            wrapper.instruction(&Instruction::LocalGet(1));
+        }
+        wrapper.instruction(&Instruction::Call(target));
+        wrapper.instruction(&Instruction::End);
+        code.function(&wrapper);
+        module.section(&code);
+
+        module.finish()
+    }
+
+    /// Run a unary helper on `x` under the DLR-FT interpreter. The module is
+    /// (re)built per call; the samples are deliberately small (a few hundred
+    /// points each) so this stays well under the per-test time budget.
+    fn run_unary(which: Which, x: f64) -> f64 {
+        let bytes = build_helper_module(which);
+        let info = validate(&bytes).expect("helper module must validate");
+        let mut store = Store::new(());
+        let module = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("helper module must instantiate")
+            .module_addr;
+        let f = store
+            .instance_export(module, "f")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        store
+            .invoke_simple_typed::<(f64,), f64>(f, (x,))
+            .expect("invocation must succeed")
+    }
+
+    /// Run `pow(x, y)` under the interpreter.
+    fn run_pow(x: f64, y: f64) -> f64 {
+        let bytes = build_helper_module(Which::Pow);
+        let info = validate(&bytes).expect("pow module must validate");
+        let mut store = Store::new(());
+        let module = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("pow module must instantiate")
+            .module_addr;
+        let f = store
+            .instance_export(module, "f")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        store
+            .invoke_simple_typed::<(f64, f64), f64>(f, (x, y))
+            .expect("invocation must succeed")
+    }
+
+    /// A linear sample of `n+1` points across `[lo, hi]` inclusive.
+    fn linspace(lo: f64, hi: f64, n: usize) -> Vec<f64> {
+        (0..=n)
+            .map(|i| lo + (hi - lo) * (i as f64) / (n as f64))
+            .collect()
+    }
+
+    /// Assert `got` matches `want` within absolute *or* relative tolerance,
+    /// propagating the float specials the way the kernels are documented to.
+    fn assert_close(name: &str, x: f64, got: f64, want: f64, abs_tol: f64, rel_tol: f64) {
+        if want.is_nan() {
+            assert!(got.is_nan(), "{name}({x}): expected NaN, got {got}");
+            return;
+        }
+        assert!(!got.is_nan(), "{name}({x}): got NaN, expected {want}");
+        if want.is_infinite() {
+            assert_eq!(got, want, "{name}({x}): expected {want}, got {got}");
+            return;
+        }
+        let abs = (got - want).abs();
+        let rel = if want != 0.0 { abs / want.abs() } else { abs };
+        assert!(
+            abs <= abs_tol || rel <= rel_tol,
+            "{name}({x}): got {got}, want {want} (abs {abs:.3e}, rel {rel:.3e})",
+        );
+    }
+
+    // The corpus bar is abs 2e-3 / rel 5e-6. Every per-helper tolerance below is
+    // far inside that, leaving ample slack for DLR-FT-vs-native rounding drift.
+
+    // ── exp ───────────────────────────────────────────────────────────────
+
+    #[test]
+    fn exp_matches_f64() {
+        // Anchor values exercise the wrapper end-to-end.
+        assert_eq!(run_unary(Which::Exp, 0.0), 1.0);
+        assert_close(
+            "exp",
+            1.0,
+            run_unary(Which::Exp, 1.0),
+            std::f64::consts::E,
+            0.0,
+            1e-12,
+        );
+        // Dense sweep across the representable exponent range.
+        for x in linspace(-700.0, 700.0, 300) {
+            assert_close("exp", x, run_unary(Which::Exp, x), x.exp(), 0.0, 1e-12);
+        }
+        // Edge / special cases.
+        assert!(run_unary(Which::Exp, f64::NAN).is_nan());
+        assert_eq!(run_unary(Which::Exp, f64::INFINITY), f64::INFINITY);
+        assert_eq!(run_unary(Which::Exp, f64::NEG_INFINITY), 0.0);
+        assert_eq!(run_unary(Which::Exp, 720.0), f64::INFINITY); // overflow
+        assert_eq!(run_unary(Which::Exp, -750.0), 0.0); // underflow
+    }
+
+    // ── ln ────────────────────────────────────────────────────────────────
+
+    #[test]
+    fn ln_matches_f64() {
+        assert_eq!(run_unary(Which::Ln, 1.0), 0.0);
+        assert_close(
+            "ln",
+            std::f64::consts::E,
+            run_unary(Which::Ln, std::f64::consts::E),
+            1.0,
+            1e-12,
+            1e-12,
+        );
+        // Geometric sweep over many decades (where ln is interesting).
+        for e in linspace(-300.0, 300.0, 300) {
+            let x = 10f64.powf(e / 30.0);
+            assert_close("ln", x, run_unary(Which::Ln, x), x.ln(), 1e-12, 1e-11);
+        }
+        // Subnormal input (exercises the 2^54 normalization path).
+        let sub = f64::from_bits(1);
+        assert_close("ln", sub, run_unary(Which::Ln, sub), sub.ln(), 1e-9, 1e-12);
+        // Domain edges.
+        assert_eq!(run_unary(Which::Ln, 0.0), f64::NEG_INFINITY);
+        assert!(run_unary(Which::Ln, -1.0).is_nan());
+        assert!(run_unary(Which::Ln, f64::NAN).is_nan());
+        assert_eq!(run_unary(Which::Ln, f64::INFINITY), f64::INFINITY);
+    }
+
+    // ── sin / cos ───────────────────────────────────────────────────────────
+
+    #[test]
+    fn sin_matches_f64() {
+        assert_eq!(run_unary(Which::Sin, 0.0), 0.0);
+        for x in linspace(-100.0, 100.0, 400) {
+            assert_close("sin", x, run_unary(Which::Sin, x), x.sin(), 1e-9, 1e-9);
+        }
+        // A few large arguments to exercise the Cody-Waite reduction.
+        for &x in &[1.0e3, -1.0e4, 1.0e5, -650_400.0] {
+            assert_close("sin", x, run_unary(Which::Sin, x), x.sin(), 1e-8, 1e-7);
+        }
+        assert!(run_unary(Which::Sin, f64::NAN).is_nan());
+        assert!(run_unary(Which::Sin, f64::INFINITY).is_nan());
+    }
+
+    #[test]
+    fn cos_matches_f64() {
+        assert_eq!(run_unary(Which::Cos, 0.0), 1.0);
+        for x in linspace(-100.0, 100.0, 400) {
+            assert_close("cos", x, run_unary(Which::Cos, x), x.cos(), 1e-9, 1e-9);
+        }
+        for &x in &[1.0e3, -1.0e4, 1.0e5, -650_400.0] {
+            assert_close("cos", x, run_unary(Which::Cos, x), x.cos(), 1e-8, 1e-7);
+        }
+        assert!(run_unary(Which::Cos, f64::NAN).is_nan());
+        assert!(run_unary(Which::Cos, f64::NEG_INFINITY).is_nan());
+    }
+
+    // ── tan ─────────────────────────────────────────────────────────────────
+
+    #[test]
+    fn tan_matches_f64() {
+        assert_eq!(run_unary(Which::Tan, 0.0), 0.0);
+        // Stay away from the +-pi/2 poles where the function is ill-conditioned.
+        for x in linspace(-1.4, 1.4, 400) {
+            assert_close("tan", x, run_unary(Which::Tan, x), x.tan(), 1e-9, 1e-8);
+        }
+        assert!(run_unary(Which::Tan, f64::NAN).is_nan());
+    }
+
+    // ── atan ────────────────────────────────────────────────────────────────
+
+    #[test]
+    fn atan_matches_f64() {
+        assert_eq!(run_unary(Which::Atan, 0.0), 0.0);
+        for x in linspace(-1000.0, 1000.0, 400) {
+            assert_close("atan", x, run_unary(Which::Atan, x), x.atan(), 1e-9, 1e-9);
+        }
+        // Dense small region around the two reduction breakpoints (1 and
+        // tan(pi/12)).
+        for x in linspace(-2.0, 2.0, 200) {
+            assert_close("atan", x, run_unary(Which::Atan, x), x.atan(), 1e-9, 1e-9);
+        }
+        assert_close(
+            "atan",
+            f64::INFINITY,
+            run_unary(Which::Atan, f64::INFINITY),
+            std::f64::consts::FRAC_PI_2,
+            1e-12,
+            0.0,
+        );
+        assert_close(
+            "atan",
+            f64::NEG_INFINITY,
+            run_unary(Which::Atan, f64::NEG_INFINITY),
+            -std::f64::consts::FRAC_PI_2,
+            1e-12,
+            0.0,
+        );
+        assert!(run_unary(Which::Atan, f64::NAN).is_nan());
+    }
+
+    // ── asin / acos ───────────────────────────────────────────────────────────
+
+    #[test]
+    fn asin_matches_f64() {
+        for x in linspace(-1.0, 1.0, 400) {
+            assert_close("asin", x, run_unary(Which::Asin, x), x.asin(), 1e-9, 1e-9);
+        }
+        // Exact endpoints.
+        assert_close(
+            "asin",
+            1.0,
+            run_unary(Which::Asin, 1.0),
+            std::f64::consts::FRAC_PI_2,
+            1e-12,
+            0.0,
+        );
+        assert_close(
+            "asin",
+            -1.0,
+            run_unary(Which::Asin, -1.0),
+            -std::f64::consts::FRAC_PI_2,
+            1e-12,
+            0.0,
+        );
+        // Out of domain.
+        assert!(run_unary(Which::Asin, 1.5).is_nan());
+        assert!(run_unary(Which::Asin, -1.5).is_nan());
+        assert!(run_unary(Which::Asin, f64::NAN).is_nan());
+    }
+
+    #[test]
+    fn acos_matches_f64() {
+        for x in linspace(-1.0, 1.0, 400) {
+            assert_close("acos", x, run_unary(Which::Acos, x), x.acos(), 1e-9, 1e-9);
+        }
+        assert_close("acos", 1.0, run_unary(Which::Acos, 1.0), 0.0, 1e-9, 0.0);
+        assert_close(
+            "acos",
+            -1.0,
+            run_unary(Which::Acos, -1.0),
+            std::f64::consts::PI,
+            1e-12,
+            1e-12,
+        );
+        assert!(run_unary(Which::Acos, 1.5).is_nan());
+        assert!(run_unary(Which::Acos, f64::NAN).is_nan());
+    }
+
+    // ── log10 ──────────────────────────────────────────────────────────────
+
+    #[test]
+    fn log10_matches_f64() {
+        assert_close(
+            "log10",
+            1000.0,
+            run_unary(Which::Log10, 1000.0),
+            3.0,
+            1e-12,
+            1e-12,
+        );
+        for e in linspace(-300.0, 300.0, 300) {
+            let x = 10f64.powf(e / 30.0);
+            assert_close(
+                "log10",
+                x,
+                run_unary(Which::Log10, x),
+                x.log10(),
+                1e-12,
+                1e-11,
+            );
+        }
+        assert_eq!(run_unary(Which::Log10, 0.0), f64::NEG_INFINITY);
+        assert!(run_unary(Which::Log10, -1.0).is_nan());
+    }
+
+    // ── pow ─────────────────────────────────────────────────────────────────
+
+    #[test]
+    fn pow_matches_f64() {
+        // y == 0 and x == 1 short-circuits.
+        assert_eq!(run_pow(123.4, 0.0), 1.0);
+        assert_eq!(run_pow(1.0, 567.8), 1.0);
+        // Positive-base grid (the supported regime), integer and fractional y.
+        for i in 0..40 {
+            for j in 0..40 {
+                let x = 0.01 + 100.0 * (i as f64) / 40.0;
+                let y = -5.0 + 10.0 * (j as f64) / 40.0;
+                let want = x.powf(y);
+                if want.is_finite() {
+                    assert_close("pow", x, run_pow(x, y), want, 1e-9, 1e-9);
+                }
+            }
+        }
+        // Known limitation: a negative base diverges (ln of negative is NaN).
+        assert!(run_pow(-2.0, 2.0).is_nan());
+    }
+}
diff --git a/src/simlin-engine/src/wasmgen/mod.rs b/src/simlin-engine/src/wasmgen/mod.rs
new file mode 100644
index 000000000..1057672f2
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/mod.rs
@@ -0,0 +1,64 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+//! WebAssembly code-generation backend.
+//!
+//! This backend is an alternative to the bytecode VM (`crate::vm`). Instead of
+//! interpreting opcodes, it lowers a salsa-compiled `CompiledSimulation` (the
+//! VM's own input) into a self-contained WebAssembly module that runs the whole
+//! simulation in one exported call, writing results into its own linear memory.
+//! The intended use case is interactive scrubbing: compile a model to wasm
+//! once, then re-run it on every slider change at display refresh rates.
+//!
+//! The backend walks every module instance's un-fused opcode programs
+//! (`compiled_initials`/`compiled_flows`/`compiled_stocks`) and emits a wasm
+//! function-triple per `(model, input_set)` instance plus a `run` driver (see
+//! `lower` for the per-opcode lowering and `module` for whole-model assembly).
+//! Modules are emitted with the `wasm-encoder` crate; correctness is validated
+//! in tests by executing the emitted module under the DLR-FT `wasm-interpreter`
+//! and comparing against the bytecode VM.
+//!
+//! Status: the full scalar + array opcode set (every `Op2` operator, every
+//! `Apply` builtin, the view/reducer/iteration/vector ops, scalar/array
+//! lookups), Euler/RK2/RK4 integration, and nested modules (incl. SMOOTH/DELAY
+//! stdlib expansions) are in place. A genuine runtime view range
+//! (`ViewRangeDynamic`) or array unrolling past the per-function budget returns
+//! `WasmGenError::Unsupported`.
+
+mod alloc;
+mod lookup;
+mod lower;
+mod math;
+mod module;
+mod vector;
+mod views;
+
+pub use module::{
+    WasmArtifact, WasmLayout, compile_datamodel_to_artifact, compile_datamodel_to_wasm,
+    compile_simulation,
+};
+
+use std::fmt;
+
+/// Error from the WebAssembly code-generation backend.
+///
+/// The backend covers the full scalar + array opcode set, Euler/RK2/RK4
+/// integration, and nested modules (including SMOOTH/DELAY stdlib expansions).
+/// A genuine runtime view range (`ViewRangeDynamic`) or array unrolling past the
+/// per-function budget returns `Unsupported` rather than silently emitting an
+/// incorrect module.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum WasmGenError {
+    Unsupported(String),
+}
+
+impl fmt::Display for WasmGenError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            WasmGenError::Unsupported(what) => write!(f, "{what}"),
+        }
+    }
+}
+
+impl std::error::Error for WasmGenError {}
diff --git a/src/simlin-engine/src/wasmgen/module.rs b/src/simlin-engine/src/wasmgen/module.rs
new file mode 100644
index 000000000..8839646f4
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/module.rs
@@ -0,0 +1,4557 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+// pattern: Functional Core
+// Pure transformation: a `CompiledSimulation` (or datamodel routed through the
+// in-memory salsa compile) in, a self-contained wasm module (`Vec<u8>`) plus its
+// `WasmLayout` out. No filesystem/network I/O; tests execute the result under
+// the DLR-FT interpreter.
+
+//! Whole-model code generation: lower a salsa-compiled `CompiledSimulation` to
+//! a self-contained WebAssembly module that runs an entire simulation in one
+//! exported call.
+//!
+//! The emitted module exports its own linear `memory`, a `run` function, and
+//! three i32 geometry globals (`n_slots`/`n_chunks`/`results_offset`). It emits
+//! one `initials`/`flows`/`stocks` function-triple *per unique `(model,
+//! input_set)` module instance* in `CompiledSimulation.modules`, each taking a
+//! runtime `module_off: i32` plus its module inputs as f64 params and lowered by
+//! [`super::lower::emit_bytecode`] over the shared slab. An `EvalModule` `call`s
+//! the child instance's function for the current phase (passing `module_off +
+//! decl.off` and the inputs), so one shared `CompiledModule` runs at every base
+//! offset it is instantiated at. A final `run` function seeds the reserved
+//! globals, calls the *root* instance's initials, and drives the integration
+//! loop. `run` lays the slab out as: a `curr` working chunk, a `next` working
+//! chunk, then a results region of `n_chunks` step-major snapshots. It records a
+//! snapshot of `curr` on the same cadence the bytecode VM uses (`vm.rs::run_to`):
+//! the t=start sample is forced, then every `save_every = round(save_step/dt)`
+//! steps, up to `n_chunks` samples.
+//!
+//! Unlike the VM's chunk-ring buffer, this uses a single `curr` chunk plus a
+//! `next` chunk that holds only the freshly integrated stock values (including
+//! nested-module stocks, collected by recursing through `EvalModule`): after
+//! recording a snapshot, the updated stocks are copied back into `curr` and time
+//! is advanced. Auxiliaries/flows are recomputed each step, so `curr` always
+//! holds the full, correct state for the timestep it represents.
+//!
+//! Current scope: the full scalar + array opcode set, Euler/RK2/RK4 integration,
+//! and nested modules (incl. SMOOTH/DELAY stdlib expansions). A genuine runtime
+//! view range (`ViewRangeDynamic`) or array unrolling past the per-function
+//! budget returns `WasmGenError::Unsupported`.
+
+use wasm_encoder::Instruction as I;
+use wasm_encoder::{
+    BlockType, CodeSection, ConstExpr, DataSection, ExportKind, ExportSection, Function,
+    FunctionSection, GlobalSection, GlobalType, MemorySection, MemoryType, Module as WasmModule,
+    TypeSection, ValType,
+};
+
+use std::collections::HashMap;
+
+use crate::bytecode::{ByteCode, CompiledModule, Opcode};
+use crate::results::{Method, Specs};
+use crate::vm::{CompiledSimulation, ModuleKey, StepPart};
+
+use super::WasmGenError;
+use super::lower::{self, BuiltHelpers, build_helpers, f64_const, max_condition_depth, memarg};
+
+// Reserved global slots, mirroring `crate::vm`.
+const TIME_OFF: usize = 0;
+const DT_OFF: usize = 1;
+const INITIAL_TIME_OFF: usize = 2;
+const FINAL_TIME_OFF: usize = 3;
+
+const SLOT_SIZE: u32 = 8;
+const WASM_PAGE_SIZE: u32 = 65536;
+
+// Slot-0 byte base of the `curr` chunk, and the byte address of `curr[TIME]`
+// (an absolute, module-independent global slot). Both run-loop and snapshot
+// code address `curr` from byte 0.
+const CURR_BASE: u32 = 0;
+const TIME_ADDR: u64 = TIME_OFF as u64 * SLOT_SIZE as u64;
+
+// Global indices. The three self-describing geometry globals come first (so the
+// exported indices 0/1/2 stay stable for hosts); `use_prev_fallback` -- the only
+// mutable global -- follows at index 3. It gates `LoadPrev`: init 1 (return the
+// fallback) until the first `prev_values` snapshot clears it (`vm.rs:668`).
+const G_N_SLOTS: u32 = 0;
+const G_N_CHUNKS: u32 = 1;
+const G_RESULTS_OFFSET: u32 = 2;
+const G_USE_PREV_FALLBACK: u32 = 3;
+
+// `run`'s i32 locals.
+const L_SAVED: u32 = 0;
+const L_STEP_ACCUM: u32 = 1;
+const L_DST: u32 = 2;
+
+/// Compile the named model of a datamodel `Project` to a full [`WasmArtifact`]
+/// (the wasm blob plus its [`WasmLayout`]), through the salsa incremental
+/// pipeline and [`compile_simulation`].
+///
+/// This is the entry point `libsimlin` uses across the FFI boundary
+/// (`simlin_model_compile_to_wasm`): it works from a datamodel alone, with no
+/// `Vm`/`SimlinSim`, returning both the blob and the name->offset layout. An
+/// incremental-compile failure or an unsupported construct surfaces as
+/// [`WasmGenError`] (the FFI maps it to a `SimlinError`, never a panic).
+pub fn compile_datamodel_to_artifact(
+    datamodel: &crate::datamodel::Project,
+    model_name: &str,
+) -> Result<WasmArtifact, WasmGenError> {
+    let mut db = crate::db::SimlinDb::default();
+    let sync = crate::db::sync_from_datamodel_incremental(&mut db, datamodel, None);
+    let sim =
+        crate::db::compile_project_incremental(&db, sync.project, model_name).map_err(|e| {
+            WasmGenError::Unsupported(format!("wasmgen: incremental compile failed: {e:?}"))
+        })?;
+    compile_simulation(&sim)
+}
+
+/// Compile the named model of a datamodel `Project` to a self-contained wasm
+/// module, dropping the [`WasmLayout`] (callers that need the layout use
+/// [`compile_datamodel_to_artifact`]). Kept as the stable raw-bytes entry point
+/// for the `wasm-backend-poc.mjs` exploratory script and any blob-only consumer.
+pub fn compile_datamodel_to_wasm(
+    datamodel: &crate::datamodel::Project,
+    model_name: &str,
+) -> Result<Vec<u8>, WasmGenError> {
+    Ok(compile_datamodel_to_artifact(datamodel, model_name)?.wasm)
+}
+
+// ============================================================================
+// CompiledSimulation -> wasm (the production path; consumes salsa bytecode)
+// ============================================================================
+
+/// A compiled simulation wasm module together with the layout metadata a host
+/// needs to read its results by variable name.
+pub struct WasmArtifact {
+    pub wasm: Vec<u8>,
+    pub layout: WasmLayout,
+}
+
+/// Geometry + variable-offset map describing a [`WasmArtifact`]'s results
+/// region. The wasm module also exports `n_slots`/`n_chunks`/`results_offset`
+/// as i32 globals so a host can stride results with no external metadata; this
+/// struct mirrors those values and adds the canonical-name -> slot map needed
+/// for by-name reads.
+pub struct WasmLayout {
+    pub n_slots: usize,
+    pub n_chunks: usize,
+    /// Byte offset of the results region within linear memory.
+    pub results_offset: usize,
+    /// Byte offset of the GF directory region (8 bytes/entry, indexed by global
+    /// table index: `(data_byte_offset: i32, n_points: i32)`). Zero when the
+    /// model has no graphical functions.
+    pub gf_directory_offset: usize,
+    /// Byte offset of the GF data region (every table's `(x,y)` knots as
+    /// consecutive f64 LE pairs). Zero when the model has no graphical
+    /// functions.
+    pub gf_data_offset: usize,
+    /// Canonical variable name -> slot offset within a chunk.
+    pub var_offsets: Vec<(String, usize)>,
+}
+
+impl WasmLayout {
+    /// Serialize the layout to a self-describing, length-prefixed byte buffer for
+    /// the FFI (no protobuf -- it rides the same malloc-return convention as the
+    /// wasm blob). The format is, all integers little-endian:
+    ///
+    /// ```text
+    /// n_slots:        u64
+    /// n_chunks:       u64
+    /// results_offset: u64
+    /// count:          u32              (number of var_offsets entries)
+    /// repeated count times:
+    ///     name_len:   u32
+    ///     name:       name_len bytes   (UTF-8, the canonical variable name)
+    ///     offset:     u64              (slot offset within a chunk)
+    /// ```
+    ///
+    /// The GF region offsets are intentionally NOT serialized: a host reads
+    /// results by name (via `n_slots`/`results_offset` + the name->offset map),
+    /// never the GF regions directly. [`deserialize`] is the exact inverse over
+    /// the geometry + name map (it leaves the GF offsets 0).
+    ///
+    /// [`deserialize`]: Self::deserialize
+    pub fn serialize(&self) -> Vec<u8> {
+        let mut out = Vec::new();
+        out.extend_from_slice(&(self.n_slots as u64).to_le_bytes());
+        out.extend_from_slice(&(self.n_chunks as u64).to_le_bytes());
+        out.extend_from_slice(&(self.results_offset as u64).to_le_bytes());
+        out.extend_from_slice(&(self.var_offsets.len() as u32).to_le_bytes());
+        for (name, offset) in &self.var_offsets {
+            let bytes = name.as_bytes();
+            out.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
+            out.extend_from_slice(bytes);
+            out.extend_from_slice(&(*offset as u64).to_le_bytes());
+        }
+        out
+    }
+
+    /// Parse a buffer produced by [`serialize`]. Returns `None` if the buffer is
+    /// truncated, an integer is malformed, or a name is not valid UTF-8 -- a host
+    /// gets a clean failure rather than a panic on a corrupt buffer. The GF region
+    /// offsets are reconstructed as 0 (they are not in the serialized format).
+    ///
+    /// This is the inverse used by the libsimlin FFI tests and any host that wants
+    /// to round-trip the layout in Rust; a non-Rust host re-implements the same
+    /// little-endian parse against the documented format.
+    ///
+    /// [`serialize`]: Self::serialize
+    pub fn deserialize(bytes: &[u8]) -> Option<WasmLayout> {
+        let mut pos = 0usize;
+        let take = |pos: &mut usize, n: usize| -> Option<&[u8]> {
+            let end = pos.checked_add(n)?;
+            let slice = bytes.get(*pos..end)?;
+            *pos = end;
+            Some(slice)
+        };
+        let read_u64 = |pos: &mut usize| -> Option<u64> {
+            Some(u64::from_le_bytes(take(pos, 8)?.try_into().ok()?))
+        };
+        let read_u32 = |pos: &mut usize| -> Option<u32> {
+            Some(u32::from_le_bytes(take(pos, 4)?.try_into().ok()?))
+        };
+
+        let n_slots = read_u64(&mut pos)? as usize;
+        let n_chunks = read_u64(&mut pos)? as usize;
+        let results_offset = read_u64(&mut pos)? as usize;
+        let count = read_u32(&mut pos)? as usize;
+        let mut var_offsets = Vec::with_capacity(count);
+        for _ in 0..count {
+            let name_len = read_u32(&mut pos)? as usize;
+            let name_bytes = take(&mut pos, name_len)?;
+            let name = std::str::from_utf8(name_bytes).ok()?.to_string();
+            let offset = read_u64(&mut pos)? as usize;
+            var_offsets.push((name, offset));
+        }
+        Some(WasmLayout {
+            n_slots,
+            n_chunks,
+            results_offset,
+            gf_directory_offset: 0,
+            gf_data_offset: 0,
+            var_offsets,
+        })
+    }
+}
+
+// GF region geometry. The directory holds one 8-byte entry per global table
+// index (two i32: the table's absolute data byte offset, and its point count);
+// the data region holds every table's knots as consecutive f64 LE `(x, y)`
+// pairs (16 bytes/point).
+const GF_DIRECTORY_ENTRY_BYTES: u32 = 8; // i32 data_offset + i32 n_points
+const GF_KNOT_BYTES: u32 = 16; // f64 x + f64 y
+
+/// The two read-only graphical-function regions for a model, laid out at a
+/// caller-chosen `region_base` byte offset within the module's linear memory.
+///
+/// `directory_base` == `region_base`; the data region follows the directory.
+/// Each directory entry's first i32 is the *absolute* byte offset of its
+/// table's first knot (so the lookup helpers can `f64.load` a knot with no
+/// further base arithmetic); the second i32 is the table's point count. The
+/// concatenation order is the global table order in
+/// `ByteCodeContext.graphical_functions`, so the `Lookup` opcode's
+/// `base_gf + element_offset` indexes directly into the directory.
+struct GfRegions {
+    directory_base: u32,
+    data_base: u32,
+    /// `directory` ++ `data` would be the full image, but they are kept
+    /// separate so each can be emitted as its own active `DataSection` segment
+    /// at its own base.
+    directory: Vec<u8>,
+    data: Vec<u8>,
+    /// Total byte span of both regions (directory + data), for growing `pages`.
+    total_bytes: u32,
+}
+
+/// Build the GF directory + data regions for `tables` (the root's
+/// `graphical_functions`) at `region_base`. Returns `None` (no regions, no
+/// growth) when there are no tables. Returns a layout error if the regions
+/// would overflow a u32 byte address.
+fn build_gf_regions(
+    tables: &[Vec<(f64, f64)>],
+    region_base: u32,
+) -> Result<Option<GfRegions>, WasmGenError> {
+    if tables.is_empty() {
+        return Ok(None);
+    }
+    let too_large =
+        || WasmGenError::Unsupported("wasmgen: graphical functions too large".to_string());
+
+    let n_tables = u32::try_from(tables.len()).map_err(|_| too_large())?;
+    let directory_bytes = n_tables
+        .checked_mul(GF_DIRECTORY_ENTRY_BYTES)
+        .ok_or_else(too_large)?;
+    let directory_base = region_base;
+    let data_base = directory_base
+        .checked_add(directory_bytes)
+        .ok_or_else(too_large)?;
+
+    let mut directory = Vec::with_capacity(directory_bytes as usize);
+    let mut data: Vec<u8> = Vec::new();
+    // The running byte offset of the next table's first knot, relative to
+    // `data_base`. Promoted to an absolute address when written into the
+    // directory so a helper can load a knot directly.
+    let mut data_rel_offset: u32 = 0;
+    for table in tables {
+        let n_points = u32::try_from(table.len()).map_err(|_| too_large())?;
+        let abs_data_offset = data_base
+            .checked_add(data_rel_offset)
+            .ok_or_else(too_large)?;
+        directory.extend_from_slice(&(abs_data_offset as i32).to_le_bytes());
+        directory.extend_from_slice(&(n_points as i32).to_le_bytes());
+
+        for &(x, y) in table {
+            data.extend_from_slice(&x.to_le_bytes());
+            data.extend_from_slice(&y.to_le_bytes());
+        }
+        let table_bytes = n_points.checked_mul(GF_KNOT_BYTES).ok_or_else(too_large)?;
+        data_rel_offset = data_rel_offset
+            .checked_add(table_bytes)
+            .ok_or_else(too_large)?;
+    }
+
+    let total_bytes = directory_bytes
+        .checked_add(data_rel_offset)
+        .ok_or_else(too_large)?;
+    Ok(Some(GfRegions {
+        directory_base,
+        data_base,
+        directory,
+        data,
+        total_bytes,
+    }))
+}
+
+// Offsets of an instance's three program functions within its function-triple.
+// The module's function slots are: the emitted helper functions
+// ([`lower::build_helpers`]) at `0..n_helpers`, then one
+// `[initials, flows, stocks]` triple per module instance (in `instance_order`),
+// then `run` last. So instance `i`'s `StepPart` function is at
+// `n_helpers + i*FUNCS_PER_INSTANCE + {F_INITIALS,F_FLOWS,F_STOCKS}`, and `run`
+// is at `n_helpers + n_instances*FUNCS_PER_INSTANCE`. Keeping these relative
+// (and adding `n_helpers`/the triple base at the call/export sites) means new
+// helpers or instances shift the indices automatically.
+const F_INITIALS: u32 = 0;
+const F_FLOWS: u32 = 1;
+const F_STOCKS: u32 = 2;
+const FUNCS_PER_INSTANCE: u32 = 3;
+
+// Type-section indices. The `run` type comes first; one opcode-program type per
+// distinct module-input count follows (`(i32, f64*k) -> ()`), and helper types
+// are appended after those. `run` is `() -> ()`.
+const TYPE_RUN_FN: u32 = 0; // () -> ()
+
+// Param 0 of every opcode-program function is `module_off` (i32); params
+// `1..=n_inputs` are the f64 module inputs. Declared locals follow.
+const L_MODULE_OFF: u32 = 0;
+
+/// Everything an instance's `EmitCtx` needs that varies per `(model, input_set)`
+/// module instance: its own `ByteCodeContext`, the disjoint linear-memory bases
+/// the emitter threads in for that instance's array tables / GF lookups, its
+/// module-input parameter count, and (when it has graphical functions) its slice
+/// of the combined GF region. Computed once in [`compile_simulation`] before any
+/// function is emitted, in `instance_order`.
+struct PerInstance<'a> {
+    module: &'a CompiledModule,
+    /// Number of f64 module-input parameters this instance's three functions
+    /// take (param 0 is `module_off`, params `1..=n_inputs` are the inputs).
+    /// `0` for the root and any uninstantiated module. Drawn from the
+    /// `EvalModule { n_inputs }` of its call sites (the count the VM passes).
+    n_inputs: u32,
+    /// Byte base of this instance's GF directory region (`0` when it has no
+    /// graphical functions). Threaded into the instance's `EmitCtx`.
+    gf_directory_base: u32,
+    /// Byte base of this instance's GF data region (`0` when it has no GFs).
+    gf_data_base: u32,
+    /// Byte base of this instance's disjoint `temp_storage` region.
+    temp_storage_base: u32,
+    /// This instance's GF region image (directory + data + bases), for the
+    /// `DataSection`; `None` when the instance has no graphical functions.
+    gf_regions: Option<GfRegions>,
+    /// The relative offsets this instance's module assigns via a flows
+    /// `AssignConstCurr` -- its overridable constants (Phase 7 Task 2). Threaded
+    /// into the instance's `EmitCtx` so an `AssignConstCurr { off }` whose `off`
+    /// is in this set sources from the constants-override region.
+    flows_const_offsets: std::collections::HashSet<u16>,
+}
+
+/// Compile a `CompiledSimulation` (produced by the salsa incremental pipeline)
+/// into a self-contained wasm module.
+///
+/// Every unique `(model, input_set)` module instance in `sim.modules` becomes its
+/// own initials/flows/stocks wasm function-triple taking `(module_off: i32,
+/// in_0..in_{k-1}: f64)`; an `EvalModule` resolves the child instance and `call`s
+/// its function for the current phase (passing `module_off + decl.off` and the
+/// inputs), so one shared `CompiledModule` runs at every base offset it is
+/// instantiated at. The opcode programs a `CompiledSimulation` carries are the
+/// plain, un-fused scalar set (the VM's superinstruction fusion runs on a private
+/// execution copy), so each `Opcode` lowers via [`lower::emit_bytecode`].
+/// Anything outside the supported set -- an unsupported opcode, or array
+/// unrolling past the per-function budget -- returns [`WasmGenError::Unsupported`]
+/// rather than emitting a wrong module.
+pub fn compile_simulation(sim: &CompiledSimulation) -> Result<WasmArtifact, WasmGenError> {
+    // `wasmgen` is in-crate, so it reads `CompiledSimulation`'s `pub(crate)`
+    // fields directly rather than through accessors.
+    let specs = &sim.specs;
+    // The run-loop shape is selected from `specs.method` below; all three
+    // methods (`Euler`/`RungeKutta2`/`RungeKutta4`) are supported.
+
+    let root = sim
+        .modules
+        .get(&sim.root)
+        .ok_or_else(|| WasmGenError::Unsupported("wasmgen: root module not found".to_string()))?;
+    let too_large = || WasmGenError::Unsupported("wasmgen: model too large to lower".to_string());
+
+    // Enumerate every module instance in a deterministic order (sorted by key),
+    // and the count of inputs each receives. The root receives 0 inputs (it is
+    // called by `run`); every other instance's input count is the `n_inputs` of
+    // its `EvalModule` call sites -- exactly what the VM sizes `module_inputs` to.
+    let mut instance_order: Vec<ModuleKey> = sim.modules.keys().cloned().collect();
+    instance_order.sort();
+    let instance_n_inputs = collect_instance_input_counts(sim);
+
+    // The stock data-buffer offsets the *whole simulation* integrates, recursing
+    // through `EvalModule` so submodule (SMOOTH/DELAY) stocks are included --
+    // mirroring the VM's `collect_stock_offsets` (`vm.rs:512-543`). The Euler
+    // advance copies these `next -> curr`; the RK loops index `rk_scratch` by
+    // their position here. Collected up front so the RK scratch region is sized
+    // below.
+    let stock_offsets = collect_all_stock_offsets(&sim.modules, &sim.root, 0);
+    let n_stocks = u32::try_from(stock_offsets.len()).map_err(|_| too_large())?;
+    // `n_slots` is the ROOT module's slot count, which spans the whole slab
+    // including every nested module's slots (`vm.rs::n_slots` returns the root's).
+    let n_slots = u32::try_from(root.n_slots).map_err(|_| too_large())?;
+    let n_chunks = u32::try_from(specs.n_chunks).map_err(|_| too_large())?;
+    let stride = n_slots.checked_mul(SLOT_SIZE).ok_or_else(too_large)?;
+    let curr_base = 0u32;
+    let next_base = stride;
+    let results_base = stride.checked_mul(2).ok_or_else(too_large)?;
+    let results_bytes = n_chunks.checked_mul(stride).ok_or_else(too_large)?;
+    let mut total_bytes = results_base
+        .checked_add(results_bytes)
+        .ok_or_else(too_large)?;
+
+    // Per-instance GF regions follow the results region, concatenated in
+    // `instance_order` (each instance's directory+data sits at its own base, so
+    // its directory entry 0 maps to its own table 0). The `Lookup` opcode reads
+    // the directory at `instance_gf_directory_base + table_idx*8`, so each
+    // instance's `EmitCtx` carries its own base. They are initialized at
+    // instantiation by active `DataSection` segments.
+    let mut instance_gf: HashMap<ModuleKey, (u32, u32, Option<GfRegions>)> = HashMap::new();
+    for key in &instance_order {
+        let module = &sim.modules[key];
+        let regions = build_gf_regions(&module.context.graphical_functions, total_bytes)?;
+        let (dir_base, data_base) = regions
+            .as_ref()
+            .map(|r| (r.directory_base, r.data_base))
+            .unwrap_or((0, 0));
+        if let Some(r) = &regions {
+            total_bytes = total_bytes
+                .checked_add(r.total_bytes)
+                .ok_or_else(too_large)?;
+        }
+        instance_gf.insert(key.clone(), (dir_base, data_base, regions));
+    }
+    // The layout reports the ROOT instance's GF bases (a host reads results, not
+    // GF directly; this preserves the single-root-model layout exactly).
+    let (root_gf_directory_base, root_gf_data_base) = instance_gf
+        .get(&sim.root)
+        .map(|(d, dd, _)| (*d, *dd))
+        .unwrap_or((0, 0));
+
+    // The two snapshot regions follow the GF regions, each `n_slots` wide
+    // (`vm.rs:617-618`). `initial_values` backs `INIT(x)` (captured once after
+    // initials); `prev_values` backs `PREVIOUS(x)` (captured after each step, or
+    // after the end-of-step flows re-eval under RK). Their bases are threaded
+    // into every `EmitCtx` so `LoadInitial`/`LoadPrev` can address them. They are
+    // shared across instances: a child reads `initial_values[module_off + off]`,
+    // the same single snapshot the VM keeps.
+    let snapshot_bytes = n_slots.checked_mul(SLOT_SIZE).ok_or_else(too_large)?;
+    let initial_values_base = total_bytes;
+    let prev_values_base = initial_values_base
+        .checked_add(snapshot_bytes)
+        .ok_or_else(too_large)?;
+    total_bytes = prev_values_base
+        .checked_add(snapshot_bytes)
+        .ok_or_else(too_large)?;
+
+    // The RK scratch region (`saved`(n_stocks) ++ `accum`(n_stocks)) follows the
+    // snapshot regions. It holds each stock's stage-1 value and running RK
+    // accumulator across the stages (`vm.rs:655`, the VM's `rk_scratch`
+    // split). `n_stocks` now spans nested module stocks. Euler needs neither, so
+    // the region is only reserved for RK.
+    let rk = matches!(specs.method, Method::RungeKutta2 | Method::RungeKutta4);
+    let stock_scratch_bytes = n_stocks.checked_mul(SLOT_SIZE).ok_or_else(too_large)?;
+    let rk_saved_base = total_bytes;
+    let rk_accum_base = rk_saved_base
+        .checked_add(stock_scratch_bytes)
+        .ok_or_else(too_large)?;
+    if rk {
+        total_bytes = rk_accum_base
+            .checked_add(stock_scratch_bytes)
+            .ok_or_else(too_large)?;
+    }
+
+    // Per-instance `temp_storage` regions follow the snapshot/RK regions, one
+    // disjoint region per instance (sized by that instance's `temp_total_size`).
+    // The VM shares one `temp_storage` buffer across modules (per-module
+    // `temp_offsets`); disjoint regions are unconditionally correct because a
+    // parent's temps never survive across an `EvalModule` call (the child would
+    // otherwise clobber a shared slot the VM relies on not surviving), so giving
+    // each instance its own region cannot diverge from the VM. The largest
+    // per-instance `temp_total_size` also bounds the shared vector/alloc scratch.
+    let mut instance_temp_base: HashMap<ModuleKey, u32> = HashMap::new();
+    let mut max_temp_total_size = 0u32;
+    for key in &instance_order {
+        let module = &sim.modules[key];
+        let temp_total_size =
+            u32::try_from(module.context.temp_total_size).map_err(|_| too_large())?;
+        max_temp_total_size = max_temp_total_size.max(temp_total_size);
+        instance_temp_base.insert(key.clone(), total_bytes);
+        let temp_bytes = temp_total_size
+            .checked_mul(SLOT_SIZE)
+            .ok_or_else(too_large)?;
+        total_bytes = total_bytes.checked_add(temp_bytes).ok_or_else(too_large)?;
+    }
+
+    // The vector-op + allocation scratch regions follow the temp regions. They
+    // are shared across instances (the staging is within a single opcode, never
+    // live across an `EvalModule` boundary -- the same reason the VM shares
+    // them). A vector/alloc op's element count is bounded by the largest view it
+    // processes, in turn bounded by the largest per-instance `temp_total_size`
+    // and the slab's `n_slots`; see the detailed sizing invariant retained on the
+    // per-region comments below. `2 * max(...)` f64 for the sort-pair vector
+    // scratch, `6 * max(...)` f64 for the allocation staging.
+    let scratch_view_bound = max_temp_total_size.max(n_slots);
+    let vector_scratch_base = total_bytes;
+    let vector_scratch_slots = scratch_view_bound.checked_mul(2).ok_or_else(too_large)?;
+    let vector_scratch_bytes = vector_scratch_slots
+        .checked_mul(SLOT_SIZE)
+        .ok_or_else(too_large)?;
+    total_bytes = vector_scratch_base
+        .checked_add(vector_scratch_bytes)
+        .ok_or_else(too_large)?;
+
+    let alloc_scratch_base = total_bytes;
+    let alloc_scratch_slots = scratch_view_bound.checked_mul(6).ok_or_else(too_large)?;
+    let alloc_scratch_bytes = alloc_scratch_slots
+        .checked_mul(SLOT_SIZE)
+        .ok_or_else(too_large)?;
+    total_bytes = alloc_scratch_base
+        .checked_add(alloc_scratch_bytes)
+        .ok_or_else(too_large)?;
+
+    // The constants-override region (Phase 7 Task 2) follows the scratch regions:
+    // an `n_slots`-wide f64 region indexed by ABSOLUTE slab offset, holding each
+    // overridable constant's current value (initialized to the compiled default).
+    // It is `n_slots` wide -- not `n_overridable` -- so a redirected
+    // `AssignConstCurr { off }` reads it with the same `module_off`-relative
+    // addressing the slab uses (`const_region_base + (module_off + off) * 8`),
+    // which is what lets one shared `CompiledModule` running at several
+    // `module_off`s pick up each instance's distinct override. A parallel
+    // `n_slots`-byte validity region marks which absolute slots `set_value` may
+    // write (1 = overridable). Both are initialized by active `DataSection`
+    // segments built from `collect_overridable_defaults` (which mirrors the VM's
+    // `collect_constant_info` recursion).
+    let const_region_base = total_bytes;
+    let const_region_bytes = n_slots.checked_mul(SLOT_SIZE).ok_or_else(too_large)?;
+    total_bytes = const_region_base
+        .checked_add(const_region_bytes)
+        .ok_or_else(too_large)?;
+    let const_valid_base = total_bytes;
+    // One validity byte per slot.
+    total_bytes = const_valid_base
+        .checked_add(n_slots)
+        .ok_or_else(too_large)?;
+
+    let overridable_defaults = collect_overridable_defaults(&sim.modules, &sim.root, 0);
+    // Defense in depth: the offsets `collect_overridable_defaults` reports must
+    // be exactly the set the VM considers overridable (`constant_offsets`, the
+    // keys of `cached_constant_info`). Both walk the same flows-`AssignConstCurr`
+    // overridability rule, so any divergence is a bug -- a blob's `set_value`
+    // would then accept/reject a different set than the VM. Checked only in debug.
+    debug_assert!(
+        {
+            let mut ours: Vec<usize> = overridable_defaults.iter().map(|(off, _)| *off).collect();
+            ours.sort_unstable();
+            ours.dedup();
+            let mut theirs: Vec<usize> = sim.constant_offsets().collect();
+            theirs.sort_unstable();
+            ours == theirs
+        },
+        "wasmgen overridable-constant offsets diverged from CompiledSimulation::constant_offsets"
+    );
+
+    let pages = total_bytes.div_ceil(WASM_PAGE_SIZE).max(1);
+
+    // save_every mirrors vm.rs::run_to: max(1, round(save_step / dt)).
+    let save_every = ((specs.save_step / specs.dt).round() as i64).max(1);
+    let save_every = i32::try_from(save_every).map_err(|_| too_large())?;
+
+    // Emitted helper functions occupy the module's first function slots; the
+    // per-instance function-triples follow (at `n_helpers + i*FUNCS_PER_INSTANCE`
+    // for instance `i`), and `run` is last. Build the helpers up front so the
+    // index registry threaded into each `EmitCtx` matches the assembled module's
+    // layout, and so `emit_bytecode`'s `call`s resolve.
+    let helpers = build_helpers();
+    let helper_fns = helpers.fns;
+    let n_helpers = helpers.functions.len() as u32;
+
+    // Assemble the per-instance descriptors and the `(ModuleKey, StepPart) -> fn
+    // index` map. The map is built for ALL instances before any function body is
+    // emitted, so an `EvalModule` in one instance's program resolves to the
+    // child's already-known function index (the instantiation graph is acyclic,
+    // but the index map does not depend on emit order regardless).
+    let mut instances: Vec<PerInstance> = Vec::with_capacity(instance_order.len());
+    let mut module_fn_index: HashMap<(ModuleKey, StepPart), u32> = HashMap::new();
+    for (i, key) in instance_order.iter().enumerate() {
+        let module = &sim.modules[key];
+        let base = n_helpers + (i as u32) * FUNCS_PER_INSTANCE;
+        module_fn_index.insert((key.clone(), StepPart::Initials), base + F_INITIALS);
+        module_fn_index.insert((key.clone(), StepPart::Flows), base + F_FLOWS);
+        module_fn_index.insert((key.clone(), StepPart::Stocks), base + F_STOCKS);
+        let (gf_directory_base, gf_data_base, gf_regions) =
+            instance_gf.remove(key).expect("gf entry per instance");
+        instances.push(PerInstance {
+            module,
+            n_inputs: instance_n_inputs.get(key).copied().unwrap_or(0),
+            gf_directory_base,
+            gf_data_base,
+            temp_storage_base: instance_temp_base[key],
+            gf_regions,
+            flows_const_offsets: flows_const_offsets_for(module),
+        });
+    }
+
+    // Emit each instance's three program functions (initials/flows/stocks) over
+    // the shared f64 slab, each lowered with that instance's own `ByteCodeContext`
+    // and per-instance bases. `step_part` is per-program so `LoadInitial` picks
+    // its `curr`-vs-snapshot branch at compile time (`vm.rs:1332-1340`), and an
+    // `EvalModule` resolves the child's function for that same phase.
+    let mut program_fns: Vec<Function> = Vec::with_capacity(instances.len() * 3);
+    for inst in &instances {
+        // `module_off` is the function's i32 param 0; inputs are params
+        // `1..=n_inputs`. The reverse-pop scratch f64 base sits past all other
+        // declared locals; the index helpers shift everything by `n_inputs`.
+        let make_ctx = |cond_depth: usize, extra_i32: u32, step_part: StepPart| lower::EmitCtx {
+            curr_base,
+            next_base,
+            gf_directory_base: inst.gf_directory_base,
+            gf_data_base: inst.gf_data_base,
+            initial_values_base,
+            prev_values_base,
+            use_prev_fallback_global: G_USE_PREV_FALLBACK,
+            step_part,
+            dt: specs.dt,
+            start_time: specs.start,
+            final_time: specs.stop,
+            module_off_local: L_MODULE_OFF,
+            scratch_local: lower::scratch_local_for(inst.n_inputs),
+            condition_locals: lower::condition_locals_for(inst.n_inputs, cond_depth),
+            apply_locals: lower::apply_locals_for(inst.n_inputs, cond_depth),
+            helpers: helper_fns,
+            temp_storage_base: inst.temp_storage_base,
+            extra_i32_local_base: lower::extra_i32_local_base(inst.n_inputs, cond_depth),
+            vector_f64_locals: lower::vector_f64_locals_for(inst.n_inputs, cond_depth),
+            vector_i32_locals: lower::vector_i32_locals_for(inst.n_inputs, cond_depth),
+            vector_scratch_base,
+            alloc_scratch_base,
+            module_input_scratch_base: lower::module_input_scratch_base(
+                inst.n_inputs,
+                cond_depth,
+                extra_i32,
+            ),
+            const_region_base,
+            flows_const_offsets: &inst.flows_const_offsets,
+            module_fn_index: &module_fn_index,
+            ctx: &inst.module.context,
+        };
+        program_fns.push(emit_initials_fn(inst.module, inst.n_inputs, &make_ctx)?);
+        program_fns.push(emit_opcode_fn(
+            &inst.module.compiled_flows,
+            inst.n_inputs,
+            StepPart::Flows,
+            &make_ctx,
+        )?);
+        program_fns.push(emit_opcode_fn(
+            &inst.module.compiled_stocks,
+            inst.n_inputs,
+            StepPart::Stocks,
+            &make_ctx,
+        )?);
+    }
+
+    // `run` calls the ROOT instance's initials/flows/stocks with `module_off = 0`
+    // and no inputs (the root takes none) -- unchanged from the single-module
+    // path. Its child `EvalModule`s recurse from there.
+    let root_idx = instance_order
+        .iter()
+        .position(|k| *k == sim.root)
+        .expect("root is among the instances");
+    let root_fn_base = n_helpers + (root_idx as u32) * FUNCS_PER_INSTANCE;
+    let run_fn = emit_run_simulation(
+        specs,
+        RunRegions {
+            n_slots,
+            results_base,
+            stride,
+            n_chunks,
+            initial_values_base,
+            prev_values_base,
+            rk_saved_base,
+            rk_accum_base,
+        },
+        save_every,
+        &stock_offsets,
+        root_fn_base,
+    );
+
+    // The constants-override exports (Phase 7 Task 2): `set_value` writes an
+    // override into the constants region (validated against the validity bytes),
+    // `reset` resets the run state (`use_prev_fallback`) without clearing the
+    // region, and `clear_values` restores the compiled defaults.
+    let set_value_fn = emit_set_value(n_slots, const_region_base, const_valid_base);
+    let reset_fn = emit_reset();
+    let clear_values_fn = emit_clear_values(const_region_base, &overridable_defaults);
+
+    // The constants region + validity bytes are initialized at instantiation by
+    // active data segments built from the overridable defaults (sparse writes,
+    // one f64 + one validity byte per overridable absolute offset).
+    let const_init =
+        build_const_region_init(&overridable_defaults, const_region_base, const_valid_base);
+
+    let instance_input_counts: Vec<u32> = instances.iter().map(|inst| inst.n_inputs).collect();
+    let gf_images: Vec<&GfRegions> = instances
+        .iter()
+        .filter_map(|inst| inst.gf_regions.as_ref())
+        .collect();
+    let wasm = assemble_simulation(AssembleParts {
+        helpers,
+        program_fns,
+        run_fn,
+        set_value_fn,
+        reset_fn,
+        clear_values_fn,
+        instance_input_counts: &instance_input_counts,
+        pages,
+        n_slots,
+        n_chunks,
+        results_base,
+        gf_regions: &gf_images,
+        const_init: &const_init,
+    });
+
+    let var_offsets = sim
+        .offsets
+        .iter()
+        .map(|(k, v)| (k.as_str().to_string(), *v))
+        .collect();
+
+    Ok(WasmArtifact {
+        wasm,
+        layout: WasmLayout {
+            n_slots: root.n_slots,
+            n_chunks: specs.n_chunks,
+            results_offset: results_base as usize,
+            gf_directory_offset: root_gf_directory_base as usize,
+            gf_data_offset: root_gf_data_base as usize,
+            var_offsets,
+        },
+    })
+}
+
+/// The `n_inputs` (module-input parameter count) of each module instance, drawn
+/// from the `EvalModule { n_inputs }` opcodes across every instance's three
+/// programs. The root receives 0 inputs (it is invoked by `run` with none); a
+/// child receives the count its callers pass -- the same value the VM sizes
+/// `module_inputs` to. All call sites for a given `(model, input_set)` key agree
+/// (the `input_set` is part of the key and `n_inputs == args.len()` at codegen,
+/// `codegen.rs:1094-1109`); first-seen wins, which is therefore unambiguous.
+fn collect_instance_input_counts(sim: &CompiledSimulation) -> HashMap<ModuleKey, u32> {
+    let mut counts: HashMap<ModuleKey, u32> = HashMap::new();
+    for module in sim.modules.values() {
+        let programs: [&ByteCode; 2] = [&module.compiled_flows, &module.compiled_stocks];
+        let initial_codes = module.compiled_initials.iter().map(|ci| &ci.bytecode);
+        for bc in programs.into_iter().chain(initial_codes) {
+            for op in &bc.code {
+                if let Opcode::EvalModule { id, n_inputs } = op {
+                    let decl = &module.context.modules[*id as usize];
+                    let child_key = crate::vm::make_module_key(&decl.model_name, &decl.input_set);
+                    counts.entry(child_key).or_insert(u32::from(*n_inputs));
+                }
+            }
+        }
+    }
+    counts
+}
+
+/// Build an instance's `initials` function: every `CompiledInitial`'s bytecode
+/// in order, over the shared slab. The shared condition-local count is the max
+/// nesting depth across all the initials (they run sequentially in one function);
+/// the reverse-pop scratch covers the max `EvalModule { n_inputs }` over them.
+/// `n_inputs` is the instance's module-input parameter count (shifts the locals).
+fn emit_initials_fn<'a>(
+    module: &CompiledModule,
+    n_inputs: u32,
+    make_ctx: &impl Fn(usize, u32, StepPart) -> lower::EmitCtx<'a>,
+) -> Result<Function, WasmGenError> {
+    let cond_depth = module
+        .compiled_initials
+        .iter()
+        .map(|ci| max_condition_depth(&ci.bytecode))
+        .max()
+        .unwrap_or(0);
+    // The initials run sequentially in one function; each fragment's dynamic-
+    // subscript accumulation (and `EvalModule` reverse-pop) completes before the
+    // next, so reserving the *max* per-fragment count -- not the sum -- is
+    // correct, and the fragments reuse the same scratch locals.
+    let extra_i32 = module
+        .compiled_initials
+        .iter()
+        .map(|ci| lower::count_extra_i32_locals(&ci.bytecode))
+        .max()
+        .unwrap_or(0);
+    let module_input_scratch = module
+        .compiled_initials
+        .iter()
+        .map(|ci| lower::count_module_input_scratch(&ci.bytecode))
+        .max()
+        .unwrap_or(0);
+    let ctx = make_ctx(cond_depth, extra_i32, StepPart::Initials);
+    let mut f = new_opcode_fn(n_inputs, cond_depth, extra_i32, module_input_scratch);
+    for ci in module.compiled_initials.iter() {
+        lower::emit_bytecode(&ci.bytecode, &ctx, &mut f)?;
+    }
+    f.instruction(&I::End);
+    Ok(f)
+}
+
+/// Build one opcode-program function from a single `ByteCode`, lowering it as
+/// `step_part` (which `LoadInitial` reads to pick its `curr`-vs-snapshot branch,
+/// and which an `EvalModule` calls the child's matching phase function for).
+/// `n_inputs` is the instance's module-input parameter count.
+fn emit_opcode_fn<'a>(
+    bc: &ByteCode,
+    n_inputs: u32,
+    step_part: StepPart,
+    make_ctx: &impl Fn(usize, u32, StepPart) -> lower::EmitCtx<'a>,
+) -> Result<Function, WasmGenError> {
+    let cond_depth = max_condition_depth(bc);
+    let extra_i32 = lower::count_extra_i32_locals(bc);
+    let module_input_scratch = lower::count_module_input_scratch(bc);
+    let ctx = make_ctx(cond_depth, extra_i32, step_part);
+    let mut f = new_opcode_fn(n_inputs, cond_depth, extra_i32, module_input_scratch);
+    lower::emit_bytecode(bc, &ctx, &mut f)?;
+    f.instruction(&I::End);
+    Ok(f)
+}
+
+/// A fresh opcode-program `Function` for an instance with `n_inputs` f64 input
+/// params: the scratch f64 local, `cond_depth` i32 condition locals, the three
+/// `Apply` scratch f64 locals, the vector-op scratch, `extra_i32`
+/// dynamic-subscript scratch i32 locals, and `module_input_scratch` `EvalModule`
+/// reverse-pop f64 locals (param 0 = `module_off`, params `1..=n_inputs` =
+/// inputs). The declaration list lives in [`lower::opcode_fn_locals`] (which is
+/// param-count-independent); the index helpers shift by `n_inputs`.
+fn new_opcode_fn(
+    n_inputs: u32,
+    cond_depth: usize,
+    extra_i32: u32,
+    module_input_scratch: u32,
+) -> Function {
+    // `n_inputs` is in the function's *type* (its params), not the declared
+    // locals list; it is applied at `assemble_simulation` where the type is
+    // chosen, so it does not appear here.
+    let _ = n_inputs;
+    Function::new(lower::opcode_fn_locals(
+        cond_depth,
+        extra_i32,
+        module_input_scratch,
+    ))
+}
+
+/// Collect absolute offsets of all stock variables across the whole simulation,
+/// recursing into child modules via `EvalModule` so submodule (SMOOTH/DELAY)
+/// stocks are included. Mirrors the VM's `collect_stock_offsets`
+/// (`vm.rs:512-543`) exactly: a stock writes via `AssignNext` or its
+/// peephole-fused `BinOpAssignNext` (most integrations are `stock + delta`), and
+/// an `EvalModule` recurses with `base_off + decl.off` (each instance addresses
+/// its slot at `base_off + off`). After each step these slots are copied `next ->
+/// curr`; the RK loops index `rk_scratch[saved/accum]` by their sorted position.
+fn collect_all_stock_offsets(
+    modules: &HashMap<ModuleKey, CompiledModule>,
+    key: &ModuleKey,
+    base_off: usize,
+) -> Vec<usize> {
+    let module = match modules.get(key) {
+        Some(m) => m,
+        None => return Vec::new(),
+    };
+    let mut offsets: Vec<usize> = Vec::new();
+    for op in module.compiled_stocks.code.iter() {
+        match op {
+            Opcode::AssignNext { off } | Opcode::BinOpAssignNext { off, .. } => {
+                offsets.push(base_off + *off as usize);
+            }
+            Opcode::EvalModule { id, .. } => {
+                let decl = &module.context.modules[*id as usize];
+                let child_key = crate::vm::make_module_key(&decl.model_name, &decl.input_set);
+                offsets.extend(collect_all_stock_offsets(
+                    modules,
+                    &child_key,
+                    base_off + decl.off,
+                ));
+            }
+            _ => {}
+        }
+    }
+    // Defensive dedup, as the VM does: duplicate offsets would double-copy.
+    offsets.sort_unstable();
+    offsets.dedup();
+    offsets
+}
+
+/// The set of *relative* offsets a module assigns via an `AssignConstCurr` in
+/// its **flows** phase: exactly this module's overridable constants. Mirrors the
+/// first (flows-only) pass of the VM's `collect_constant_info` (`vm.rs:436-450`),
+/// but keyed by relative offset and computed per module, so it is compile-time
+/// even for a shared `CompiledModule` instantiated at several base offsets (every
+/// instantiation's `base_off + off` is overridable, since `collect_constant_info`
+/// recurses through every declaration). An `AssignConstCurr { off }` in any phase
+/// whose `off` is in this set is redirected to read the constants-override
+/// region; one whose `off` is absent emits its immediate literal.
+fn flows_const_offsets_for(module: &CompiledModule) -> std::collections::HashSet<u16> {
+    module
+        .compiled_flows
+        .code
+        .iter()
+        .filter_map(|op| match op {
+            Opcode::AssignConstCurr { off, .. } => Some(*off),
+            _ => None,
+        })
+        .collect()
+}
+
+/// Collect `(absolute offset, compiled-default literal)` for every overridable
+/// constant across the whole simulation, recursing through `EvalModule`
+/// declarations with cumulative `base_off`. Mirrors the VM's `collect_constant_info`
+/// (`vm.rs:426-507`): an offset is overridable iff some module assigns it via an
+/// `AssignConstCurr` in its **flows** phase, and the default value is that flows
+/// `AssignConstCurr`'s literal. Used to size and initialize the constants-override
+/// region so the wasm blob's `set_value` accepts exactly the offsets the VM's
+/// `set_value_by_offset` does, each initialized to the same compiled default.
+///
+/// A shared module instantiated at two base offsets contributes both absolute
+/// offsets (one per instantiation), exactly as the VM's recursion does.
+fn collect_overridable_defaults(
+    modules: &HashMap<ModuleKey, CompiledModule>,
+    key: &ModuleKey,
+    base_off: usize,
+) -> Vec<(usize, f64)> {
+    let module = match modules.get(key) {
+        Some(m) => m,
+        None => return Vec::new(),
+    };
+    let mut out: Vec<(usize, f64)> = Vec::new();
+    for op in module.compiled_flows.code.iter() {
+        if let Opcode::AssignConstCurr { off, literal_id } = op {
+            // The literal is the flows assignment's compiled default. A
+            // well-formed program always has the literal in range; fall back to
+            // 0.0 defensively rather than panicking across what is otherwise an
+            // infallible layout pass.
+            let v = module
+                .compiled_flows
+                .literals
+                .get(*literal_id as usize)
+                .copied()
+                .unwrap_or(0.0);
+            out.push((base_off + *off as usize, v));
+        }
+    }
+    for decl in &module.context.modules {
+        let child_key = crate::vm::make_module_key(&decl.model_name, &decl.input_set);
+        out.extend(collect_overridable_defaults(
+            modules,
+            &child_key,
+            base_off + decl.off,
+        ));
+    }
+    out
+}
+
+/// The linear-memory region geometry `run` needs: the chunk/results bases, the
+/// snapshot bases (`initial_values`/`prev_values`), and the RK scratch bases
+/// (`saved`/`accum`). Bundled to keep `emit_run_simulation`'s signature small as
+/// the run loop gained snapshot + RK regions.
+#[derive(Clone, Copy)]
+struct RunRegions {
+    n_slots: u32,
+    results_base: u32,
+    stride: u32,
+    n_chunks: u32,
+    initial_values_base: u32,
+    prev_values_base: u32,
+    /// Slot-0 byte base of the RK `saved[i]` scratch (one f64 per stock).
+    rk_saved_base: u32,
+    /// Slot-0 byte base of the RK `accum[i]` scratch (one f64 per stock).
+    rk_accum_base: u32,
+}
+
+// `run`'s f64 locals (after the three i32 locals). The RK loops need a
+// `saved_time` (the timestep's t, restored after the stages move `curr[TIME]` to
+// trial points) and a per-stage `s` scratch (`next[off]-curr[off]`). Euler
+// declares them too -- two unused f64 locals are free.
+const L_SAVED_TIME: u32 = 3;
+const L_RK_S: u32 = 4;
+
+/// Emit the body of `run` for the `CompiledSimulation` path: seed the reserved
+/// globals, run the initials, capture `initial_values`, then drive the
+/// integration loop selected by `specs.method`. The loop `call`s the three
+/// opcode-emitted functions; the Euler arm mirrors `vm.rs::run_to`'s Euler arm,
+/// and the RK arms mirror `vm.rs:712-838`.
+fn emit_run_simulation(
+    specs: &Specs,
+    regions: RunRegions,
+    save_every: i32,
+    stock_offsets: &[usize],
+    root_fn_base: u32,
+) -> Function {
+    // Three i32 locals (saved/step_accum/dst) + two f64 locals (saved_time, s).
+    let mut f = Function::new([(3, ValType::I32), (2, ValType::F64)]);
+
+    // Absolute function indices of the ROOT instance's three program functions:
+    // its function-triple base + the per-phase offset. `run` drives the root with
+    // `module_off = 0`; nested instances are reached via `EvalModule` from there.
+    let f_initials = root_fn_base + F_INITIALS;
+    let f_flows = root_fn_base + F_FLOWS;
+    let f_stocks = root_fn_base + F_STOCKS;
+
+    // Seed the reserved global slots into curr (chunk base 0), then run the
+    // initials. The seeds mirror the VM, which writes start/dt/start/stop into
+    // TIME/DT/INITIAL_TIME/FINAL_TIME before run_initials.
+    store_curr_const_abs(&mut f, TIME_OFF, specs.start);
+    store_curr_const_abs(&mut f, DT_OFF, specs.dt);
+    store_curr_const_abs(&mut f, INITIAL_TIME_OFF, specs.start);
+    store_curr_const_abs(&mut f, FINAL_TIME_OFF, specs.stop);
+    // Re-arm the PREVIOUS fallback for this run, mirroring the VM's
+    // `run_initials` (which sets `use_prev_fallback = true` at the start of
+    // every run). `run` reseeds the time globals + reruns initials and is the
+    // documented per-change entry point for repeated re-simulation, so it must
+    // reset this flag itself: the loop below clears it to 0 after the first
+    // `prev_values` snapshot, and without re-arming it here a second `run` on
+    // the same instance would read the prior run's `prev_values` on step 0 (and
+    // during initials) instead of the fallback. The module-init value is also 1,
+    // so this is a no-op only on the very first run.
+    f.instruction(&I::I32Const(1));
+    f.instruction(&I::GlobalSet(G_USE_PREV_FALLBACK));
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::Call(f_initials));
+
+    // Capture `initial_values := curr` exactly once, after initials, for
+    // `INIT(x)` reads in the flows/stocks programs (`vm.rs:1124-1128`).
+    // `use_prev_fallback` is 1 (re-armed just above) through initials, so any
+    // `PREVIOUS(x)` evaluated during initials returns its fallback.
+    emit_copy_chunk(
+        &mut f,
+        CURR_BASE,
+        regions.initial_values_base,
+        regions.n_slots,
+    );
+
+    f.instruction(&I::Block(BlockType::Empty)); // $break
+    f.instruction(&I::Loop(BlockType::Empty)); // $continue
+
+    // if curr[TIME] > stop: break
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::F64Load(memarg(TIME_ADDR)));
+    f.instruction(&f64_const(specs.stop));
+    f.instruction(&I::F64Gt);
+    f.instruction(&I::BrIf(1));
+
+    // The per-method step: compute the new stock values into `next[off]`, leave
+    // `curr` holding the full time-`t` state (aux/flows + time-`t` stocks), then
+    // snapshot `prev_values := curr` and clear `use_prev_fallback`.
+    match specs.method {
+        Method::Euler => emit_euler_step(&mut f, f_flows, f_stocks, &regions),
+        Method::RungeKutta4 => {
+            emit_rk4_step(&mut f, f_flows, f_stocks, specs.dt, stock_offsets, &regions)
+        }
+        Method::RungeKutta2 => {
+            emit_rk2_step(&mut f, f_flows, f_stocks, specs.dt, stock_offsets, &regions)
+        }
+    }
+
+    // The save + advance tail is method-agnostic: every method leaves `next[off]`
+    // holding the new stock values and `curr` holding the time-`t` state, so the
+    // save row records `curr`, the advance copies the new stocks `next -> curr`,
+    // and `curr[TIME] += dt`.
+    emit_save_advance(&mut f, specs, save_every, stock_offsets, &regions);
+
+    f.instruction(&I::Br(0)); // continue
+    f.instruction(&I::End); // end loop
+    f.instruction(&I::End); // end block
+    f.instruction(&I::End); // end function
+    f
+}
+
+/// The Euler step: `flows`+`stocks` (the stocks program writes `next[off]`),
+/// then the `prev_values` snapshot. Mirrors `vm.rs:698-708`.
+fn emit_euler_step(f: &mut Function, f_flows: u32, f_stocks: u32, regions: &RunRegions) {
+    emit_eval_step(f, f_flows, f_stocks);
+    emit_prev_snapshot(f, regions);
+}
+
+/// `eval_step` = `flows(0)` then `stocks(0)` (`vm.rs:1195`). The stocks program
+/// writes each stock's integrated value into `next[off]`.
+fn emit_eval_step(f: &mut Function, f_flows: u32, f_stocks: u32) {
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::Call(f_flows));
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::Call(f_stocks));
+}
+
+/// Snapshot `prev_values := curr` and clear `use_prev_fallback` so the next
+/// step's `PREVIOUS(x)` reads this step's `curr` rather than its fallback
+/// (`vm.rs:705-707` for Euler; `vm.rs:781-783` / `832-834` for RK, where it runs
+/// only after the end-of-step flows re-eval has restored `curr`).
+fn emit_prev_snapshot(f: &mut Function, regions: &RunRegions) {
+    emit_copy_chunk(f, CURR_BASE, regions.prev_values_base, regions.n_slots);
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::GlobalSet(G_USE_PREV_FALLBACK));
+}
+
+/// The method-agnostic save + advance tail (the wasm analogue of the VM's
+/// `save_advance!` plus its per-step advance). Records a results row from `curr`
+/// on the VM's cadence, breaks when the chunk budget is exhausted, then advances
+/// by copying the new stock values `next -> curr` and stepping `curr[TIME] += dt`.
+fn emit_save_advance(
+    f: &mut Function,
+    specs: &Specs,
+    save_every: i32,
+    stock_offsets: &[usize],
+    regions: &RunRegions,
+) {
+    let n_slots = regions.n_slots;
+
+    // step_accum += 1
+    f.instruction(&I::LocalGet(L_STEP_ACCUM));
+    f.instruction(&I::I32Const(1));
+    f.instruction(&I::I32Add);
+    f.instruction(&I::LocalSet(L_STEP_ACCUM));
+
+    // save_cond = (step_accum == save_every) | (saved == 0 & time == start)
+    f.instruction(&I::LocalGet(L_STEP_ACCUM));
+    f.instruction(&I::I32Const(save_every));
+    f.instruction(&I::I32Eq);
+    f.instruction(&I::LocalGet(L_SAVED));
+    f.instruction(&I::I32Eqz);
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::F64Load(memarg(TIME_ADDR)));
+    f.instruction(&f64_const(specs.start));
+    f.instruction(&I::F64Eq);
+    f.instruction(&I::I32And);
+    f.instruction(&I::I32Or);
+    f.instruction(&I::If(BlockType::Empty));
+
+    // dst = results_base + saved * stride
+    f.instruction(&I::I32Const(regions.results_base as i32));
+    f.instruction(&I::LocalGet(L_SAVED));
+    f.instruction(&I::I32Const(regions.stride as i32));
+    f.instruction(&I::I32Mul);
+    f.instruction(&I::I32Add);
+    f.instruction(&I::LocalSet(L_DST));
+
+    // results[dst + slot*8] = curr[slot]   for every slot
+    for slot in 0..n_slots {
+        f.instruction(&I::LocalGet(L_DST));
+        f.instruction(&I::I32Const(0));
+        f.instruction(&I::F64Load(memarg(u64::from(slot) * u64::from(SLOT_SIZE))));
+        f.instruction(&I::F64Store(memarg(u64::from(slot) * u64::from(SLOT_SIZE))));
+    }
+
+    // saved += 1; step_accum = 0
+    f.instruction(&I::LocalGet(L_SAVED));
+    f.instruction(&I::I32Const(1));
+    f.instruction(&I::I32Add);
+    f.instruction(&I::LocalSet(L_SAVED));
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::LocalSet(L_STEP_ACCUM));
+
+    // if saved >= n_chunks: break (depth 2: if -> loop -> block)
+    f.instruction(&I::LocalGet(L_SAVED));
+    f.instruction(&I::I32Const(regions.n_chunks as i32));
+    f.instruction(&I::I32GeS);
+    f.instruction(&I::BrIf(2));
+
+    f.instruction(&I::End); // end if
+
+    // Advance: copy the freshly integrated stock values next -> curr. The
+    // `next` chunk's slot-0 byte base is one chunk past `curr`, i.e. the chunk
+    // stride (`compile_simulation` sets `next_base = stride`).
+    let next_base = regions.stride;
+    for &off in stock_offsets {
+        f.instruction(&I::I32Const(0));
+        f.instruction(&I::I32Const(0));
+        f.instruction(&I::F64Load(memarg(
+            u64::from(next_base) + off as u64 * u64::from(SLOT_SIZE),
+        )));
+        f.instruction(&I::F64Store(memarg(off as u64 * u64::from(SLOT_SIZE))));
+    }
+
+    // time += dt
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::F64Load(memarg(TIME_ADDR)));
+    f.instruction(&f64_const(specs.dt));
+    f.instruction(&I::F64Add);
+    f.instruction(&I::F64Store(memarg(TIME_ADDR)));
+}
+
+/// Store a compile-time constant into a `curr` slot at an absolute (module_off
+/// 0) address.
+fn store_curr_const_abs(f: &mut Function, off: usize, v: f64) {
+    f.instruction(&I::I32Const(0));
+    f.instruction(&f64_const(v));
+    f.instruction(&I::F64Store(memarg(off as u64 * u64::from(SLOT_SIZE))));
+}
+
+// ── Constants-override exports (Phase 7 Task 2) ───────────────────────────
+//
+// `set_value(offset: i32, val: f64) -> i32` writes the override into the
+// constants region (0 ok / 1 when `offset` is out of range or not overridable);
+// `reset() -> ()` resets the run state without clearing the region (overrides
+// persist across reset, like the VM); `clear_values() -> ()` restores the
+// compiled defaults. The constants region is `n_slots`-wide and indexed by
+// absolute slab offset (so a redirected `AssignConstCurr` reads it with the same
+// `module_off`-relative addressing the slab uses); a parallel `n_slots`-byte
+// validity region (1 = overridable) is what `set_value` checks.
+
+/// A `MemArg` for a single-byte access (the validity region), align 0.
+fn byte_memarg(addr: u64) -> wasm_encoder::MemArg {
+    wasm_encoder::MemArg {
+        offset: addr,
+        align: 0,
+        memory_index: 0,
+    }
+}
+
+// `set_value`'s i32 params: the absolute slab offset and (param 1) the f64
+// value. Param 0 is the offset.
+const SV_OFFSET: u32 = 0;
+const SV_VALUE: u32 = 1;
+
+/// Emit `set_value(offset: i32, val: f64) -> i32`: write `const_region[offset] =
+/// val` and return 0 when `offset` is a valid overridable slot, else return 1
+/// without writing. Validity is `0 <= offset < n_slots` AND `valid[offset] != 0`
+/// (the byte the data segment set for each overridable absolute offset). This
+/// mirrors the VM's `set_value_by_offset` (`vm.rs:1037-1052`): an out-of-range or
+/// non-constant offset is rejected (the VM returns `Err`), a valid one applies
+/// the override (which persists across `reset`).
+fn emit_set_value(n_slots: u32, const_region_base: u32, const_valid_base: u32) -> Function {
+    let mut f = Function::new([]);
+
+    // if (offset < 0) | (offset >= n_slots): return 1
+    f.instruction(&I::LocalGet(SV_OFFSET));
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::I32LtS);
+    f.instruction(&I::LocalGet(SV_OFFSET));
+    f.instruction(&I::I32Const(n_slots as i32));
+    f.instruction(&I::I32GeS);
+    f.instruction(&I::I32Or);
+    f.instruction(&I::If(BlockType::Empty));
+    f.instruction(&I::I32Const(1));
+    f.instruction(&I::Return);
+    f.instruction(&I::End);
+
+    // if valid[offset] == 0: return 1   (valid byte at const_valid_base + offset)
+    f.instruction(&I::LocalGet(SV_OFFSET));
+    f.instruction(&I::I32Load8U(byte_memarg(u64::from(const_valid_base))));
+    f.instruction(&I::I32Eqz);
+    f.instruction(&I::If(BlockType::Empty));
+    f.instruction(&I::I32Const(1));
+    f.instruction(&I::Return);
+    f.instruction(&I::End);
+
+    // const_region[offset] = val   (f64 at const_region_base + offset*8)
+    f.instruction(&I::LocalGet(SV_OFFSET));
+    f.instruction(&I::I32Const(SLOT_SIZE as i32));
+    f.instruction(&I::I32Mul);
+    f.instruction(&I::LocalGet(SV_VALUE));
+    f.instruction(&I::F64Store(memarg(u64::from(const_region_base))));
+
+    // return 0
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::End);
+    f
+}
+
+/// Emit `reset() -> ()`: reset the run state so the next `run` re-runs initials
+/// and the loop from t=start. The wasm `run` already re-seeds the time slots and
+/// re-runs initials on every call and uses fresh i32 locals for the chunk/step
+/// counters, so the only cross-run state is the `use_prev_fallback` global, which
+/// `run` clears after the first `prev_values` snapshot. Setting it back to 1 here
+/// is the analogue of the VM's `reset` clearing `prev_values_valid` (`vm.rs:976-989`),
+/// and -- like the VM -- it deliberately does NOT touch the constants region, so
+/// overrides persist across reset.
+fn emit_reset() -> Function {
+    let mut f = Function::new([]);
+    f.instruction(&I::I32Const(1));
+    f.instruction(&I::GlobalSet(G_USE_PREV_FALLBACK));
+    f.instruction(&I::End);
+    f
+}
+
+/// Emit `clear_values() -> ()`: restore each overridable constant to its
+/// compiled-default literal by writing the defaults back into the constants
+/// region (the VM's `clear_values`, `vm.rs:1055-1062`). The defaults are
+/// compile-time constants, so this is a straight-line sequence of `f64.store`s --
+/// one per overridable absolute offset. The data segment also writes these at
+/// instantiation; `clear_values` lets a host undo a `set_value` without
+/// re-instantiating the module.
+fn emit_clear_values(const_region_base: u32, overridable_defaults: &[(usize, f64)]) -> Function {
+    let mut f = Function::new([]);
+    for &(abs_off, default) in overridable_defaults {
+        f.instruction(&I::I32Const(0));
+        f.instruction(&f64_const(default));
+        f.instruction(&I::F64Store(memarg(
+            u64::from(const_region_base) + abs_off as u64 * u64::from(SLOT_SIZE),
+        )));
+    }
+    f.instruction(&I::End);
+    f
+}
+
+/// The active `DataSection` payloads that initialize the constants region and
+/// its validity bytes at instantiation: for each overridable absolute offset, the
+/// f64 default written into the constants region and a `1` validity byte. Sparse
+/// (one segment per overridable offset), so a model with no overridable constants
+/// produces an empty list (no segments).
+struct ConstRegionInit {
+    /// `(byte address within the constants region, the 8 LE bytes of the default)`.
+    value_segments: Vec<(u32, [u8; 8])>,
+    /// `byte address within the validity region` (the byte written is always 1).
+    valid_segments: Vec<u32>,
+}
+
+/// Build the constants-region init payloads from the overridable defaults.
+fn build_const_region_init(
+    overridable_defaults: &[(usize, f64)],
+    const_region_base: u32,
+    const_valid_base: u32,
+) -> ConstRegionInit {
+    let mut value_segments = Vec::with_capacity(overridable_defaults.len());
+    let mut valid_segments = Vec::with_capacity(overridable_defaults.len());
+    for &(abs_off, default) in overridable_defaults {
+        let value_addr = const_region_base + abs_off as u32 * SLOT_SIZE;
+        value_segments.push((value_addr, default.to_le_bytes()));
+        valid_segments.push(const_valid_base + abs_off as u32);
+    }
+    ConstRegionInit {
+        value_segments,
+        valid_segments,
+    }
+}
+
+// ── RK loop primitives ────────────────────────────────────────────────────
+//
+// Every RK memory slot lives at a constant byte address (`base + idx*8`), so the
+// dynamic part of the address is always `i32.const 0` and the constant
+// `memarg.offset` carries `base + idx*8`. `f64.store` wants `[addr_i32,
+// value_f64]`, so the store helpers push the `i32.const 0` address first, then
+// the caller leaves the value on the stack.
+
+/// `i32.const 0; f64.load[base + idx*8]` -- push the f64 at slot `idx` of the
+/// region whose slot-0 byte base is `base`.
+fn emit_load_slot(f: &mut Function, base: u32, idx: u32) {
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::F64Load(memarg(
+        u64::from(base) + u64::from(idx) * u64::from(SLOT_SIZE),
+    )));
+}
+
+/// Push the store *address* half of an RK slot store: a bare `i32.const 0`.
+/// Every RK slot's full byte address (`base + idx*8`) rides in the matching
+/// [`emit_store_slot_value`]'s `memarg.offset`, so the dynamic address is always
+/// the constant 0 -- this half therefore needs no `base`/`idx`. Kept as the
+/// named symmetry partner of `emit_store_slot_value` (which it precedes at every
+/// call site, since `f64.store` consumes `[addr_i32, value_f64]`): inlining only
+/// this half would scatter unexplained `i32.const 0`s whose absolute-addressing
+/// intent is exactly what the pairing documents.
+fn emit_store_slot_addr(f: &mut Function) {
+    f.instruction(&I::I32Const(0));
+}
+
+/// `f64.store[base + idx*8]` -- consume `[addr_i32, value_f64]` already on the
+/// stack (the address from [`emit_store_slot_addr`]).
+fn emit_store_slot_value(f: &mut Function, base: u32, idx: u32) {
+    f.instruction(&I::F64Store(memarg(
+        u64::from(base) + u64::from(idx) * u64::from(SLOT_SIZE),
+    )));
+}
+
+/// Emit `L_RK_S := next[off] - curr[off]` -- the stock's stage delta `s_k`
+/// (`vm.rs`: `let sN = next[off] - curr[off]`). Computed before any of the
+/// stage's writes clobber `curr[off]`. `next_base` is `n_slots*8`.
+///
+/// `off` is the full-width absolute slot offset (`u32`, like the Euler advance's
+/// `emit_save_advance`). A `u16` here would silently truncate a stock at slot
+/// 65536 or above -- reachable in a large nested model (each submodel / SMOOTH /
+/// DELAY instance adds slots, with no cap on total `n_slots`) -- to
+/// `off & 0xFFFF`, clobbering an unrelated slot (offset 65536 maps to slot 0,
+/// TIME).
+fn emit_compute_stage_delta(f: &mut Function, next_base: u32, off: u32) {
+    emit_load_slot(f, next_base, off);
+    emit_load_slot(f, CURR_BASE, off);
+    f.instruction(&I::F64Sub);
+    f.instruction(&I::LocalSet(L_RK_S));
+}
+
+/// The RK4 step (`vm.rs:712-787`): four stages over the compile-time stock
+/// offsets, the time juggling, the final flows-only re-eval with restored
+/// `curr`, and the `prev_values` snapshot. `next[off]` ends holding the new
+/// integrated stock value; `curr` ends holding the time-`t` state.
+fn emit_rk4_step(
+    f: &mut Function,
+    f_flows: u32,
+    f_stocks: u32,
+    dt: f64,
+    stock_offsets: &[usize],
+    regions: &RunRegions,
+) {
+    let (saved, accum) = (regions.rk_saved_base, regions.rk_accum_base);
+    // The `next` chunk's slot-0 byte base == the chunk stride (`next` sits one
+    // chunk past `curr`); see `emit_save_advance`.
+    let next_base = regions.stride;
+
+    // saved_time = curr[TIME]
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::F64Load(memarg(TIME_ADDR)));
+    f.instruction(&I::LocalSet(L_SAVED_TIME));
+
+    // Stage 1 at (t, y): s1 = next-curr; saved=curr; accum=s1; curr=saved+s1*0.5
+    emit_eval_step(f, f_flows, f_stocks);
+    for (i, &off) in stock_offsets.iter().enumerate() {
+        let (i, off) = (i as u32, off as u32);
+        emit_compute_stage_delta(f, next_base, off);
+        // saved[i] = curr[off]
+        emit_store_slot_addr(f);
+        emit_load_slot(f, CURR_BASE, off);
+        emit_store_slot_value(f, saved, i);
+        // accum[i] = s1
+        emit_store_slot_addr(f);
+        f.instruction(&I::LocalGet(L_RK_S));
+        emit_store_slot_value(f, accum, i);
+        // curr[off] = saved[i] + s1*0.5
+        emit_store_slot_addr(f);
+        emit_load_slot(f, saved, i);
+        f.instruction(&I::LocalGet(L_RK_S));
+        f.instruction(&f64_const(0.5));
+        f.instruction(&I::F64Mul);
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, CURR_BASE, off);
+    }
+    // curr[TIME] = saved_time + dt*0.5
+    emit_store_time_offset(f, dt * 0.5);
+
+    // Stage 2 at (t+dt/2, y+s1/2): s2 = next-curr; accum+=2*s2; curr=saved+s2*0.5
+    emit_eval_step(f, f_flows, f_stocks);
+    for (i, &off) in stock_offsets.iter().enumerate() {
+        let (i, off) = (i as u32, off as u32);
+        emit_compute_stage_delta(f, next_base, off);
+        // accum[i] += 2*s2
+        emit_store_slot_addr(f);
+        emit_load_slot(f, accum, i);
+        f.instruction(&I::LocalGet(L_RK_S));
+        f.instruction(&f64_const(2.0));
+        f.instruction(&I::F64Mul);
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, accum, i);
+        // curr[off] = saved[i] + s2*0.5
+        emit_store_slot_addr(f);
+        emit_load_slot(f, saved, i);
+        f.instruction(&I::LocalGet(L_RK_S));
+        f.instruction(&f64_const(0.5));
+        f.instruction(&I::F64Mul);
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, CURR_BASE, off);
+    }
+
+    // Stage 3 at (t+dt/2, y+s2/2): s3 = next-curr; accum+=2*s3; curr=saved+s3
+    emit_eval_step(f, f_flows, f_stocks);
+    for (i, &off) in stock_offsets.iter().enumerate() {
+        let (i, off) = (i as u32, off as u32);
+        emit_compute_stage_delta(f, next_base, off);
+        // accum[i] += 2*s3
+        emit_store_slot_addr(f);
+        emit_load_slot(f, accum, i);
+        f.instruction(&I::LocalGet(L_RK_S));
+        f.instruction(&f64_const(2.0));
+        f.instruction(&I::F64Mul);
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, accum, i);
+        // curr[off] = saved[i] + s3
+        emit_store_slot_addr(f);
+        emit_load_slot(f, saved, i);
+        f.instruction(&I::LocalGet(L_RK_S));
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, CURR_BASE, off);
+    }
+    // curr[TIME] = saved_time + dt
+    emit_store_time_offset(f, dt);
+
+    // Stage 4 at (t+dt, y+s3): s4 = next-curr; accum+=s4;
+    // next[off] = saved[i] + accum[i]/6; curr[off] = saved[i]
+    emit_eval_step(f, f_flows, f_stocks);
+    for (i, &off) in stock_offsets.iter().enumerate() {
+        let (i, off) = (i as u32, off as u32);
+        emit_compute_stage_delta(f, next_base, off);
+        // accum[i] += s4
+        emit_store_slot_addr(f);
+        emit_load_slot(f, accum, i);
+        f.instruction(&I::LocalGet(L_RK_S));
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, accum, i);
+        // next[off] = saved[i] + accum[i]/6.0
+        emit_store_slot_addr(f);
+        emit_load_slot(f, saved, i);
+        emit_load_slot(f, accum, i);
+        f.instruction(&f64_const(6.0));
+        f.instruction(&I::F64Div);
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, next_base, off);
+        // curr[off] = saved[i]  (restore the original)
+        emit_store_slot_addr(f);
+        emit_load_slot(f, saved, i);
+        emit_store_slot_value(f, CURR_BASE, off);
+    }
+
+    // curr[TIME] = saved_time ; next[TIME] = saved_time + dt
+    emit_restore_and_advance_time(f, dt, regions);
+
+    // Final flows-only re-eval with the restored curr, so curr's aux/flow slots
+    // hold time-`t` values (stages 2-4 clobbered them). Load-bearing for both
+    // the saved output row and the PREVIOUS snapshot (`vm.rs:769-778`).
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::Call(f_flows));
+
+    emit_prev_snapshot(f, regions);
+}
+
+/// The RK2 (Heun) step (`vm.rs:788-838`): two stages, the time juggling, the
+/// final flows-only re-eval, and the `prev_values` snapshot.
+fn emit_rk2_step(
+    f: &mut Function,
+    f_flows: u32,
+    f_stocks: u32,
+    dt: f64,
+    stock_offsets: &[usize],
+    regions: &RunRegions,
+) {
+    let (saved, accum) = (regions.rk_saved_base, regions.rk_accum_base);
+    // The `next` chunk's slot-0 byte base == the chunk stride; see
+    // `emit_save_advance`.
+    let next_base = regions.stride;
+
+    // saved_time = curr[TIME]
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::F64Load(memarg(TIME_ADDR)));
+    f.instruction(&I::LocalSet(L_SAVED_TIME));
+
+    // Stage 1 at (t, y): s1 = next-curr; saved=curr; accum=s1; curr=saved+s1
+    emit_eval_step(f, f_flows, f_stocks);
+    for (i, &off) in stock_offsets.iter().enumerate() {
+        let (i, off) = (i as u32, off as u32);
+        emit_compute_stage_delta(f, next_base, off);
+        // saved[i] = curr[off]
+        emit_store_slot_addr(f);
+        emit_load_slot(f, CURR_BASE, off);
+        emit_store_slot_value(f, saved, i);
+        // accum[i] = s1
+        emit_store_slot_addr(f);
+        f.instruction(&I::LocalGet(L_RK_S));
+        emit_store_slot_value(f, accum, i);
+        // curr[off] = saved[i] + s1   (full Euler step for the trial point)
+        emit_store_slot_addr(f);
+        emit_load_slot(f, saved, i);
+        f.instruction(&I::LocalGet(L_RK_S));
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, CURR_BASE, off);
+    }
+    // curr[TIME] = saved_time + dt
+    emit_store_time_offset(f, dt);
+
+    // Stage 2 at (t+dt, y+s1): s2 = next-curr; accum+=s2;
+    // next[off] = saved[i] + accum[i]/2; curr[off] = saved[i]
+    emit_eval_step(f, f_flows, f_stocks);
+    for (i, &off) in stock_offsets.iter().enumerate() {
+        let (i, off) = (i as u32, off as u32);
+        emit_compute_stage_delta(f, next_base, off);
+        // accum[i] += s2
+        emit_store_slot_addr(f);
+        emit_load_slot(f, accum, i);
+        f.instruction(&I::LocalGet(L_RK_S));
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, accum, i);
+        // next[off] = saved[i] + accum[i]/2.0
+        emit_store_slot_addr(f);
+        emit_load_slot(f, saved, i);
+        emit_load_slot(f, accum, i);
+        f.instruction(&f64_const(2.0));
+        f.instruction(&I::F64Div);
+        f.instruction(&I::F64Add);
+        emit_store_slot_value(f, next_base, off);
+        // curr[off] = saved[i]  (restore the original)
+        emit_store_slot_addr(f);
+        emit_load_slot(f, saved, i);
+        emit_store_slot_value(f, CURR_BASE, off);
+    }
+
+    // curr[TIME] = saved_time ; next[TIME] = saved_time + dt
+    emit_restore_and_advance_time(f, dt, regions);
+
+    // Final flows-only re-eval with restored curr (see the RK4 comment).
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::Call(f_flows));
+
+    emit_prev_snapshot(f, regions);
+}
+
+/// `curr[TIME] = saved_time + offset` -- the trial-point time the stages run at
+/// (`saved_time + dt*0.5` or `saved_time + dt`).
+fn emit_store_time_offset(f: &mut Function, offset: f64) {
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::LocalGet(L_SAVED_TIME));
+    f.instruction(&f64_const(offset));
+    f.instruction(&I::F64Add);
+    f.instruction(&I::F64Store(memarg(TIME_ADDR)));
+}
+
+/// Restore `curr[TIME] = saved_time` and set `next[TIME] = saved_time + dt`
+/// (`vm.rs:759-760` / `818-819`), so the final flows re-eval runs at time `t`.
+/// `next[TIME]` is set for faithfulness with the VM even though the wasm
+/// save/advance tail advances via `curr[TIME] += dt` rather than reading it.
+fn emit_restore_and_advance_time(f: &mut Function, dt: f64, regions: &RunRegions) {
+    let next_time_addr = u64::from(regions.n_slots) * u64::from(SLOT_SIZE) + TIME_ADDR;
+    // curr[TIME] = saved_time
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::LocalGet(L_SAVED_TIME));
+    f.instruction(&I::F64Store(memarg(TIME_ADDR)));
+    // next[TIME] = saved_time + dt
+    f.instruction(&I::I32Const(0));
+    f.instruction(&I::LocalGet(L_SAVED_TIME));
+    f.instruction(&f64_const(dt));
+    f.instruction(&I::F64Add);
+    f.instruction(&I::F64Store(memarg(next_time_addr)));
+}
+
+/// Emit an unrolled `dst[0..n_slots] := src[0..n_slots]` f64 copy between two
+/// linear-memory regions whose slot-0 byte bases are `src_base`/`dst_base`. Used
+/// for the whole-chunk snapshots (`initial_values := curr`, `prev_values :=
+/// curr`), each `n_slots` wide. The unroll matches the per-slot store style the
+/// rest of `run` uses; `n_slots` is small for scalar models.
+fn emit_copy_chunk(f: &mut Function, src_base: u32, dst_base: u32, n_slots: u32) {
+    for slot in 0..n_slots {
+        let slot_off = u64::from(slot) * u64::from(SLOT_SIZE);
+        // f64.store wants [addr_i32, value_f64]; the constant `memarg.offset`
+        // carries each region's base, so the dynamic address is a constant 0.
+        f.instruction(&I::I32Const(0));
+        f.instruction(&I::I32Const(0));
+        f.instruction(&I::F64Load(memarg(u64::from(src_base) + slot_off)));
+        f.instruction(&I::F64Store(memarg(u64::from(dst_base) + slot_off)));
+    }
+}
+
+/// Inputs to [`assemble_simulation`], grouped to keep the signature small now
+/// that the module carries a per-instance function-triple (one per
+/// `(model, input_set)`) plus a `run` driver, and possibly several GF regions.
+struct AssembleParts<'a> {
+    helpers: BuiltHelpers,
+    /// The instances' program functions in `instance_order`, flattened as
+    /// `[initials_0, flows_0, stocks_0, initials_1, ...]`. `instance_input_counts`
+    /// (same instance order) gives each triple's f64 input-param count.
+    program_fns: Vec<Function>,
+    run_fn: Function,
+    /// `set_value(offset: i32, val: f64) -> i32` (Phase 7 Task 2).
+    set_value_fn: Function,
+    /// `reset() -> ()` (Phase 7 Task 2).
+    reset_fn: Function,
+    /// `clear_values() -> ()` (Phase 7 Task 2).
+    clear_values_fn: Function,
+    /// Module-input parameter count per instance, in the same order the triples
+    /// appear in `program_fns`. Drives the per-triple wasm type
+    /// (`(i32, f64*k) -> ()`).
+    instance_input_counts: &'a [u32],
+    pages: u32,
+    n_slots: u32,
+    n_chunks: u32,
+    results_base: u32,
+    /// Every GF-bearing instance's region image, for the active `DataSection`
+    /// segments (each instance's directory + data sit at distinct bases).
+    gf_regions: &'a [&'a GfRegions],
+    /// The constants-override region init payloads (Phase 7 Task 2): sparse
+    /// active `DataSection` segments seeding each overridable slot's f64 default
+    /// and its validity byte.
+    const_init: &'a ConstRegionInit,
+}
+
+/// Assemble the simulation module: types, functions, memory, globals, exports,
+/// code, and (when present) the GF data segments. Layout: the emitted helper
+/// functions ([`build_helpers`]) lead the function/code sections (indices
+/// `0..n_helpers`); then one `[initials, flows, stocks]` triple per module
+/// instance (in `instance_order`); then `run` last. Exports `memory`, `run`, and
+/// the three self-describing i32 geometry globals. Each GF-bearing instance
+/// contributes two active `DataSection` segments (its directory + data) at its
+/// own bases.
+fn assemble_simulation(parts: AssembleParts) -> Vec<u8> {
+    let AssembleParts {
+        helpers,
+        program_fns,
+        run_fn,
+        set_value_fn,
+        reset_fn,
+        clear_values_fn,
+        instance_input_counts,
+        pages,
+        n_slots,
+        n_chunks,
+        results_base,
+        gf_regions,
+        const_init,
+    } = parts;
+
+    let mut wasm = WasmModule::new();
+    let n_helpers = helpers.functions.len() as u32;
+    let n_instances = instance_input_counts.len() as u32;
+    // Function layout: helpers, the per-instance triples, then `run`, then the
+    // three constants-override exports (`set_value`/`reset`/`clear_values`).
+    let run_fn_index = n_helpers + n_instances * FUNCS_PER_INSTANCE;
+    let set_value_fn_index = run_fn_index + 1;
+    let reset_fn_index = run_fn_index + 2;
+    let clear_values_fn_index = run_fn_index + 3;
+
+    // Type section: `run`'s `() -> ()` first, then one opcode-program type per
+    // *distinct* module-input count (`(i32, f64*k) -> ()`, sorted), then the
+    // helper types, then the `set_value` type (`(i32, f64) -> i32`).
+    // `reset`/`clear_values` reuse `TYPE_RUN_FN`. `opcode_type_for` maps an
+    // instance's `n_inputs` to its type index; a helper at function index `i`
+    // uses the type appended after those.
+    let mut distinct_inputs: Vec<u32> = instance_input_counts.to_vec();
+    distinct_inputs.sort_unstable();
+    distinct_inputs.dedup();
+    let opcode_type_index: HashMap<u32, u32> = distinct_inputs
+        .iter()
+        .enumerate()
+        .map(|(i, &k)| (k, TYPE_RUN_FN + 1 + i as u32))
+        .collect();
+    let first_helper_type = TYPE_RUN_FN + 1 + distinct_inputs.len() as u32;
+    let set_value_type = first_helper_type + helpers.functions.len() as u32;
+
+    let mut types = TypeSection::new();
+    types.ty().function([], []); // TYPE_RUN_FN: () -> ()
+    for &k in &distinct_inputs {
+        // (module_off: i32, in_0..in_{k-1}: f64) -> ()
+        let mut params: Vec<ValType> = Vec::with_capacity(1 + k as usize);
+        params.push(ValType::I32);
+        params.extend(std::iter::repeat_n(ValType::F64, k as usize));
+        types.ty().function(params, []);
+    }
+    for hf in &helpers.functions {
+        types.ty().function(hf.params.clone(), hf.results.clone());
+    }
+    // `set_value(offset: i32, val: f64) -> i32`.
+    types
+        .ty()
+        .function([ValType::I32, ValType::F64], [ValType::I32]);
+    wasm.section(&types);
+
+    // Function section: helpers first (indices `0..n_helpers`), then each
+    // instance's three program functions (typed by that instance's `n_inputs`),
+    // then `run`, then `set_value`/`reset`/`clear_values`.
+    let mut functions = FunctionSection::new();
+    for (i, _) in helpers.functions.iter().enumerate() {
+        functions.function(first_helper_type + i as u32);
+    }
+    for &k in instance_input_counts {
+        let ty = opcode_type_index[&k];
+        functions.function(ty); // initials
+        functions.function(ty); // flows
+        functions.function(ty); // stocks
+    }
+    functions.function(TYPE_RUN_FN); // run
+    functions.function(set_value_type); // set_value
+    functions.function(TYPE_RUN_FN); // reset
+    functions.function(TYPE_RUN_FN); // clear_values
+    wasm.section(&functions);
+
+    let mut memories = MemorySection::new();
+    memories.memory(MemoryType {
+        minimum: u64::from(pages),
+        maximum: None,
+        memory64: false,
+        shared: false,
+        page_size_log2: None,
+    });
+    wasm.section(&memories);
+
+    let i32_global = || GlobalType {
+        val_type: ValType::I32,
+        mutable: false,
+        shared: false,
+    };
+    let mut globals = GlobalSection::new();
+    globals.global(i32_global(), &ConstExpr::i32_const(n_slots as i32));
+    globals.global(i32_global(), &ConstExpr::i32_const(n_chunks as i32));
+    globals.global(i32_global(), &ConstExpr::i32_const(results_base as i32));
+    // `use_prev_fallback`: the only mutable global. Init 1 so `LoadPrev` returns
+    // its fallback until the first `prev_values` snapshot clears it (`vm.rs:668`).
+    globals.global(
+        GlobalType {
+            val_type: ValType::I32,
+            mutable: true,
+            shared: false,
+        },
+        &ConstExpr::i32_const(1),
+    );
+    wasm.section(&globals);
+
+    let mut exports = ExportSection::new();
+    exports.export("run", ExportKind::Func, run_fn_index);
+    exports.export("set_value", ExportKind::Func, set_value_fn_index);
+    exports.export("reset", ExportKind::Func, reset_fn_index);
+    exports.export("clear_values", ExportKind::Func, clear_values_fn_index);
+    exports.export("memory", ExportKind::Memory, 0);
+    exports.export("n_slots", ExportKind::Global, G_N_SLOTS);
+    exports.export("n_chunks", ExportKind::Global, G_N_CHUNKS);
+    exports.export("results_offset", ExportKind::Global, G_RESULTS_OFFSET);
+    wasm.section(&exports);
+
+    // Code section order must match the function section: helper bodies, then the
+    // per-instance program functions (in `program_fns` order), then `run`, then
+    // `set_value`/`reset`/`clear_values`.
+    let mut code = CodeSection::new();
+    for hf in &helpers.functions {
+        code.function(&hf.body);
+    }
+    for program in &program_fns {
+        code.function(program);
+    }
+    code.function(&run_fn);
+    code.function(&set_value_fn);
+    code.function(&reset_fn);
+    code.function(&clear_values_fn);
+    wasm.section(&code);
+
+    // The GF directory + data regions and the constants-override init values
+    // are read-only-at-instantiation constants; active data segments write each
+    // at its byte address when the module is instantiated. A module has at most
+    // one data section, so the GF regions and the constants-override init share
+    // it. The data section must follow the code section per the wasm binary order.
+    let has_const_init =
+        !const_init.value_segments.is_empty() || !const_init.valid_segments.is_empty();
+    if !gf_regions.is_empty() || has_const_init {
+        let mut data = DataSection::new();
+        for gf in gf_regions {
+            data.active(
+                0,
+                &ConstExpr::i32_const(gf.directory_base as i32),
+                gf.directory.iter().copied(),
+            );
+            data.active(
+                0,
+                &ConstExpr::i32_const(gf.data_base as i32),
+                gf.data.iter().copied(),
+            );
+        }
+        // The constants region's per-slot default (8 LE bytes each) and its
+        // validity bytes (a single `1` each), one active segment per overridable
+        // absolute offset.
+        for &(addr, bytes) in &const_init.value_segments {
+            data.active(0, &ConstExpr::i32_const(addr as i32), bytes.iter().copied());
+        }
+        for &addr in &const_init.valid_segments {
+            data.active(0, &ConstExpr::i32_const(addr as i32), [1u8].iter().copied());
+        }
+        wasm.section(&data);
+    }
+
+    wasm.finish()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::common::{Canonical, Ident};
+    use crate::compat::open_xmile;
+    use crate::db::{SimlinDb, compile_project_incremental, sync_from_datamodel_incremental};
+    use crate::vm::Vm;
+    use checked::Store;
+    use std::io::BufReader;
+    use wasm::validate;
+
+    const POPULATION_XMILE: &str = concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/../../default_projects/population/model.xmile"
+    );
+
+    /// A graphical function whose table is `knots`. `Continuous` kind, with the
+    /// x-scale spanning the knots' x-range.
+    fn gf_from_knots(knots: &[(f64, f64)]) -> crate::datamodel::GraphicalFunction {
+        use crate::datamodel;
+        let x_points: Vec<f64> = knots.iter().map(|&(x, _)| x).collect();
+        let y_points: Vec<f64> = knots.iter().map(|&(_, y)| y).collect();
+        datamodel::GraphicalFunction {
+            kind: datamodel::GraphicalFunctionKind::Continuous,
+            x_points: Some(x_points.clone()),
+            y_points,
+            x_scale: datamodel::GraphicalFunctionScale {
+                min: x_points.first().copied().unwrap_or(0.0),
+                max: x_points.last().copied().unwrap_or(1.0),
+            },
+            y_scale: datamodel::GraphicalFunctionScale { min: 0.0, max: 1.0 },
+        }
+    }
+
+    /// Decode a GF directory's `n`th entry from `directory` bytes: the absolute
+    /// data byte offset and the point count.
+    fn decode_dir_entry(directory: &[u8], n: usize) -> (usize, usize) {
+        let base = n * GF_DIRECTORY_ENTRY_BYTES as usize;
+        let data_off = i32::from_le_bytes(directory[base..base + 4].try_into().unwrap()) as usize;
+        let n_points =
+            i32::from_le_bytes(directory[base + 4..base + 8].try_into().unwrap()) as usize;
+        (data_off, n_points)
+    }
+
+    /// Decode the `(x, y)` knots stored at relative `data` offset `rel_off` for
+    /// a table of `n_points` (interleaved f64 LE x,y pairs).
+    fn decode_knots(data: &[u8], rel_off: usize, n_points: usize) -> Vec<(f64, f64)> {
+        (0..n_points)
+            .map(|k| {
+                let a = rel_off + k * GF_KNOT_BYTES as usize;
+                let x = f64::from_le_bytes(data[a..a + 8].try_into().unwrap());
+                let y = f64::from_le_bytes(data[a + 8..a + 16].try_into().unwrap());
+                (x, y)
+            })
+            .collect()
+    }
+
+    /// Task 1 (pure layout): `build_gf_regions` concatenates several tables into
+    /// the data region in order, and the directory maps each global table index
+    /// to its *absolute* data byte offset + point count. The data offset for
+    /// table `t` must be `data_base` plus the byte span of all earlier tables.
+    #[test]
+    fn build_gf_regions_lays_out_directory_and_data() {
+        let region_base = 4096u32;
+        let tables = vec![
+            vec![(0.0, 10.0), (1.0, 20.0), (2.5, 5.0)],
+            vec![(-1.0, 0.5)],
+            vec![(0.0, 0.0), (10.0, 100.0)],
+        ];
+        let regions = build_gf_regions(&tables, region_base)
+            .expect("layout must succeed")
+            .expect("non-empty tables yield Some");
+
+        // Directory immediately at region_base; data follows the directory.
+        assert_eq!(regions.directory_base, region_base);
+        let directory_bytes = tables.len() as u32 * GF_DIRECTORY_ENTRY_BYTES;
+        assert_eq!(regions.data_base, region_base + directory_bytes);
+        assert_eq!(regions.directory.len(), directory_bytes as usize);
+
+        // Walk the directory; each table's data offset is absolute and its
+        // knots round-trip exactly. The running expected offset is data_base
+        // plus the byte span of all previously-laid tables.
+        let mut expected_abs = regions.data_base as usize;
+        let mut total_knot_bytes = 0usize;
+        for (t, table) in tables.iter().enumerate() {
+            let (data_off, n_points) = decode_dir_entry(&regions.directory, t);
+            assert_eq!(n_points, table.len(), "table {t} point count");
+            assert_eq!(data_off, expected_abs, "table {t} absolute data offset");
+
+            let rel = data_off - regions.data_base as usize;
+            assert_eq!(
+                decode_knots(&regions.data, rel, n_points).as_slice(),
+                table.as_slice(),
+                "table {t} knots round-trip"
+            );
+
+            let span = table.len() * GF_KNOT_BYTES as usize;
+            expected_abs += span;
+            total_knot_bytes += span;
+        }
+        assert_eq!(
+            regions.total_bytes as usize,
+            directory_bytes as usize + total_knot_bytes,
+            "total span covers directory + all knots"
+        );
+    }
+
+    /// Task 3 (pure serializer): a `WasmLayout` round-trips through
+    /// `serialize`/`deserialize` -- the geometry and the full name->offset map are
+    /// recovered exactly. The GF offsets are not part of the wire format (a host
+    /// reads results by name), so they come back as 0.
+    #[test]
+    fn wasm_layout_serialize_round_trips() {
+        let layout = WasmLayout {
+            n_slots: 7,
+            n_chunks: 101,
+            results_offset: 112,
+            gf_directory_offset: 4096,
+            gf_data_offset: 4104,
+            var_offsets: vec![
+                ("time".to_string(), 0),
+                ("population".to_string(), 4),
+                ("a_var_with_a_longer_name".to_string(), 6),
+            ],
+        };
+        let bytes = layout.serialize();
+        let back = WasmLayout::deserialize(&bytes).expect("round-trip must succeed");
+        assert_eq!(back.n_slots, 7);
+        assert_eq!(back.n_chunks, 101);
+        assert_eq!(back.results_offset, 112);
+        assert_eq!(back.var_offsets, layout.var_offsets);
+        // The GF offsets are not serialized; they reconstruct as 0.
+        assert_eq!(back.gf_directory_offset, 0);
+        assert_eq!(back.gf_data_offset, 0);
+    }
+
+    /// Task 3 (serializer robustness): a truncated buffer deserializes to `None`
+    /// rather than panicking, so a host handed a corrupt buffer fails cleanly.
+    #[test]
+    fn wasm_layout_deserialize_truncated_is_none() {
+        let layout = WasmLayout {
+            n_slots: 2,
+            n_chunks: 3,
+            results_offset: 32,
+            gf_directory_offset: 0,
+            gf_data_offset: 0,
+            var_offsets: vec![("x".to_string(), 0), ("y".to_string(), 1)],
+        };
+        let bytes = layout.serialize();
+        // Every strict prefix of a valid buffer must fail to parse (each cuts off
+        // a length-prefixed field mid-way).
+        for cut in 0..bytes.len() {
+            assert!(
+                WasmLayout::deserialize(&bytes[..cut]).is_none(),
+                "a buffer truncated to {cut} bytes must not deserialize"
+            );
+        }
+        // The full buffer parses.
+        assert!(WasmLayout::deserialize(&bytes).is_some());
+    }
+
+    /// Task 1 (pure layout): an empty table list yields no regions and no
+    /// growth, so a model without graphical functions is unaffected.
+    #[test]
+    fn build_gf_regions_empty_is_none() {
+        assert!(
+            build_gf_regions(&[], 4096)
+                .expect("layout must succeed")
+                .is_none(),
+            "no tables -> no GF regions"
+        );
+    }
+
+    /// Task 1 (data-section round-trip): the GF regions reach the instantiated
+    /// module's linear memory via the active `DataSection`, at the bases the
+    /// directory advertises. Reads the directory entry for table 0 from memory,
+    /// follows its absolute data offset, and asserts the `(x, y)` knots are
+    /// present with the right count -- the contract the `Lookup` opcode (Task 3)
+    /// relies on. (Exercised end-to-end through a GF *model* once the opcode
+    /// lowers, in `compile_simulation_gf_lookup_modes_match_vm`.)
+    #[test]
+    fn assembled_module_initializes_gf_regions_in_memory() {
+        let knots = [(0.0, 10.0), (1.0, 20.0), (2.5, 5.0), (4.0, 40.0)];
+        let region_base = WASM_PAGE_SIZE; // one page in, comfortably past slot 0
+        let regions = build_gf_regions(std::slice::from_ref(&knots.to_vec()), region_base)
+            .expect("layout")
+            .expect("non-empty");
+
+        // A minimal module: one empty exported `run` (so the assembler shape is
+        // exercised) is unnecessary here -- assert directly that the active data
+        // segments initialize memory. Assemble via the production assembler with
+        // a single root instance of three empty (0-input) program functions.
+        let helpers = build_helpers();
+        let empty = || {
+            let mut f = Function::new([]);
+            f.instruction(&I::End);
+            f
+        };
+        let pages = (region_base + regions.total_bytes)
+            .div_ceil(WASM_PAGE_SIZE)
+            .max(1);
+        let empty_const_init = ConstRegionInit {
+            value_segments: Vec::new(),
+            valid_segments: Vec::new(),
+        };
+        let wasm = assemble_simulation(AssembleParts {
+            helpers,
+            program_fns: vec![empty(), empty(), empty()],
+            run_fn: empty(),
+            // Empty (no-op) override functions: this test only checks the GF data
+            // segments, so the override exports are present but trivial.
+            set_value_fn: {
+                let mut f = Function::new([]);
+                // A `(i32, f64) -> i32` body must leave an i32 on the stack.
+                f.instruction(&I::I32Const(0));
+                f.instruction(&I::End);
+                f
+            },
+            reset_fn: empty(),
+            clear_values_fn: empty(),
+            instance_input_counts: &[0],
+            pages,
+            n_slots: 0,
+            n_chunks: 0,
+            results_base: 0,
+            gf_regions: &[&regions],
+            const_init: &empty_const_init,
+        });
+
+        let info = validate(&wasm).expect("module must validate");
+        let mut store = Store::new(());
+        let inst = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        let mem = store
+            .instance_export(inst, "memory")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+
+        let dir_off = regions.directory_base as usize;
+        let (data_off, n_points, flat) = store.mem_access_mut_slice(mem, |bytes| {
+            let data_off =
+                i32::from_le_bytes(bytes[dir_off..dir_off + 4].try_into().unwrap()) as usize;
+            let n_points =
+                i32::from_le_bytes(bytes[dir_off + 4..dir_off + 8].try_into().unwrap()) as usize;
+            let flat: Vec<f64> = (0..n_points * 2)
+                .map(|i| {
+                    let a = data_off + i * 8;
+                    f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
+                })
+                .collect();
+            (data_off, n_points, flat)
+        });
+
+        assert_eq!(n_points, knots.len(), "directory point count");
+        assert_eq!(
+            data_off, regions.data_base as usize,
+            "table 0's data offset is the start of the data region"
+        );
+        for (k, &(x, y)) in knots.iter().enumerate() {
+            assert_eq!(flat[2 * k], x, "knot {k} x");
+            assert_eq!(flat[2 * k + 1], y, "knot {k} y");
+        }
+    }
+
+    /// Task 3 (end-to-end): a model with a graphical-function variable looked up
+    /// in all three modes -- `LOOKUP` (Interpolate), `LOOKUP FORWARD`, and
+    /// `LOOKUP BACKWARD` -- matches the VM at every saved step. The lookup index
+    /// is `TIME - 1`, which sweeps the table's x-domain plus a below-range
+    /// margin (negative at t=0) and an above-range margin, so the recorded
+    /// series exercise below/at-knot/between/above across the run.
+    #[test]
+    fn compile_simulation_gf_lookup_modes_match_vm() {
+        let knots = [(0.0, 10.0), (1.0, 20.0), (2.5, 5.0), (4.0, 40.0)];
+        let datamodel = crate::test_common::TestProject::new("gf_modes")
+            // TIME 0..6, dt 0.25 -> index = TIME-1 sweeps -1..5 over [0,4] table.
+            .with_sim_time(0.0, 6.0, 0.25)
+            .aux("input", "TIME - 1", None)
+            .aux_with_gf("curve", "0", gf_from_knots(&knots))
+            .aux("interp_val", "LOOKUP(curve, input)", None)
+            .aux("fwd_val", "LOOKUP_FORWARD(curve, input)", None)
+            .aux("bwd_val", "LOOKUP_BACKWARD(curve, input)", None)
+            .build_datamodel();
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        let checked = assert_matches_vm(sim, &artifact);
+        // All five variables must reach parity: the three lookup-mode results
+        // (interp/fwd/bwd), the lookup-only `curve` holder they read, and its
+        // `input`. Pinning >= 5 (not just the 3 lookup modes) proves the
+        // lookup-only curve holder and its driver also match the VM.
+        assert!(
+            checked >= 5,
+            "expected to compare interp/fwd/bwd + curve + input, only checked {checked}"
+        );
+        for name in ["interp_val", "fwd_val", "bwd_val"] {
+            assert!(
+                artifact.layout.var_offsets.iter().any(|(n, _)| n == name),
+                "{name} should be in the layout"
+            );
+        }
+    }
+
+    /// The FFI entry point goes through the salsa pipeline + `compile_simulation`
+    /// and returns a non-empty blob that validates under the interpreter.
+    #[test]
+    fn compile_datamodel_to_wasm_validates() {
+        let file = std::fs::File::open(POPULATION_XMILE).expect("open population model");
+        let mut reader = BufReader::new(file);
+        let datamodel = open_xmile(&mut reader).expect("parse population xmile");
+
+        let wasm = compile_datamodel_to_wasm(&datamodel, "main").expect("wasm codegen");
+        assert!(!wasm.is_empty(), "blob should be non-empty");
+        validate(&wasm).expect("blob must validate under the interpreter");
+    }
+
+    // ── compile_simulation (CompiledSimulation -> wasm) ───────────────────
+
+    /// Build a `CompiledSimulation` for the named model of `datamodel` via the
+    /// production incremental pipeline (the same path the VM corpus uses).
+    fn compile_sim(datamodel: &crate::datamodel::Project, model_name: &str) -> CompiledSimulation {
+        let mut db = SimlinDb::default();
+        let sync = sync_from_datamodel_incremental(&mut db, datamodel, None);
+        compile_project_incremental(&db, sync.project, model_name).expect("incremental compile")
+    }
+
+    /// Run a `WasmArtifact` under the DLR-FT interpreter and return the
+    /// step-major results slab (`n_chunks * n_slots` f64, row-major by step).
+    fn run_artifact_results(artifact: &WasmArtifact) -> Vec<f64> {
+        let info = validate(&artifact.wasm).expect("generated module must validate");
+        let mut store = Store::new(());
+        let inst = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        let run = store
+            .instance_export(inst, "run")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        store
+            .invoke_simple_typed::<(), ()>(run, ())
+            .expect("run wasm");
+        let mem = store
+            .instance_export(inst, "memory")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        let n = artifact.layout.n_chunks * artifact.layout.n_slots;
+        let base = artifact.layout.results_offset;
+        store.mem_access_mut_slice(mem, |bytes| {
+            (0..n)
+                .map(|i| {
+                    let a = base + i * 8;
+                    f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
+                })
+                .collect()
+        })
+    }
+
+    /// Assert every variable in `artifact.layout` matches the VM's series for
+    /// the same `CompiledSimulation`. Returns the number of variables checked.
+    fn assert_matches_vm(sim: CompiledSimulation, artifact: &WasmArtifact) -> usize {
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+        let wasm_data = run_artifact_results(artifact);
+
+        let mut vm = Vm::new(sim).expect("vm creation");
+        vm.run_to_end().expect("vm run");
+        let vm_results = vm.into_results();
+
+        assert_eq!(
+            vm_results.step_count, n_chunks,
+            "saved-chunk count differs from VM"
+        );
+
+        let mut checked = 0usize;
+        for (name, wasm_off) in &artifact.layout.var_offsets {
+            let wasm_off = *wasm_off;
+            let ident = Ident::<Canonical>::from_str_unchecked(name);
+            let Some(&vm_off) = vm_results.offsets.get(&ident) else {
+                continue;
+            };
+            for c in 0..n_chunks {
+                let vm_val = vm_results.data[c * vm_results.step_size + vm_off];
+                let wasm_val = wasm_data[c * n_slots + wasm_off];
+                let diff = (vm_val - wasm_val).abs();
+                assert!(
+                    diff < 1e-9,
+                    "{name} mismatch at chunk {c}: vm={vm_val} wasm={wasm_val} (diff {diff})",
+                );
+            }
+            checked += 1;
+        }
+        checked
+    }
+
+    /// End-to-end VM parity for the `AllocateAvailable` opcode on the real
+    /// `allocate.xmile` corpus model. The model's supply ramps from 0 to 10
+    /// over the run while total demand is 9, so the recorded series sweep all
+    /// three regimes -- `avail <= 0` (zeros) early, the partial-allocation
+    /// bisection over rectangular priority profiles in the middle, and
+    /// `avail >= total_demand` (full grant) once supply exceeds demand --
+    /// against `Vm::new(sim).run_to_end()`. (The model is NOT in the active
+    /// `wasm_parity_floor` corpus; raising that floor is a separate task.)
+    #[test]
+    fn compile_simulation_allocate_available_matches_vm() {
+        let path = concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/../../test/sdeverywhere/models/allocate/allocate.xmile"
+        );
+        let file = std::fs::File::open(path).expect("open allocate xmile");
+        let mut reader = BufReader::new(file);
+        let datamodel = open_xmile(&mut reader).expect("parse allocate xmile");
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("allocate wasm codegen");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(
+            checked >= 5,
+            "expected to compare the allocate model's variables, only checked {checked}"
+        );
+        assert!(
+            artifact
+                .layout
+                .var_offsets
+                .iter()
+                .any(|(n, _)| n.starts_with("shipments")),
+            "the arrayed shipments allocation should be in the layout"
+        );
+    }
+
+    #[test]
+    fn compile_simulation_population_matches_vm() {
+        let file = std::fs::File::open(POPULATION_XMILE).expect("open population model");
+        let mut reader = BufReader::new(file);
+        let datamodel = open_xmile(&mut reader).expect("parse population xmile");
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        // Geometry is self-consistent with the specs.
+        let specs = Specs::from(&datamodel.sim_specs);
+        assert_eq!(artifact.layout.n_chunks, specs.n_chunks);
+
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(
+            checked >= 5,
+            "expected to compare the population model's variables, only checked {checked}"
+        );
+        assert!(
+            artifact
+                .layout
+                .var_offsets
+                .iter()
+                .any(|(n, _)| n == "population"),
+            "the population stock should be in the layout"
+        );
+    }
+
+    #[test]
+    fn compile_simulation_simple_stock_flow_matches_vm() {
+        // A minimal scalar Euler model: a stock filled by a constant inflow.
+        let datamodel = crate::test_common::TestProject::new("simple")
+            .with_sim_time(0.0, 10.0, 1.0)
+            .aux("inflow_rate", "2", None)
+            .stock("level", "0", &["inflow"], &[], None)
+            .flow("inflow", "inflow_rate", None)
+            .build_datamodel();
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 2, "expected to compare level + inflow");
+        // level should integrate to 2*10 = 20 by the last step.
+        let last = run_artifact_results(&artifact);
+        let n_slots = artifact.layout.n_slots;
+        let level_off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == "level")
+            .map(|(_, off)| *off)
+            .expect("level offset");
+        let last_step = (artifact.layout.n_chunks - 1) * n_slots + level_off;
+        assert!(
+            (last[last_step] - 20.0).abs() < 1e-9,
+            "level should reach 20"
+        );
+    }
+
+    #[test]
+    fn compile_simulation_save_step_cadence_matches_vm() {
+        // Exercises the conditional-save / non-save-step copy-back branch of
+        // `save_advance!` (`vm.rs:682`): with save_step = 2*dt, most steps copy
+        // `next -> curr` WITHOUT recording a snapshot, and only every other step
+        // (plus the forced t=start sample) writes a results row. Every other
+        // wasmgen test uses save_step = None (save_every = 1), so this is the
+        // only coverage of the multi-step cadence.
+        let mut datamodel = crate::test_common::TestProject::new("cadence")
+            .with_sim_time(0.0, 10.0, 1.0)
+            .aux("inflow_rate", "2", None)
+            .stock("level", "0", &["inflow"], &[], None)
+            .flow("inflow", "inflow_rate", None)
+            .build_datamodel();
+        // `with_sim_time` clears save_step to dt; the builder has no
+        // `with_save_step`, so set it directly: save_step = 2, dt = 1.
+        datamodel.sim_specs.save_step = Some(crate::datamodel::Dt::Dt(2.0));
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        // dt=1, save_step=2 over [0,10] saves at t=0,2,4,6,8,10 -> 6 chunks.
+        assert_eq!(
+            artifact.layout.n_chunks, 6,
+            "save_step = 2*dt over [0,10] should yield 6 saved samples"
+        );
+
+        // Per-variable series + saved-chunk count both match the VM (which
+        // `assert_matches_vm` asserts via `step_count == n_chunks`).
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 2, "expected to compare level + inflow");
+    }
+
+    #[test]
+    fn compile_simulation_conditional_model_matches_vm() {
+        // Exercises the SetCond/If lowering through the whole-model path.
+        let datamodel = crate::test_common::TestProject::new("cond")
+            .with_sim_time(0.0, 5.0, 1.0)
+            .aux("threshold", "3", None)
+            .aux("gated", "IF TIME > threshold THEN 10 ELSE 1", None)
+            .stock("acc", "0", &["gated_flow"], &[], None)
+            .flow("gated_flow", "gated", None)
+            .build_datamodel();
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 2, "expected to compare gated + acc");
+    }
+
+    // ── PREVIOUS / INIT (Task 1: snapshot regions + LoadPrev/LoadInitial) ──
+
+    /// Task 1: `PREVIOUS(x)` under Euler. At t0 the snapshot has not been taken,
+    /// so `LoadPrev` returns its fallback (the 0 the unary `PREVIOUS` desugars
+    /// to); after the first step it returns the prior step's `x`. The series
+    /// must match the VM, which gates the same fallback-vs-snapshot choice on
+    /// `use_prev_fallback`.
+    #[test]
+    fn compile_simulation_previous_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("prev")
+            .with_sim_time(0.0, 5.0, 1.0)
+            // x ramps each step so PREVIOUS(x) is a visibly-lagged series.
+            .stock("x", "10", &["grow"], &[], None)
+            .flow("grow", "1", None)
+            .aux("x_prev", "PREVIOUS(x)", None)
+            .build_datamodel();
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 2, "expected to compare x + x_prev");
+    }
+
+    /// Instantiate `artifact` ONCE and invoke the exported `run` `runs` times in
+    /// sequence with no `reset` between, returning the results slab read after
+    /// each call. Models the wasm backend's documented "instantiate once, re-run
+    /// on every change" usage (interactive scrubbing; the POC's `run` "re-runs
+    /// the whole simulation" per call) -- which exercises the cross-run state
+    /// reset that a single `run` invocation cannot.
+    fn run_artifact_results_repeated(artifact: &WasmArtifact, runs: usize) -> Vec<Vec<f64>> {
+        let info = validate(&artifact.wasm).expect("generated module must validate");
+        let mut store = Store::new(());
+        let inst = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        let n = artifact.layout.n_chunks * artifact.layout.n_slots;
+        let base = artifact.layout.results_offset;
+        let mut out = Vec::with_capacity(runs);
+        for _ in 0..runs {
+            let run = store
+                .instance_export(inst, "run")
+                .unwrap()
+                .as_func()
+                .unwrap();
+            store
+                .invoke_simple_typed::<(), ()>(run, ())
+                .expect("run wasm");
+            let mem = store
+                .instance_export(inst, "memory")
+                .unwrap()
+                .as_mem()
+                .unwrap();
+            let slab = store.mem_access_mut_slice(mem, |bytes| {
+                (0..n)
+                    .map(|i| {
+                        let a = base + i * 8;
+                        f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
+                    })
+                    .collect::<Vec<f64>>()
+            });
+            out.push(slab);
+        }
+        out
+    }
+
+    /// Regression (PR #620 review): `run` reseeds the time globals and reruns
+    /// initials, so it is a complete simulation from t0 and the documented
+    /// per-change entry point for repeated re-simulation. It must therefore
+    /// reset the PREVIOUS fallback flag itself, mirroring the VM's `run_initials`
+    /// (which sets `use_prev_fallback = true` at the start of every run). Without
+    /// that reset, the loop leaves the flag at 0, so a SECOND `run` on the same
+    /// instance reads the first run's final `prev_values` on step 0 (and during
+    /// initials) instead of the fallback -- contaminating any `PREVIOUS(...)`
+    /// model. This instantiates once and runs twice with no `reset` between: a
+    /// deterministic model must produce identical results both times, and
+    /// `x_prev` at t0 must be the unary-PREVIOUS fallback (0), not the stale
+    /// prior-run value.
+    #[test]
+    fn compile_simulation_repeated_run_resets_previous_fallback() {
+        let datamodel = crate::test_common::TestProject::new("prev_repeat")
+            .with_sim_time(0.0, 5.0, 1.0)
+            .stock("x", "10", &["grow"], &[], None)
+            .flow("grow", "1", None)
+            .aux("x_prev", "PREVIOUS(x)", None)
+            .build_datamodel();
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        let runs = run_artifact_results_repeated(&artifact, 2);
+        let (first, second) = (&runs[0], &runs[1]);
+
+        // A deterministic model re-run from t0 produces byte-identical results;
+        // the bug makes the second run's PREVIOUS reads diverge on step 0.
+        assert_eq!(
+            first, second,
+            "second run() diverged from the first -- stale PREVIOUS fallback state leaked across runs"
+        );
+
+        // Pin the discriminating cell: x_prev at the first saved chunk (t0) is
+        // the unary-PREVIOUS fallback (0), not the prior run's final x.
+        let x_prev_off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(name, _)| name == "x_prev")
+            .map(|(_, off)| *off)
+            .expect("x_prev in layout");
+        assert_eq!(
+            second[x_prev_off], 0.0,
+            "x_prev at t0 on the second run must be the PREVIOUS fallback (0), got {}",
+            second[x_prev_off]
+        );
+    }
+
+    /// Regression (PR #620 review): a stock at an absolute slot offset >= 65536
+    /// must address its real slot under RK integration, not `off & 0xFFFF`. Such
+    /// offsets are reachable in a large nested model (each submodel/SMOOTH/DELAY
+    /// instance adds slots; nothing caps total `n_slots` in the wasm path). The
+    /// RK stage delta `next[off] - curr[off]` is computed by
+    /// `emit_compute_stage_delta`; the original bug threaded `off` as `u16`, so a
+    /// stock at offset 65536 read slot `65536 & 0xFFFF == 0` (TIME) instead of its
+    /// own. This drives the helper at offset 65536 over a hand-built memory whose
+    /// slot 0 and slot 65536 hold distinct values and asserts it reads slot 65536
+    /// (matching the Euler advance, which has always used the full-width offset).
+    #[test]
+    fn rk_stage_delta_addresses_stock_above_65535() {
+        // 65536 & 0xFFFF == 0, so a truncated offset would alias slot 0 (TIME).
+        const HIGH_OFF: u32 = 65536;
+        // `curr` holds slots [0, HIGH_OFF]; `next` sits one stride past it.
+        let next_base = (HIGH_OFF + 1) * SLOT_SIZE;
+
+        // probe() -> f64: L_RK_S := next[HIGH_OFF] - curr[HIGH_OFF]; return it.
+        // Locals mirror the run fn so the f64 local L_RK_S (index 4) is valid.
+        let mut probe = Function::new([(3, ValType::I32), (2, ValType::F64)]);
+        emit_compute_stage_delta(&mut probe, next_base, HIGH_OFF);
+        probe.instruction(&I::LocalGet(L_RK_S));
+        probe.instruction(&I::End);
+
+        let mut module = WasmModule::new();
+        let mut types = TypeSection::new();
+        types.ty().function([], [ValType::F64]);
+        module.section(&types);
+        let mut functions = FunctionSection::new();
+        functions.function(0);
+        module.section(&functions);
+        let bytes_needed = next_base + (HIGH_OFF + 1) * SLOT_SIZE;
+        let mut memories = MemorySection::new();
+        memories.memory(MemoryType {
+            minimum: u64::from(bytes_needed.div_ceil(65536) + 1),
+            maximum: None,
+            memory64: false,
+            shared: false,
+            page_size_log2: None,
+        });
+        module.section(&memories);
+        let mut exports = ExportSection::new();
+        exports.export("probe", ExportKind::Func, 0);
+        exports.export("memory", ExportKind::Memory, 0);
+        module.section(&exports);
+        let mut code = CodeSection::new();
+        code.function(&probe);
+        module.section(&code);
+        let wasm = module.finish();
+
+        let info = validate(&wasm).expect("module must validate");
+        let mut store = Store::new(());
+        let inst = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        let mem = store
+            .instance_export(inst, "memory")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        // Seed slot 0 (the alias target under truncation) and slot HIGH_OFF with
+        // distinct values, so reading the wrong slot yields a distinguishable result.
+        let curr_hi = (HIGH_OFF * SLOT_SIZE) as usize;
+        let next0 = next_base as usize;
+        let next_hi = (next_base + HIGH_OFF * SLOT_SIZE) as usize;
+        store.mem_access_mut_slice(mem, |b| {
+            b[0..8].copy_from_slice(&100.0f64.to_le_bytes()); // curr[0]
+            b[next0..next0 + 8].copy_from_slice(&200.0f64.to_le_bytes()); // next[0]
+            b[curr_hi..curr_hi + 8].copy_from_slice(&3.0f64.to_le_bytes()); // curr[HIGH_OFF]
+            b[next_hi..next_hi + 8].copy_from_slice(&10.0f64.to_le_bytes()); // next[HIGH_OFF]
+        });
+        let probe_fn = store
+            .instance_export(inst, "probe")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        let delta: f64 = store
+            .invoke_simple_typed::<(), f64>(probe_fn, ())
+            .expect("probe");
+
+        // next[HIGH_OFF] - curr[HIGH_OFF] = 10 - 3 = 7. A truncated u16 offset
+        // would read slot 0 instead (200 - 100 = 100).
+        assert_eq!(
+            delta, 7.0,
+            "RK stage delta read the wrong slot -- stock offset truncated above 65535?"
+        );
+    }
+
+    /// Task 1: `INIT(x)` referenced from a flow reads the `initial_values`
+    /// snapshot captured once after the initials phase (in the flows/stocks
+    /// programs `LoadInitial` reads `initial_values[off]`, never `curr`). Here
+    /// the inflow is held at `INIT(level)`, so `level` integrates by its own
+    /// initial value each step; the wasm series must match the VM.
+    #[test]
+    fn compile_simulation_init_from_flow_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("init_flow")
+            .with_sim_time(0.0, 5.0, 1.0)
+            .stock("level", "7", &["inflow"], &[], None)
+            // INIT(level) is captured once at t0 (= 7) and stays 7 every step.
+            .flow("inflow", "INIT(level)", None)
+            .build_datamodel();
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 2, "expected to compare level + inflow");
+        // level starts at 7 and grows by INIT(level)=7 each of 5 steps -> 42.
+        let results = run_artifact_results(&artifact);
+        let n_slots = artifact.layout.n_slots;
+        let level_off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == "level")
+            .map(|(_, off)| *off)
+            .expect("level offset");
+        let last = (artifact.layout.n_chunks - 1) * n_slots + level_off;
+        assert!(
+            (results[last] - 42.0).abs() < 1e-9,
+            "level should reach 7 + 5*7 = 42, got {}",
+            results[last]
+        );
+    }
+
+    /// Task 1: `INIT(x)` referenced from *another initial equation* reads
+    /// `curr` during the initials phase (the snapshot is taken only after
+    /// initials run). `seed` is computed during initials, and `derived`'s
+    /// initial equation reads `INIT(seed)` -- which must resolve to the
+    /// just-computed `curr[seed]`, not an as-yet-unwritten `initial_values`.
+    #[test]
+    fn compile_simulation_init_from_initial_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("init_initial")
+            .with_sim_time(0.0, 3.0, 1.0)
+            .aux("seed", "5", None)
+            // A stock whose INITIAL equation reads INIT(seed): during initials
+            // LoadInitial must read curr[seed] (= 5), so derived starts at 5.
+            .stock("derived", "INIT(seed)", &["hold"], &[], None)
+            .flow("hold", "0", None)
+            .build_datamodel();
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 2, "expected to compare seed + derived");
+        // derived initializes to INIT(seed)=5 and the flow holds it there.
+        // Chunk 0 starts at slab offset 0, so `derived_off` indexes it directly.
+        let results = run_artifact_results(&artifact);
+        let derived_off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == "derived")
+            .map(|(_, off)| *off)
+            .expect("derived offset");
+        assert!(
+            (results[derived_off] - 5.0).abs() < 1e-9,
+            "derived should initialize to INIT(seed) = 5, got {}",
+            results[derived_off]
+        );
+    }
+
+    // ── RK2 / RK4 integration loops (Task 2) ──────────────────────────────
+
+    /// A logistic-growth model: `pop' = rate * pop * (1 - pop/capacity)`. The
+    /// nonlinear flow depends on the stock, so RK's trial-point evaluations
+    /// genuinely differ from Euler -- a pure-constant flow would let a broken RK
+    /// loop pass by coincidence.
+    fn logistic_growth(
+        name: &str,
+        method: crate::datamodel::SimMethod,
+    ) -> crate::datamodel::Project {
+        crate::test_common::TestProject::new(name)
+            .with_sim_time(0.0, 20.0, 0.5)
+            .with_sim_method(method)
+            .aux("rate", "0.3", None)
+            .aux("capacity", "1000", None)
+            .stock("pop", "10", &["growth"], &[], None)
+            .flow("growth", "rate * pop * (1 - pop / capacity)", None)
+            .build_datamodel()
+    }
+
+    /// Task 2: an RK4 scalar model matches the VM's saved samples (cadence and
+    /// values). The VM's RK4 loop is the oracle; the emitted four-stage loop
+    /// with time juggling + the end-of-step flows re-eval must reproduce it.
+    #[test]
+    fn compile_simulation_rk4_matches_vm() {
+        let datamodel = logistic_growth("rk4_logistic", crate::datamodel::SimMethod::RungeKutta4);
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (RK4)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 2, "expected to compare pop + growth");
+    }
+
+    /// Task 2: an RK2 (Heun) scalar model matches the VM's saved samples. Same
+    /// nonlinear model so the two-stage trial step is genuinely exercised.
+    #[test]
+    fn compile_simulation_rk2_matches_vm() {
+        let datamodel = logistic_growth("rk2_logistic", crate::datamodel::SimMethod::RungeKutta2);
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (RK2)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 2, "expected to compare pop + growth");
+    }
+
+    /// Task 2: RK4 and RK2 must genuinely differ from Euler on this nonlinear
+    /// model -- otherwise the RK tests above could pass against a loop that
+    /// silently fell back to Euler. Establishes that the oracle (the VM) sees a
+    /// method-dependent trajectory, so wasm-vs-VM parity is a meaningful check.
+    #[test]
+    fn rk_methods_differ_from_euler_in_vm() {
+        let last_pop = |method| {
+            let datamodel = logistic_growth("rk_vs_euler", method);
+            let sim = compile_sim(&datamodel, "main");
+            let mut vm = Vm::new(sim).expect("vm");
+            vm.run_to_end().expect("vm run");
+            let results = vm.into_results();
+            let pop = Ident::<Canonical>::from_str_unchecked("pop");
+            let off = *results.offsets.get(&pop).expect("pop offset");
+            results.data[(results.step_count - 1) * results.step_size + off]
+        };
+        let euler = last_pop(crate::datamodel::SimMethod::Euler);
+        let rk4 = last_pop(crate::datamodel::SimMethod::RungeKutta4);
+        let rk2 = last_pop(crate::datamodel::SimMethod::RungeKutta2);
+        assert!(
+            (euler - rk4).abs() > 1e-6,
+            "RK4 must differ from Euler (euler={euler}, rk4={rk4})"
+        );
+        assert!(
+            (euler - rk2).abs() > 1e-6,
+            "RK2 must differ from Euler (euler={euler}, rk2={rk2})"
+        );
+    }
+
+    /// A coupled two-stock Lotka-Volterra (predator-prey) model. Each stock's
+    /// flows read the *other* stock, so a single RK stage's trial-point
+    /// evaluation interleaves both stocks: `prey`'s `predation` outflow reads
+    /// `predator`, and `predator`'s `growth` inflow reads `prey`. This is what
+    /// the single-stock RK tests cannot exercise -- with two stocks the stage
+    /// math walks `stock_offsets` and keeps each stock's `saved[i]`/`accum[i]`
+    /// and trial `curr[off_i]` independent. A loop that aliased the scratch
+    /// across stocks, or iterated `stock_offsets` in an unstable order, would
+    /// corrupt one stock's trajectory and fail the VM-parity check below.
+    ///
+    /// Classic textbook parameters (alpha/beta/gamma/delta) on a short horizon
+    /// with a small dt: the system oscillates, both stay strictly positive, and
+    /// Euler vs RK4/RK2 visibly diverge (asserted by
+    /// `multi_stock_coupled_diverges_euler_vs_rk_in_vm`). 100 steps keeps the
+    /// un-JITed DLR-FT run well under the per-test budget.
+    fn lotka_volterra(
+        name: &str,
+        method: crate::datamodel::SimMethod,
+    ) -> crate::datamodel::Project {
+        crate::test_common::TestProject::new(name)
+            .with_sim_time(0.0, 5.0, 0.05)
+            .with_sim_method(method)
+            .aux("alpha", "1.1", None)
+            .aux("beta", "0.4", None)
+            .aux("gamma", "0.4", None)
+            .aux("delta", "0.1", None)
+            // prey:     d/dt = alpha*prey - beta*prey*predator
+            .stock("prey", "10", &["prey_birth"], &["predation"], None)
+            .flow("prey_birth", "alpha * prey", None)
+            .flow("predation", "beta * prey * predator", None)
+            // predator: d/dt = delta*prey*predator - gamma*predator
+            .stock("predator", "10", &["pred_growth"], &["pred_death"], None)
+            .flow("pred_growth", "delta * prey * predator", None)
+            .flow("pred_death", "gamma * predator", None)
+            .build_datamodel()
+    }
+
+    /// Meaningfulness precondition for the two-stock RK parity tests: the
+    /// coupled model's trajectory is genuinely method-dependent in the VM (the
+    /// oracle) for *both* stocks. Without this, a wasm RK loop that silently
+    /// degraded to Euler -- or never advanced the second stock -- could pass
+    /// `assert_matches_vm` against a coincidentally-identical VM Euler series.
+    #[test]
+    fn multi_stock_coupled_diverges_euler_vs_rk_in_vm() {
+        let last_two = |method| {
+            let datamodel = lotka_volterra("lv_vs_euler", method);
+            let sim = compile_sim(&datamodel, "main");
+            let mut vm = Vm::new(sim).expect("vm");
+            vm.run_to_end().expect("vm run");
+            let results = vm.into_results();
+            let read = |name: &str| {
+                let id = Ident::<Canonical>::from_str_unchecked(name);
+                let off = *results
+                    .offsets
+                    .get(&id)
+                    .unwrap_or_else(|| panic!("{name} offset"));
+                results.data[(results.step_count - 1) * results.step_size + off]
+            };
+            (read("prey"), read("predator"))
+        };
+        let (e_prey, e_pred) = last_two(crate::datamodel::SimMethod::Euler);
+        let (rk4_prey, rk4_pred) = last_two(crate::datamodel::SimMethod::RungeKutta4);
+        let (rk2_prey, rk2_pred) = last_two(crate::datamodel::SimMethod::RungeKutta2);
+        // Both stocks must move under RK4 and RK2 relative to Euler -- proving
+        // the stage math integrates each independently, not just the first.
+        assert!(
+            (e_prey - rk4_prey).abs() > 1e-6 && (e_pred - rk4_pred).abs() > 1e-6,
+            "RK4 must differ from Euler for both stocks \
+             (prey: euler={e_prey} rk4={rk4_prey}; predator: euler={e_pred} rk4={rk4_pred})"
+        );
+        assert!(
+            (e_prey - rk2_prey).abs() > 1e-6 && (e_pred - rk2_pred).abs() > 1e-6,
+            "RK2 must differ from Euler for both stocks \
+             (prey: euler={e_prey} rk2={rk2_prey}; predator: euler={e_pred} rk2={rk2_pred})"
+        );
+    }
+
+    /// Coverage gap closed: a TWO-STOCK COUPLED model under RK4 matches the VM
+    /// per-variable, per-chunk. The phase's other RK tests are single-stock, so
+    /// this is the only check that the four-stage stage math keeps two stocks'
+    /// `saved[i]`/`accum[i]`/`curr[off_i]` independent and iterates
+    /// `stock_offsets` in a stable order across all four stages. `checked >= 2`
+    /// pins that both stocks (not just `prey`) reached parity.
+    #[test]
+    fn compile_simulation_two_stock_coupled_rk4_matches_vm() {
+        let datamodel = lotka_volterra("lv_rk4", crate::datamodel::SimMethod::RungeKutta4);
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (two-stock RK4)");
+        let checked = assert_matches_vm(sim, &artifact);
+        // Both stocks plus the four flows and four params all match; pin >= 2 so
+        // the two coupled stocks specifically are among the compared variables.
+        assert!(
+            checked >= 2,
+            "expected to compare both prey + predator, only checked {checked}"
+        );
+        for name in ["prey", "predator"] {
+            assert!(
+                artifact.layout.var_offsets.iter().any(|(n, _)| n == name),
+                "{name} should be in the layout"
+            );
+        }
+    }
+
+    /// The RK2 (Heun) companion to `compile_simulation_two_stock_coupled_rk4_matches_vm`:
+    /// the two-stage trial step over two coupled stocks matches the VM.
+    #[test]
+    fn compile_simulation_two_stock_coupled_rk2_matches_vm() {
+        let datamodel = lotka_volterra("lv_rk2", crate::datamodel::SimMethod::RungeKutta2);
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (two-stock RK2)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(
+            checked >= 2,
+            "expected to compare both prey + predator, only checked {checked}"
+        );
+    }
+
+    /// Task 2: a model using `PREVIOUS`/`INIT` under RK4 matches the VM. The
+    /// snapshot timing is the subtle part: `prev_values` is captured AFTER the
+    /// end-of-step flows re-eval (with `curr` restored to time-`t` state), not
+    /// from a trial point. `x_prev` lags `pop`; `pop_init` reads INIT(pop).
+    #[test]
+    fn compile_simulation_rk4_with_previous_and_init_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("rk4_prev_init")
+            .with_sim_time(0.0, 10.0, 0.5)
+            .with_sim_method(crate::datamodel::SimMethod::RungeKutta4)
+            .aux("rate", "0.3", None)
+            .aux("capacity", "1000", None)
+            .stock("pop", "10", &["growth"], &[], None)
+            .flow("growth", "rate * pop * (1 - pop / capacity)", None)
+            // PREVIOUS(pop): lagged by one saved step; captured after re-eval.
+            .aux("pop_prev", "PREVIOUS(pop)", None)
+            // INIT(pop): the t0 snapshot (= 10), read from initial_values.
+            .aux("pop_init", "INIT(pop)", None)
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (RK4 + PREVIOUS/INIT)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(
+            checked >= 4,
+            "expected to compare pop + growth + pop_prev + pop_init"
+        );
+    }
+
+    /// After Task 2, RK4 (and RK2) are supported, so a model using them runs
+    /// rather than being rejected -- the inverse of the Phase-1 guard. Pinned so
+    /// a regression that re-introduced the Euler-only guard would be caught.
+    #[test]
+    fn compile_simulation_accepts_rk4() {
+        let datamodel = crate::test_common::TestProject::new("rk4_accept")
+            .with_sim_time(0.0, 5.0, 1.0)
+            .with_sim_method(crate::datamodel::SimMethod::RungeKutta4)
+            .aux("inflow_rate", "2", None)
+            .stock("level", "0", &["inflow"], &[], None)
+            .flow("inflow", "inflow_rate", None)
+            .build_datamodel();
+
+        let sim = compile_sim(&datamodel, "main");
+        compile_simulation(&sim).expect("RK4 must now be supported");
+    }
+
+    // ── Modules: EvalModule / LoadModuleInput (Phase 7 Task 1) ────────────
+    //
+    // Each unique `(model, input_set)` instance becomes its own initials/flows/
+    // stocks wasm function taking `(module_off: i32, in_0..in_{k-1}: f64)`. An
+    // `EvalModule` resolves the child instance and `call`s its function for the
+    // current `StepPart`, passing `module_off + decl.off` and the popped inputs;
+    // `LoadModuleInput` reads an input parameter. These tests assert wasm matches
+    // the VM for submodel-bearing models, including the SMOOTH stdlib macro (which
+    // expands to implicit module stocks) and the same instance at two offsets.
+
+    /// A two-model datamodel: a `main` model that instantiates `submodel`
+    /// `n_instances` times, wiring `in_value` (an aux in `main`) into each
+    /// instance's `in` input. The submodel computes `out = body` (referencing its
+    /// own `in`); `body_is_stock` makes `out` a stock integrating `body`, so the
+    /// submodel carries internal stocks reached only through `EvalModule` (the
+    /// nested-stock-offset case). `TestProject` only emits a single `main` model,
+    /// so this is built as an explicit datamodel.
+    fn submodel_project(
+        name: &str,
+        method: crate::datamodel::SimMethod,
+        in_value: &str,
+        body: &str,
+        body_is_stock: bool,
+        n_instances: usize,
+    ) -> crate::datamodel::Project {
+        use crate::datamodel;
+        let mut main_vars: Vec<datamodel::Variable> =
+            vec![datamodel::Variable::Aux(datamodel::Aux {
+                ident: "in_value".to_string(),
+                equation: datamodel::Equation::Scalar(in_value.to_string()),
+                documentation: String::new(),
+                units: None,
+                gf: None,
+                ai_state: None,
+                uid: None,
+                compat: datamodel::Compat::default(),
+            })];
+        for i in 0..n_instances {
+            let ident = format!("sub{i}");
+            main_vars.push(datamodel::Variable::Module(datamodel::Module {
+                // A module reference's `dst` is qualified with the instance name
+                // (`subN.in`), not the bare input variable; an unqualified `dst`
+                // silently fails to wire the input (the submodel's `in` keeps its
+                // default), which would make `LoadModuleInput` untested.
+                references: vec![datamodel::ModuleReference {
+                    src: "in_value".to_string(),
+                    dst: format!("{ident}.in"),
+                }],
+                ident,
+                model_name: "submodel".to_string(),
+                documentation: String::new(),
+                units: None,
+                compat: datamodel::Compat::default(),
+                ai_state: None,
+                uid: None,
+            }));
+        }
+
+        let out_var = if body_is_stock {
+            datamodel::Variable::Stock(datamodel::Stock {
+                ident: "out".to_string(),
+                equation: datamodel::Equation::Scalar("0".to_string()),
+                documentation: String::new(),
+                units: None,
+                inflows: vec!["grow".to_string()],
+                outflows: vec![],
+                ai_state: None,
+                uid: None,
+                compat: datamodel::Compat::default(),
+            })
+        } else {
+            datamodel::Variable::Aux(datamodel::Aux {
+                ident: "out".to_string(),
+                equation: datamodel::Equation::Scalar(body.to_string()),
+                documentation: String::new(),
+                units: None,
+                gf: None,
+                ai_state: None,
+                uid: None,
+                compat: datamodel::Compat::default(),
+            })
+        };
+        let mut submodel_vars = vec![
+            datamodel::Variable::Aux(datamodel::Aux {
+                ident: "in".to_string(),
+                equation: datamodel::Equation::Scalar("0".to_string()),
+                documentation: String::new(),
+                units: None,
+                gf: None,
+                ai_state: None,
+                uid: None,
+                compat: datamodel::Compat {
+                    can_be_module_input: true,
+                    ..datamodel::Compat::default()
+                },
+            }),
+            out_var,
+        ];
+        if body_is_stock {
+            submodel_vars.push(datamodel::Variable::Flow(datamodel::Flow {
+                ident: "grow".to_string(),
+                equation: datamodel::Equation::Scalar(body.to_string()),
+                documentation: String::new(),
+                units: None,
+                gf: None,
+                ai_state: None,
+                uid: None,
+                compat: datamodel::Compat::default(),
+            }));
+        }
+
+        datamodel::Project {
+            name: name.to_string(),
+            sim_specs: datamodel::SimSpecs {
+                start: 0.0,
+                stop: 5.0,
+                dt: datamodel::Dt::Dt(1.0),
+                save_step: None,
+                sim_method: method,
+                time_units: None,
+            },
+            dimensions: vec![],
+            units: vec![],
+            models: vec![
+                datamodel::Model {
+                    name: "main".to_string(),
+                    sim_specs: None,
+                    variables: main_vars,
+                    views: vec![],
+                    loop_metadata: vec![],
+                    groups: vec![],
+                    macro_spec: None,
+                },
+                datamodel::Model {
+                    name: "submodel".to_string(),
+                    sim_specs: None,
+                    variables: submodel_vars,
+                    views: vec![],
+                    loop_metadata: vec![],
+                    groups: vec![],
+                    macro_spec: None,
+                },
+            ],
+            source: Default::default(),
+            ai_information: None,
+        }
+    }
+
+    /// A two-model datamodel like [`submodel_project`], but the submodel carries
+    /// its OWN overridable constant `k` (a flows-phase `AssignConstCurr`) and
+    /// `out = in + k`. Instantiating it `n_instances` times in `main` gives each
+    /// instance a DISTINCT absolute offset for its own `k` (the recursive
+    /// `base_off + module_decl.off` addressing), so a per-instance `set_value`
+    /// override on one instance's `k` must not perturb the other. `in_value` is a
+    /// constant wired into every instance's `in`, so the only differentiator
+    /// between two instances' `out` is each instance's `k` override.
+    fn submodel_with_constant_project(
+        name: &str,
+        in_value: &str,
+        k_default: &str,
+        n_instances: usize,
+    ) -> crate::datamodel::Project {
+        use crate::datamodel;
+        let mut main_vars: Vec<datamodel::Variable> =
+            vec![datamodel::Variable::Aux(datamodel::Aux {
+                ident: "in_value".to_string(),
+                equation: datamodel::Equation::Scalar(in_value.to_string()),
+                documentation: String::new(),
+                units: None,
+                gf: None,
+                ai_state: None,
+                uid: None,
+                compat: datamodel::Compat::default(),
+            })];
+        for i in 0..n_instances {
+            let ident = format!("sub{i}");
+            main_vars.push(datamodel::Variable::Module(datamodel::Module {
+                references: vec![datamodel::ModuleReference {
+                    src: "in_value".to_string(),
+                    dst: format!("{ident}.in"),
+                }],
+                ident,
+                model_name: "submodel".to_string(),
+                documentation: String::new(),
+                units: None,
+                compat: datamodel::Compat::default(),
+                ai_state: None,
+                uid: None,
+            }));
+        }
+
+        let submodel_vars = vec![
+            datamodel::Variable::Aux(datamodel::Aux {
+                ident: "in".to_string(),
+                equation: datamodel::Equation::Scalar("0".to_string()),
+                documentation: String::new(),
+                units: None,
+                gf: None,
+                ai_state: None,
+                uid: None,
+                compat: datamodel::Compat {
+                    can_be_module_input: true,
+                    ..datamodel::Compat::default()
+                },
+            }),
+            // `k` is a bare constant, so it lowers to a flows-phase
+            // `AssignConstCurr` -- i.e. an overridable constant, distinct per
+            // instance.
+            datamodel::Variable::Aux(datamodel::Aux {
+                ident: "k".to_string(),
+                equation: datamodel::Equation::Scalar(k_default.to_string()),
+                documentation: String::new(),
+                units: None,
+                gf: None,
+                ai_state: None,
+                uid: None,
+                compat: datamodel::Compat::default(),
+            }),
+            datamodel::Variable::Aux(datamodel::Aux {
+                ident: "out".to_string(),
+                equation: datamodel::Equation::Scalar("in + k".to_string()),
+                documentation: String::new(),
+                units: None,
+                gf: None,
+                ai_state: None,
+                uid: None,
+                compat: datamodel::Compat::default(),
+            }),
+        ];
+
+        datamodel::Project {
+            name: name.to_string(),
+            sim_specs: datamodel::SimSpecs {
+                start: 0.0,
+                stop: 3.0,
+                dt: datamodel::Dt::Dt(1.0),
+                save_step: None,
+                sim_method: datamodel::SimMethod::Euler,
+                time_units: None,
+            },
+            dimensions: vec![],
+            units: vec![],
+            models: vec![
+                datamodel::Model {
+                    name: "main".to_string(),
+                    sim_specs: None,
+                    variables: main_vars,
+                    views: vec![],
+                    loop_metadata: vec![],
+                    groups: vec![],
+                    macro_spec: None,
+                },
+                datamodel::Model {
+                    name: "submodel".to_string(),
+                    sim_specs: None,
+                    variables: submodel_vars,
+                    views: vec![],
+                    loop_metadata: vec![],
+                    groups: vec![],
+                    macro_spec: None,
+                },
+            ],
+            source: Default::default(),
+            ai_information: None,
+        }
+    }
+
+    /// Task 1: a model instantiating a submodel runs through wasm and matches the
+    /// VM. The submodel's `out` depends on its `in` input (passed from `main`), so
+    /// this exercises both `EvalModule` (the child `call`) and `LoadModuleInput`
+    /// (the child reading its passed input). Previously this construct was rejected
+    /// as `submodules are not supported`.
+    #[test]
+    fn compile_simulation_submodel_matches_vm() {
+        let datamodel = submodel_project(
+            "submod",
+            crate::datamodel::SimMethod::Euler,
+            "TIME + 1",
+            "in * 2",
+            false,
+            1,
+        );
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (submodel)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(
+            checked >= 2,
+            "expected to compare main's in_value + the submodel's out, only checked {checked}"
+        );
+        // The submodel's output slot is in the single shared slab, addressed at
+        // `module_off + off`; its layout entry confirms it was emitted.
+        assert!(
+            artifact
+                .layout
+                .var_offsets
+                .iter()
+                .any(|(n, _)| n.ends_with("out")),
+            "the submodel's `out` should be in the layout"
+        );
+    }
+
+    /// Task 1: `LoadModuleInput` reads the right input. The submodel's output is
+    /// exactly its input, and `in_value` varies with TIME, so a wrong input-param
+    /// index (or a missing pass-through) would diverge from the VM immediately.
+    #[test]
+    fn compile_simulation_submodel_loadmoduleinput_reads_right_input() {
+        let datamodel = submodel_project(
+            "passthru",
+            crate::datamodel::SimMethod::Euler,
+            "TIME * 3 + 1",
+            "in", // out == in: a pure pass-through of the module input
+            false,
+            1,
+        );
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (passthrough)");
+
+        // out must equal in_value (= TIME*3+1) at every saved step.
+        let results = run_artifact_results(&artifact);
+        let n_slots = artifact.layout.n_slots;
+        let find = |needle: &str| {
+            artifact
+                .layout
+                .var_offsets
+                .iter()
+                .find(|(n, _)| n.ends_with(needle))
+                .map(|(_, o)| *o)
+                .unwrap_or_else(|| panic!("{needle} offset"))
+        };
+        let in_off = find("in_value");
+        let out_off = find("out");
+        for c in 0..artifact.layout.n_chunks {
+            let in_v = results[c * n_slots + in_off];
+            let out_v = results[c * n_slots + out_off];
+            assert!(
+                (in_v - out_v).abs() < 1e-9,
+                "submodel out must equal its passed input at chunk {c}: in={in_v} out={out_v}"
+            );
+        }
+        // And the whole model matches the VM.
+        assert_matches_vm(sim, &artifact);
+    }
+
+    /// Task 1 (the `module_off` proof): the SAME `(model, input_set)` instance,
+    /// instantiated twice in `main`, runs through wasm and matches the VM. Both
+    /// instances share one `CompiledModule` (one function triple) but run at two
+    /// different base offsets, so `module_off` must thread correctly into the
+    /// child's slab reads/writes. Each `EvalModule` passes a distinct
+    /// `module_off + decl.off`.
+    #[test]
+    fn compile_simulation_two_instances_same_module_matches_vm() {
+        let datamodel = submodel_project(
+            "twice",
+            crate::datamodel::SimMethod::Euler,
+            "TIME + 2",
+            "in * 10",
+            false,
+            2,
+        );
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (two instances)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(
+            checked >= 3,
+            "expected to compare in_value + both instances' out, only checked {checked}"
+        );
+        // Both instances' outputs occupy distinct slots in the shared slab.
+        let out_slots: Vec<usize> = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .filter(|(n, _)| n.ends_with("out"))
+            .map(|(_, o)| *o)
+            .collect();
+        assert_eq!(
+            out_slots.len(),
+            2,
+            "two instances should contribute two distinct `out` slots, got {out_slots:?}"
+        );
+        assert_ne!(
+            out_slots[0], out_slots[1],
+            "the two instances must run at different module offsets"
+        );
+    }
+
+    /// Task 1 (per-instance DISTINCT overrides -- the direct test of the
+    /// absolute-slot const-region addressing): the SAME `CompiledModule`,
+    /// instantiated twice in `main`, carries DISTINCT `set_value` overrides for
+    /// its own constant `k`. Each instance's `k` lives at a distinct absolute
+    /// offset (`base_off + module_decl.off`, the recursion in
+    /// `collect_overridable_defaults`); the wasm override region is indexed by
+    /// that absolute offset, so overriding instance 0's `k` to 100 and instance
+    /// 1's `k` to 200 makes each instance's `out = in + k` reflect ITS OWN
+    /// override. A bug that applied one override to both instances, or that
+    /// ignored `module_off` (writing both overrides to the same slot), would make
+    /// the two `out` series equal -- which the non-vacuity `assert_ne!` rejects.
+    ///
+    /// This is a wasm-only correctness property: the VM is NOT a valid cell-for-
+    /// cell oracle for *distinct* overrides of a SHARED module, because its
+    /// `set_value_by_offset` mutates the module's shared bytecode literal (one
+    /// `literal_id` for both instances, resolved through the single shared
+    /// `ModuleKey`), so the second override clobbers the first and both instances
+    /// read the last value. The wasm backend is strictly more correct here. The
+    /// VM divergence is tracked separately; this test still anchors against the
+    /// VM in the regime where they DO agree -- both instances overridden to the
+    /// SAME value (`compile_simulation_two_instances_same_value_override_matches_vm`).
+    #[test]
+    fn compile_simulation_two_instances_distinct_overrides() {
+        // `in_value` is the constant 7 wired into both instances' `in`, so the
+        // ONLY differentiator between the two instances' `out` is each instance's
+        // `k` override (default 1).
+        let datamodel = submodel_with_constant_project("distinct", "7", "1", 2);
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (distinct overrides)");
+
+        let (k0_off, k1_off) = instance_k_offsets(&artifact);
+        assert_ne!(
+            k0_off, k1_off,
+            "the two instances' `k` must occupy distinct absolute offsets"
+        );
+        assert!(
+            sim.is_constant_offset(k0_off) && sim.is_constant_offset(k1_off),
+            "each instance's `k` must be a VM-overridable constant (sub0·k={k0_off}, sub1·k={k1_off})"
+        );
+
+        // Apply DIFFERENT overrides to the two instances, then reset + run.
+        let wasm_slab = run_artifact_with_overrides(&artifact, &[(k0_off, 100.0), (k1_off, 200.0)]);
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+
+        // Non-vacuity: each instance's `out` reflects ITS OWN override, and the
+        // two genuinely DIFFER. `in_value` is 7, so sub0·out = 7 + 100 = 107 and
+        // sub1·out = 7 + 200 = 207 at every saved step. If a bug applied one
+        // override to both instances (or ignored `module_off` and wrote both to
+        // one slot), the two `out` series would be equal and this would fail.
+        let out0_off = layout_offset(&artifact, qualified_ident("sub0", "out").as_str());
+        let out1_off = layout_offset(&artifact, qualified_ident("sub1", "out").as_str());
+        for c in 0..n_chunks {
+            let out0 = wasm_slab[c * n_slots + out0_off];
+            let out1 = wasm_slab[c * n_slots + out1_off];
+            assert!(
+                (out0 - 107.0).abs() < 1e-9,
+                "sub0·out should be in_value(7)+k0(100)=107 at chunk {c}, got {out0}"
+            );
+            assert!(
+                (out1 - 207.0).abs() < 1e-9,
+                "sub1·out should be in_value(7)+k1(200)=207 at chunk {c}, got {out1}"
+            );
+            assert_ne!(
+                out0, out1,
+                "the two instances' outputs must DIFFER under distinct per-instance overrides"
+            );
+        }
+    }
+
+    /// Task 1 (VM parity anchor for the shared-module override path): overriding
+    /// BOTH instances' `k` to the SAME value matches the VM cell-for-cell. This is
+    /// the regime where the VM and wasm agree -- the VM's shared-literal clobber
+    /// (see `compile_simulation_two_instances_distinct_overrides`) is harmless
+    /// when both overrides carry the same value -- so it proves the wasm override
+    /// mechanism is faithful to the VM (not merely internally consistent) for a
+    /// shared `CompiledModule` instantiated at two `module_off`s.
+    #[test]
+    fn compile_simulation_two_instances_same_value_override_matches_vm() {
+        let datamodel = submodel_with_constant_project("same_val", "7", "1", 2);
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        let (k0_off, k1_off) = instance_k_offsets(&artifact);
+        let wasm_slab = run_artifact_with_overrides(&artifact, &[(k0_off, 300.0), (k1_off, 300.0)]);
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+
+        let mut vm = Vm::new(compile_sim(&datamodel, "main")).expect("vm creation");
+        vm.set_value_by_offset(k0_off, 300.0)
+            .expect("sub0·k must be a VM-overridable constant");
+        vm.set_value_by_offset(k1_off, 300.0)
+            .expect("sub1·k must be a VM-overridable constant");
+        vm.run_to_end().expect("vm run");
+        let vm_results = vm.into_results();
+        assert_eq!(
+            vm_results.step_count, n_chunks,
+            "saved-chunk count differs from VM"
+        );
+
+        let mut checked = 0usize;
+        for (name, wasm_off) in &artifact.layout.var_offsets {
+            let wasm_off = *wasm_off;
+            let ident = Ident::<Canonical>::from_str_unchecked(name);
+            let Some(&vm_off) = vm_results.offsets.get(&ident) else {
+                continue;
+            };
+            for c in 0..n_chunks {
+                let vm_val = vm_results.data[c * vm_results.step_size + vm_off];
+                let wasm_val = wasm_slab[c * n_slots + wasm_off];
+                assert!(
+                    (vm_val - wasm_val).abs() < 1e-9,
+                    "{name} mismatch at chunk {c} under same-value override: \
+                     vm={vm_val} wasm={wasm_val}"
+                );
+            }
+            checked += 1;
+        }
+        assert!(
+            checked >= 3,
+            "expected to compare in_value + both instances' k/out, only checked {checked}"
+        );
+        // Both instances reach 7 + 300 = 307 (the override took on both).
+        let out0_off = layout_offset(&artifact, qualified_ident("sub0", "out").as_str());
+        let out1_off = layout_offset(&artifact, qualified_ident("sub1", "out").as_str());
+        assert!(
+            (wasm_slab[out0_off] - 307.0).abs() < 1e-9
+                && (wasm_slab[out1_off] - 307.0).abs() < 1e-9,
+            "both instances should reach 7+300=307 under the shared override"
+        );
+    }
+
+    /// Task 1 (nested stocks under Euler): a submodel whose `out` is a stock
+    /// integrating a flow that depends on its `in` input. The submodel's internal
+    /// stock is reached only through `EvalModule`, and its offset must be picked
+    /// up by the recursive stock-offset collection so the Euler advance copies it
+    /// `next -> curr`. The wasm must match the VM.
+    #[test]
+    fn compile_simulation_submodel_nested_stock_euler_matches_vm() {
+        let datamodel = submodel_project(
+            "nested_stock",
+            crate::datamodel::SimMethod::Euler,
+            "2",
+            "in", // grow = in (= 2); out integrates by 2 each step
+            true,
+            1,
+        );
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (nested stock)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(
+            checked >= 2,
+            "expected to compare in_value + nested out stock"
+        );
+        // Pin the nested stock's value so this can't pass vacuously with an
+        // un-wired input (`in` defaulting to 0). `grow = in = 2` integrates the
+        // nested `out` stock by 2 each of the 5 Euler steps -> 10.
+        let results = run_artifact_results(&artifact);
+        let n_slots = artifact.layout.n_slots;
+        let out_off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n.ends_with("out"))
+            .map(|(_, o)| *o)
+            .expect("nested out offset");
+        let last = (artifact.layout.n_chunks - 1) * n_slots + out_off;
+        assert!(
+            (results[last] - 10.0).abs() < 1e-9,
+            "nested out stock should integrate to 2*5 = 10, got {}",
+            results[last]
+        );
+    }
+
+    /// Task 1 (nested stocks under RK4): the same nested-stock submodel under RK4.
+    /// The recursive stock-offset collection must feed the RK stage math (saved/
+    /// accum scratch indexed by stock position) the submodel's internal stock, so
+    /// the four-stage integration covers nested stocks. The wasm must match the VM.
+    #[test]
+    fn compile_simulation_submodel_nested_stock_rk4_matches_vm() {
+        // A nonlinear flow so RK genuinely differs from Euler: grow = in - out/10,
+        // a first-order approach to a steady state, evaluated at trial points.
+        let datamodel = submodel_project(
+            "nested_stock_rk4",
+            crate::datamodel::SimMethod::RungeKutta4,
+            "5",
+            "in - out / 10",
+            true,
+            1,
+        );
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (nested stock RK4)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(
+            checked >= 2,
+            "expected to compare in_value + nested out stock"
+        );
+    }
+
+    /// Task 1 (stdlib macro -> implicit module stocks): `SMTH1(input, delay)`
+    /// expands to a stdlib `smth1` submodule carrying an internal SMOOTH stock.
+    /// The whole model must match the VM, proving the implicit-module path (the
+    /// stdlib instance's own `ByteCodeContext`, its nested stock under the RK/Euler
+    /// loop, and the `EvalModule`/`LoadModuleInput` wiring) reproduces the VM.
+    /// `SMTH1` was the canonical still-`Skipped` construct before this task.
+    ///
+    /// A NaN-aware comparison: the stdlib `smth1` instance carries an internal
+    /// `initial_value` helper slot that is NaN at the t=0 results snapshot in
+    /// *both* the VM and wasm (it is not written into `curr` before the forced
+    /// t=0 save), so a finite-difference compare would spuriously fail on a
+    /// faithful NaN==NaN match. Every user-visible variable (`input`,
+    /// `smoothed`) is finite and compared exactly.
+    #[test]
+    fn compile_simulation_smooth_macro_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("smooth")
+            .with_sim_time(0.0, 8.0, 0.25)
+            .aux("input", "TIME", None)
+            .aux("smoothed", "SMTH1(input, 2)", None)
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (SMTH1)");
+        // Pin that `smoothed` is finite and nonzero at the last step, so the
+        // NaN-aware comparison cannot pass vacuously (an all-NaN `smoothed` would
+        // satisfy NaN==NaN). A 2-unit smoothing of `input = TIME` reaches a
+        // meaningful positive value by t=8.
+        let results = run_artifact_results(&artifact);
+        let n_slots = artifact.layout.n_slots;
+        let smoothed_off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == "smoothed")
+            .map(|(_, o)| *o)
+            .expect("smoothed offset");
+        let last = (artifact.layout.n_chunks - 1) * n_slots + smoothed_off;
+        assert!(
+            results[last].is_finite() && results[last] > 0.0,
+            "smoothed should be finite and positive by the last step, got {}",
+            results[last]
+        );
+        let checked = assert_matches_vm_nan_aware(sim, &artifact);
+        assert!(
+            checked >= 2,
+            "expected to compare input + smoothed, only checked {checked}"
+        );
+    }
+
+    /// Task 1 (DELAY stdlib macro under RK4): `DELAY3` expands to a stdlib
+    /// submodule with three chained internal SMOOTH stocks, exercising a deeper
+    /// nested-stock chain under the RK4 stage math. The wasm must match the VM.
+    /// NaN-aware for the same internal-`initial_value` reason as the SMTH1 test.
+    #[test]
+    fn compile_simulation_delay3_macro_rk4_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("delay3")
+            .with_sim_time(0.0, 8.0, 0.25)
+            .with_sim_method(crate::datamodel::SimMethod::RungeKutta4)
+            .aux("input", "TIME", None)
+            .aux("delayed", "DELAY3(input, 2)", None)
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (DELAY3 RK4)");
+        let checked = assert_matches_vm_nan_aware(sim, &artifact);
+        assert!(
+            checked >= 2,
+            "expected to compare input + delayed, only checked {checked}"
+        );
+    }
+
+    /// AC4.1: a host reads the three exported geometry globals from the
+    /// instantiated module and uses them (no external metadata) to stride one
+    /// variable's series, which must match the VM.
+    #[test]
+    fn compile_simulation_exports_self_describing_geometry() {
+        let file = std::fs::File::open(POPULATION_XMILE).expect("open population model");
+        let mut reader = BufReader::new(file);
+        let datamodel = open_xmile(&mut reader).expect("parse population xmile");
+
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        let info = validate(&artifact.wasm).expect("module must validate");
+        let mut store = Store::new(());
+        let inst = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+
+        // Read the three i32 geometry globals straight from the module.
+        let read_global = |store: &mut Store<()>, name: &str| -> usize {
+            let g = store
+                .instance_export(inst, name)
+                .unwrap()
+                .as_global()
+                .unwrap();
+            match store.global_read(g) {
+                checked::StoredValue::I32(x) => x as usize,
+                other => panic!("expected i32 global, got {other:?}"),
+            }
+        };
+        let n_slots = read_global(&mut store, "n_slots");
+        let n_chunks = read_global(&mut store, "n_chunks");
+        let results_offset = read_global(&mut store, "results_offset");
+
+        // They equal the layout values.
+        assert_eq!(n_slots, artifact.layout.n_slots);
+        assert_eq!(n_chunks, artifact.layout.n_chunks);
+        assert_eq!(results_offset, artifact.layout.results_offset);
+
+        // Stride to the population series using only module-reported geometry.
+        let run = store
+            .instance_export(inst, "run")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        store
+            .invoke_simple_typed::<(), ()>(run, ())
+            .expect("run wasm");
+        let mem = store
+            .instance_export(inst, "memory")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        let pop_off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == "population")
+            .map(|(_, off)| *off)
+            .expect("population offset");
+        let pop_series: Vec<f64> = store.mem_access_mut_slice(mem, |bytes| {
+            (0..n_chunks)
+                .map(|c| {
+                    let a = results_offset + (c * n_slots + pop_off) * 8;
+                    f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
+                })
+                .collect()
+        });
+
+        let mut vm = Vm::new(sim).expect("vm");
+        vm.run_to_end().expect("vm run");
+        let vm_results = vm.into_results();
+        let pop = Ident::<Canonical>::from_str_unchecked("population");
+        let vm_pop_off = *vm_results.offsets.get(&pop).expect("vm population offset");
+        for (c, &wasm_val) in pop_series.iter().enumerate() {
+            let vm_val = vm_results.data[c * vm_results.step_size + vm_pop_off];
+            assert!(
+                (vm_val - wasm_val).abs() < 1e-9,
+                "population mismatch at chunk {c}: vm={vm_val} wasm={wasm_val}"
+            );
+        }
+    }
+
+    // ── Array reducers end-to-end (Phase 5 Tasks 1-2) ─────────────────────
+    //
+    // These compile real reducer models through the production salsa pipeline
+    // (so the bytecode is the genuine `PushStaticView; Array<Reduce>; PopView`
+    // codegen emits, with all constant subscripts baked into the static view)
+    // and assert the wasm matches the VM. They are the gold-standard parity
+    // checks for Tasks 1-2; the inline `lower.rs` unit tests pin the individual
+    // view ops against the VM's addressing oracle.
+
+    /// Assert a single scalar variable's wasm series matches the VM, allowing a
+    /// NaN-vs-NaN match (`assert_matches_vm` rejects NaN via its abs-diff
+    /// tolerance, so the empty-view / OOB reducers need this NaN-aware variant).
+    fn assert_scalar_matches_vm(sim: CompiledSimulation, artifact: &WasmArtifact, name: &str) {
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+        let wasm_data = run_artifact_results(artifact);
+
+        let mut vm = Vm::new(sim).expect("vm creation");
+        vm.run_to_end().expect("vm run");
+        let vm_results = vm.into_results();
+
+        let wasm_off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == name)
+            .map(|(_, off)| *off)
+            .unwrap_or_else(|| panic!("{name} not in wasm layout"));
+        let ident = Ident::<Canonical>::from_str_unchecked(name);
+        let vm_off = *vm_results
+            .offsets
+            .get(&ident)
+            .unwrap_or_else(|| panic!("{name} not in vm offsets"));
+
+        for c in 0..n_chunks {
+            let vm_val = vm_results.data[c * vm_results.step_size + vm_off];
+            let wasm_val = wasm_data[c * n_slots + wasm_off];
+            if vm_val.is_nan() {
+                assert!(
+                    wasm_val.is_nan(),
+                    "{name} chunk {c}: vm=NaN but wasm={wasm_val}"
+                );
+            } else {
+                assert!(
+                    (vm_val - wasm_val).abs() < 1e-9,
+                    "{name} chunk {c}: vm={vm_val} wasm={wasm_val}"
+                );
+            }
+        }
+    }
+
+    /// A 1-D `SUM(source[3:5])` over an indexed dimension: a range subscript that
+    /// codegen bakes into a static view with `offset=2`, `dims=[3]`. The whole
+    /// model (including the arrayed `source`) must match the VM.
+    #[test]
+    fn compile_simulation_sum_range_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("sum_range")
+            .with_sim_time(0.0, 3.0, 1.0)
+            .indexed_dimension("A", 5)
+            .array_aux("source[A]", "3 * A + 1")
+            .scalar_aux("total", "SUM(source[3:5])")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (SUM range)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 1, "expected to compare source elements + total");
+    }
+
+    /// `SUM(values[*:SubA])` (star-range) selects a sparse subset of a named
+    /// dimension's elements; codegen bakes the sparse mapping into the static
+    /// view, exercising the sparse addressing path against the VM. (A transposed
+    /// reducer like `SUM(matrix')` instead hoists into a `BeginIter` temp-copy
+    /// loop, so it lands in Phase 5 Task 3; the transpose `ViewDesc` transform
+    /// itself is pinned by `lower.rs`'s `view_transpose_then_reduce_matches_vm`.)
+    #[test]
+    fn compile_simulation_sum_star_range_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("sum_star_range")
+            .with_sim_time(0.0, 2.0, 1.0)
+            .named_dimension("DimA", &["A1", "A2", "A3", "A4"])
+            .named_dimension("SubA", &["A2", "A3"])
+            .array_with_ranges(
+                "values[DimA]",
+                vec![("A1", "10"), ("A2", "20"), ("A3", "30"), ("A4", "40")],
+            )
+            .scalar_aux("total", "SUM(values[*:SubA])")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (SUM star range)");
+        // The whole model (including the sparse-selected `total` = A2+A3 = 50)
+        // matches the VM element-for-element.
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 1);
+        // Independently pin the sparse selection value against the VM.
+        let sim2 = compile_sim(&datamodel, "main");
+        assert_scalar_matches_vm(sim2, &artifact, "total");
+    }
+
+    /// A per-element sliced reducer `msum[D] = SUM(m[D, *])` over a 2-D array.
+    /// Each output element is its own `PushStaticView; ArraySum; PopView` over a
+    /// per-row static view (the A2A target unrolls to per-element bytecode).
+    #[test]
+    fn compile_simulation_sliced_row_sum_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("row_sum")
+            .with_sim_time(0.0, 2.0, 1.0)
+            .indexed_dimension("D", 2)
+            .indexed_dimension("E", 3)
+            .array_aux("m[D, E]", "10 * D + E")
+            .array_aux("msum[D]", "SUM(m[D, *])")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (row sum)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(
+            checked >= 1,
+            "expected to compare m elements + msum elements"
+        );
+    }
+
+    /// MEAN / STDDEV / MAX / MIN / SIZE over a range slice, each matching the VM.
+    /// One model carries all five so a single compile exercises every reducer's
+    /// production lowering.
+    #[test]
+    fn compile_simulation_all_reducers_match_vm() {
+        let datamodel = crate::test_common::TestProject::new("all_reducers")
+            .with_sim_time(0.0, 2.0, 1.0)
+            .indexed_dimension("A", 5)
+            .array_aux("source[A]", "2 * A")
+            .scalar_aux("mean_val", "MEAN(source[2:4])")
+            .scalar_aux("stddev_val", "STDDEV(source[1:5])")
+            .scalar_aux("max_val", "MAX(source[2:4])")
+            .scalar_aux("min_val", "MIN(source[2:4])")
+            .scalar_aux("size_val", "SIZE(source[2:4])")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (all reducers)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 5, "expected to compare all five reducer results");
+        for name in ["mean_val", "stddev_val", "max_val", "min_val", "size_val"] {
+            assert!(
+                artifact.layout.var_offsets.iter().any(|(n, _)| n == name),
+                "{name} should be in the layout"
+            );
+        }
+    }
+
+    // The empty-but-valid view reducer asymmetry (SUM->0.0 vs others->NaN) and
+    // the invalid-view->NaN-for-all asymmetry are pinned directly against the
+    // VM's `reduce_view` semantics by the inline `lower.rs` unit tests
+    // (`empty_valid_view_*` / `invalid_view_*`): a literal empty range
+    // (`source[4:3]`) is rejected at compile time, and a runtime-empty range
+    // (`source[start:end]` with `start > end`) plus an out-of-bounds dynamic
+    // subscript both go through `ViewRangeDynamic` / `ViewSubscriptDynamic`,
+    // which are Phase 5 Task 4, so the end-to-end coverage of those cases lands
+    // there.
+
+    // ── Phase 5 Task 3: BeginIter iteration loops (end-to-end) ────────────
+    //
+    // The broadcasting `LoadIterViewAt` path (source dims != iter dims) and the
+    // standalone `BeginBroadcastIter` family are not reachable through the
+    // current production codegen (an A2A elementwise op is scalar-unrolled, and a
+    // mismatched-dim reducer argument fails the engine's own dimension check), so
+    // those are pinned directly against the VM by hand-built-bytecode unit tests
+    // in `lower.rs` (`iter_loop_*` / `broadcast_iter_*`). The two reachable
+    // shapes -- a hoisted same-dim reducer loop and the deferred transpose
+    // reducer -- are covered end-to-end here.
+
+    /// `SUM(2 * source[3:5] + 1)`: the elementwise expression is hoisted into an
+    /// `AssignTemp` `BeginIter` loop (codegen.rs:1183-1378), then `SUM` reduces
+    /// the temp. The whole-model wasm must match the VM element-for-element.
+    #[test]
+    fn compile_simulation_hoisted_reducer_loop_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("hoist")
+            .with_sim_time(0.0, 2.0, 1.0)
+            .indexed_dimension("A", 5)
+            .array_aux("source[A]", "A")
+            .scalar_aux("summed", "SUM(2 * source[3:5] + 1)")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (hoisted reducer)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 1, "expected to compare summed");
+    }
+
+    /// `SUM(matrix')`: the transpose materializes the transposed matrix into a
+    /// temp via a `BeginIter` loop reading the (transposed) source through
+    /// `LoadIterViewAt`, then sums the temp. This is the case Subcomponent A
+    /// deferred to the iteration task; the wasm must match the VM.
+    #[test]
+    fn compile_simulation_transpose_reducer_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("transpose")
+            .with_sim_time(0.0, 2.0, 1.0)
+            .indexed_dimension("A", 2)
+            .indexed_dimension("B", 3)
+            .array_aux("matrix[A,B]", "A * 10 + B")
+            .scalar_aux("summed", "SUM(matrix')")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen (transpose)");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 1, "expected to compare summed");
+    }
+
+    // ── Phase 5 Task 4: dynamic subscripts + OOB->NaN (end-to-end) ────────
+
+    /// Assert every layout variable matches the VM, treating a NaN on both sides
+    /// as equal (the OOB-subscript result). The plain `assert_matches_vm` uses a
+    /// finite-difference compare that a NaN would fail, so the OOB tests use this.
+    fn assert_matches_vm_nan_aware(sim: CompiledSimulation, artifact: &WasmArtifact) -> usize {
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+        let wasm_data = run_artifact_results(artifact);
+        let mut vm = Vm::new(sim).expect("vm creation");
+        vm.run_to_end().expect("vm run");
+        let vm_results = vm.into_results();
+        assert_eq!(vm_results.step_count, n_chunks, "saved-chunk count differs");
+
+        let mut checked = 0usize;
+        for (name, wasm_off) in &artifact.layout.var_offsets {
+            let ident = Ident::<Canonical>::from_str_unchecked(name);
+            let Some(&vm_off) = vm_results.offsets.get(&ident) else {
+                continue;
+            };
+            for c in 0..n_chunks {
+                let vm_val = vm_results.data[c * vm_results.step_size + vm_off];
+                let wasm_val = wasm_data[c * n_slots + *wasm_off];
+                if vm_val.is_nan() {
+                    assert!(
+                        wasm_val.is_nan(),
+                        "{name} chunk {c}: vm=NaN but wasm={wasm_val}"
+                    );
+                } else {
+                    let diff = (vm_val - wasm_val).abs();
+                    assert!(diff < 1e-9, "{name} chunk {c}: vm={vm_val} wasm={wasm_val}");
+                }
+            }
+            checked += 1;
+        }
+        checked
+    }
+
+    /// Legacy scalar dynamic subscript `arr[idx]` (`PushSubscriptIndex` /
+    /// `LoadSubscript`), in range: the wasm must match the VM.
+    #[test]
+    fn compile_simulation_scalar_dynamic_subscript_in_range_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("dyn")
+            .with_sim_time(0.0, 2.0, 1.0)
+            .indexed_dimension("A", 4)
+            .array_aux("arr[A]", "A * 10")
+            .scalar_aux("idx", "3")
+            .scalar_aux("picked", "arr[idx]")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 1, "expected to compare picked");
+    }
+
+    /// Legacy scalar dynamic subscript `arr[idx]` out of range -> NaN, matching
+    /// the VM (`vm.rs:1343` sets the subscript invalid, `1361` pushes NaN).
+    #[test]
+    fn compile_simulation_scalar_dynamic_subscript_oob_is_nan() {
+        // idx = 99 is well past the 4-element dimension -> NaN on both backends.
+        let datamodel = crate::test_common::TestProject::new("dyn_oob")
+            .with_sim_time(0.0, 2.0, 1.0)
+            .indexed_dimension("A", 4)
+            .array_aux("arr[A]", "A * 10")
+            .scalar_aux("idx", "99")
+            .scalar_aux("picked", "arr[idx]")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let checked = assert_matches_vm_nan_aware(sim, &artifact);
+        assert!(checked >= 1, "expected to compare picked");
+
+        // Pin the NaN directly: `picked` must be NaN at every step.
+        let n_slots = artifact.layout.n_slots;
+        let off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == "picked")
+            .map(|(_, o)| *o)
+            .expect("picked offset");
+        let data = run_artifact_results(&artifact);
+        for c in 0..artifact.layout.n_chunks {
+            assert!(
+                data[c * n_slots + off].is_nan(),
+                "out-of-bounds arr[idx] must be NaN at chunk {c}"
+            );
+        }
+    }
+
+    /// `ViewSubscriptDynamic` via `SUM(mat[row, 1])`: a dynamically-subscripted
+    /// view reduced to a scalar. In range, wasm matches the VM.
+    #[test]
+    fn compile_simulation_view_dynamic_subscript_in_range_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("vdyn")
+            .with_sim_time(0.0, 2.0, 1.0)
+            .indexed_dimension("A", 3)
+            .indexed_dimension("B", 4)
+            .array_aux("mat[A,B]", "A * 10 + B")
+            .scalar_aux("row", "2")
+            .scalar_aux("picked", "SUM(mat[row, 1])")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let checked = assert_matches_vm(sim, &artifact);
+        assert!(checked >= 1, "expected to compare picked");
+    }
+
+    /// `ViewSubscriptDynamic` out of range -> the view is invalid -> the reducer
+    /// yields NaN for *all* reducers, matching `reduce_view`'s `if !is_valid`.
+    #[test]
+    fn compile_simulation_view_dynamic_subscript_oob_is_nan() {
+        let datamodel = crate::test_common::TestProject::new("vdyn_oob")
+            .with_sim_time(0.0, 2.0, 1.0)
+            .indexed_dimension("A", 3)
+            .indexed_dimension("B", 4)
+            .array_aux("mat[A,B]", "A * 10 + B")
+            .scalar_aux("row", "99") // out of range for dim A (size 3)
+            .scalar_aux("picked", "SUM(mat[row, 1])")
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let checked = assert_matches_vm_nan_aware(sim, &artifact);
+        assert!(checked >= 1, "expected to compare picked");
+
+        let n_slots = artifact.layout.n_slots;
+        let off = artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == "picked")
+            .map(|(_, o)| *o)
+            .expect("picked offset");
+        let data = run_artifact_results(&artifact);
+        for c in 0..artifact.layout.n_chunks {
+            assert!(
+                data[c * n_slots + off].is_nan(),
+                "out-of-bounds SUM(mat[row,1]) must be NaN at chunk {c}"
+            );
+        }
+    }
+
+    /// AC4.2: a by-name series read strides the results slab using only the
+    /// layout's `n_slots`/`results_offset` + the variable's offset, copies exactly
+    /// `n_chunks` values (never the whole `n_chunks * n_slots` slab), and equals
+    /// the VM's `get_series` for that variable. This is the read pattern a host
+    /// performs over the blob's results region (the FFI returns the same layout).
+    #[test]
+    fn by_name_series_read_strides_slab_and_matches_vm_get_series() {
+        let file = std::fs::File::open(POPULATION_XMILE).expect("open population model");
+        let mut reader = BufReader::new(file);
+        let datamodel = open_xmile(&mut reader).expect("parse population xmile");
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+        let results_offset = artifact.layout.results_offset;
+        let pop_off = layout_offset(&artifact, "population");
+
+        // Run the blob and read the whole results region once (the host would map
+        // the module's memory; here we copy it out).
+        let slab = run_artifact_results(&artifact);
+
+        // Stride out ONLY `population`'s series: exactly `n_chunks` reads at
+        // `results_offset/8 + c*n_slots + off` (the slab is f64-indexed here).
+        let _ = results_offset; // documents the byte base; `slab` already starts at it
+        let mut series = Vec::with_capacity(n_chunks);
+        for c in 0..n_chunks {
+            series.push(slab[c * n_slots + pop_off]);
+        }
+        assert_eq!(
+            series.len(),
+            n_chunks,
+            "a by-name read copies exactly n_chunks values, not the whole slab"
+        );
+        assert!(
+            n_slots > 1,
+            "the model must have >1 slot so striding (not a full copy) is meaningful"
+        );
+
+        // It equals the VM's get_series for the same variable.
+        let mut vm = Vm::new(sim).expect("vm");
+        vm.run_to_end().expect("vm run");
+        let pop = Ident::<Canonical>::from_str_unchecked("population");
+        let vm_series = vm.get_series(&pop).expect("vm get_series(population)");
+        assert_eq!(
+            vm_series.len(),
+            series.len(),
+            "series length matches the VM"
+        );
+        for (c, (&w, &v)) in series.iter().zip(vm_series.iter()).enumerate() {
+            assert!(
+                (w - v).abs() < 1e-9,
+                "population chunk {c}: striped wasm read {w} != vm get_series {v}"
+            );
+        }
+    }
+
+    // ── set_value / reset override mechanism (Phase 7 Task 2) ─────────────
+    //
+    // An exported `set_value(offset, val) -> i32` writes the override into the
+    // constants region (0 ok / nonzero when `offset` is not overridable), an
+    // exported `reset()` resets run state without clearing the region (overrides
+    // persist across reset, like the VM), and the next `run` re-runs initials +
+    // the loop sourcing the overridable `AssignConstCurr` from the region.
+    // `clear_values()` restores compiled defaults. These mirror the VM's
+    // `set_value_by_offset`/`reset`/`clear_values` (`vm.rs:976-1062`).
+
+    /// Instantiate `artifact.wasm`, optionally apply a list of `(offset, value)`
+    /// overrides via the exported `set_value`, call `reset` then `run`, and copy
+    /// the step-major results slab out. Each `set_value` return code is checked to
+    /// be 0 (the caller passes only overridable offsets). Returns the slab.
+    fn run_artifact_with_overrides(
+        artifact: &WasmArtifact,
+        overrides: &[(usize, f64)],
+    ) -> Vec<f64> {
+        let info = validate(&artifact.wasm).expect("module must validate");
+        let mut store = Store::new(());
+        let inst = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        let set_value = store
+            .instance_export(inst, "set_value")
+            .expect("set_value export")
+            .as_func()
+            .expect("set_value is a function");
+        for &(off, val) in overrides {
+            let rc: i32 = store
+                .invoke_simple_typed::<(i32, f64), i32>(set_value, (off as i32, val))
+                .expect("set_value invoke");
+            assert_eq!(
+                rc, 0,
+                "set_value({off}, {val}) should accept an overridable offset"
+            );
+        }
+        let reset = store
+            .instance_export(inst, "reset")
+            .expect("reset export")
+            .as_func()
+            .expect("reset is a function");
+        store
+            .invoke_simple_typed::<(), ()>(reset, ())
+            .expect("reset invoke");
+        let run = store
+            .instance_export(inst, "run")
+            .expect("run export")
+            .as_func()
+            .expect("run is a function");
+        store
+            .invoke_simple_typed::<(), ()>(run, ())
+            .expect("run invoke");
+        let mem = store
+            .instance_export(inst, "memory")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        let n = artifact.layout.n_chunks * artifact.layout.n_slots;
+        let base = artifact.layout.results_offset;
+        store.mem_access_mut_slice(mem, |bytes| {
+            (0..n)
+                .map(|i| {
+                    let a = base + i * 8;
+                    f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
+                })
+                .collect()
+        })
+    }
+
+    /// Call the exported `set_value` once on a freshly-instantiated module and
+    /// return its i32 return code, without running the simulation. Used to assert
+    /// the validation behavior (nonzero on a non-overridable offset).
+    fn set_value_rc(artifact: &WasmArtifact, off: i32, val: f64) -> i32 {
+        let info = validate(&artifact.wasm).expect("module must validate");
+        let mut store = Store::new(());
+        let inst = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        let set_value = store
+            .instance_export(inst, "set_value")
+            .expect("set_value export")
+            .as_func()
+            .expect("set_value is a function");
+        store
+            .invoke_simple_typed::<(i32, f64), i32>(set_value, (off, val))
+            .expect("set_value invoke")
+    }
+
+    /// The absolute slab offset of `name` in the artifact's layout.
+    fn layout_offset(artifact: &WasmArtifact, name: &str) -> usize {
+        artifact
+            .layout
+            .var_offsets
+            .iter()
+            .find(|(n, _)| n == name)
+            .map(|(_, o)| *o)
+            .unwrap_or_else(|| panic!("{name} offset"))
+    }
+
+    /// The canonical qualified ident for a sub-model `instance`'s sub-variable
+    /// `var` (`Ident::join`, the U+00B7 module-hierarchy separator), e.g.
+    /// `sub0·k`. Built the same way `calc_flattened_offsets_incremental` keys the
+    /// layout, so it stays correct if the separator ever changes.
+    fn qualified_ident(instance: &str, var: &str) -> Ident<Canonical> {
+        Ident::<Canonical>::join(
+            &Ident::<Canonical>::new(instance).as_canonical_str(),
+            &Ident::<Canonical>::new(var).as_canonical_str(),
+        )
+    }
+
+    /// The absolute slab offsets of the two `submodel_with_constant_project`
+    /// instances' own constant `k` (`sub0·k`, `sub1·k`). These are distinct
+    /// because `calc_flattened_offsets_incremental` advances the base offset per
+    /// instance, mirroring the VM's `collect_constant_info` recursion.
+    fn instance_k_offsets(artifact: &WasmArtifact) -> (usize, usize) {
+        (
+            layout_offset(artifact, qualified_ident("sub0", "k").as_str()),
+            layout_offset(artifact, qualified_ident("sub1", "k").as_str()),
+        )
+    }
+
+    /// A VM run of `sim` with an override applied at absolute `off` (the VM's
+    /// `set_value_by_offset`), returning that variable's slab so wasm overrides
+    /// can be compared cell-for-cell against the VM oracle.
+    fn vm_results_with_override(
+        sim: CompiledSimulation,
+        off: usize,
+        val: f64,
+    ) -> (Vec<f64>, usize, usize) {
+        let mut vm = Vm::new(sim).expect("vm creation");
+        vm.set_value_by_offset(off, val)
+            .expect("offset must be a VM-overridable constant");
+        vm.run_to_end().expect("vm run");
+        let results = vm.into_results();
+        (results.data.to_vec(), results.step_size, results.step_count)
+    }
+
+    /// AC5.1: overriding a constant via `set_value`, then `reset`, then `run`,
+    /// yields the same series the VM produces under the same override. A constant
+    /// aux feeds a flow that integrates a stock, so the override propagates into
+    /// every downstream value at every step -- a wrong source (or an override that
+    /// did not take) would diverge from the VM immediately.
+    #[test]
+    fn compile_simulation_set_value_override_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("override")
+            .with_sim_time(0.0, 5.0, 1.0)
+            .aux("inflow_rate", "2", None)
+            .stock("level", "0", &["inflow"], &[], None)
+            .flow("inflow", "inflow_rate", None)
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        let rate_off = layout_offset(&artifact, "inflow_rate");
+        assert!(
+            sim.is_constant_offset(rate_off),
+            "inflow_rate must be a VM-overridable constant for this test to be meaningful"
+        );
+
+        // Override the constant inflow_rate to 5 (was 2), so level integrates by
+        // 5/step: 0,5,10,...,25 -- visibly different from the default 0,2,...,10.
+        let wasm_slab = run_artifact_with_overrides(&artifact, &[(rate_off, 5.0)]);
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+
+        let sim_vm = compile_sim(&datamodel, "main");
+        let (vm_data, vm_step_size, vm_step_count) =
+            vm_results_with_override(sim_vm, rate_off, 5.0);
+        assert_eq!(vm_step_count, n_chunks, "saved-chunk count differs from VM");
+
+        let mut checked = 0usize;
+        for (name, wasm_off) in &artifact.layout.var_offsets {
+            let wasm_off = *wasm_off;
+            let ident = Ident::<Canonical>::from_str_unchecked(name);
+            // Index the VM slab with the VM's own offset for this variable. It
+            // equals `wasm_off` (both backends derive offsets from
+            // `calc_flattened_offsets_incremental`), so this also skips the
+            // implicit globals the layout carries but the VM offsets map omits.
+            let vm_off = match sim.get_offset(&ident) {
+                Some(o) => o,
+                None => continue,
+            };
+            for c in 0..n_chunks {
+                let vm_val = vm_data[c * vm_step_size + vm_off];
+                let wasm_val = wasm_slab[c * n_slots + wasm_off];
+                assert!(
+                    (vm_val - wasm_val).abs() < 1e-9,
+                    "{name} mismatch at chunk {c} under override: vm={vm_val} wasm={wasm_val}"
+                );
+            }
+            checked += 1;
+        }
+        assert!(
+            checked >= 2,
+            "expected to compare inflow_rate + level + inflow"
+        );
+
+        // Pin the override actually took: level reaches 5*5 = 25 (not the default
+        // 10), so this cannot pass vacuously with an ignored override.
+        let level_off = layout_offset(&artifact, "level");
+        let last = (n_chunks - 1) * n_slots + level_off;
+        assert!(
+            (wasm_slab[last] - 25.0).abs() < 1e-9,
+            "level under inflow_rate=5 should reach 25, got {}",
+            wasm_slab[last]
+        );
+    }
+
+    /// AC5.2: `reset` with no override reproduces the compiled-default series. A
+    /// `set_value`-then-reset-then-run with an empty override list must match a
+    /// plain VM run (the default literals), proving the constants region is
+    /// initialized to the compiled defaults and `reset` leaves them intact.
+    #[test]
+    fn compile_simulation_reset_no_override_restores_defaults() {
+        let datamodel = crate::test_common::TestProject::new("defaults")
+            .with_sim_time(0.0, 5.0, 1.0)
+            .aux("inflow_rate", "2", None)
+            .stock("level", "0", &["inflow"], &[], None)
+            .flow("inflow", "inflow_rate", None)
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        let wasm_slab = run_artifact_with_overrides(&artifact, &[]);
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+
+        // The default run: level integrates by 2/step -> reaches 10.
+        let mut vm = Vm::new(compile_sim(&datamodel, "main")).expect("vm");
+        vm.run_to_end().expect("vm run");
+        let vm_results = vm.into_results();
+        for (name, wasm_off) in &artifact.layout.var_offsets {
+            let wasm_off = *wasm_off;
+            let ident = Ident::<Canonical>::from_str_unchecked(name);
+            let Some(&vm_off) = vm_results.offsets.get(&ident) else {
+                continue;
+            };
+            for c in 0..n_chunks {
+                let vm_val = vm_results.data[c * vm_results.step_size + vm_off];
+                let wasm_val = wasm_slab[c * n_slots + wasm_off];
+                assert!(
+                    (vm_val - wasm_val).abs() < 1e-9,
+                    "{name} default mismatch at chunk {c}: vm={vm_val} wasm={wasm_val}"
+                );
+            }
+        }
+        let level_off = layout_offset(&artifact, "level");
+        let last = (n_chunks - 1) * n_slots + level_off;
+        assert!(
+            (wasm_slab[last] - 10.0).abs() < 1e-9,
+            "default level should reach 10, got {}",
+            wasm_slab[last]
+        );
+    }
+
+    /// `set_value` on a non-constant offset returns the error code and does not
+    /// write. A stock's offset (`level`) is not an overridable constant (its
+    /// initial is a constant, but it is assigned via `AssignNext`, not an
+    /// `AssignConstCurr` in flows), so `set_value` must reject it. After the
+    /// rejected call the default run must be unchanged.
+    #[test]
+    fn compile_simulation_set_value_rejects_non_constant_offset() {
+        let datamodel = crate::test_common::TestProject::new("reject")
+            .with_sim_time(0.0, 5.0, 1.0)
+            .aux("inflow_rate", "2", None)
+            .stock("level", "0", &["inflow"], &[], None)
+            .flow("inflow", "inflow_rate", None)
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+
+        let level_off = layout_offset(&artifact, "level");
+        assert!(
+            !sim.is_constant_offset(level_off),
+            "level (a stock) must not be a VM-overridable constant"
+        );
+        // A non-overridable offset returns nonzero.
+        assert_ne!(
+            set_value_rc(&artifact, level_off as i32, 999.0),
+            0,
+            "set_value on a stock offset must return a nonzero error code"
+        );
+        // An out-of-range offset (>= n_slots) also returns nonzero.
+        assert_ne!(
+            set_value_rc(&artifact, artifact.layout.n_slots as i32, 1.0),
+            0,
+            "set_value on an out-of-range offset must return a nonzero error code"
+        );
+        assert_ne!(
+            set_value_rc(&artifact, -1, 1.0),
+            0,
+            "set_value on a negative offset must return a nonzero error code"
+        );
+
+        // The rejected write left the constants region untouched: a no-override
+        // run still reproduces the defaults (level reaches 10, not 999-driven).
+        let wasm_slab = run_artifact_with_overrides(&artifact, &[]);
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+        let last = (n_chunks - 1) * n_slots + level_off;
+        assert!(
+            (wasm_slab[last] - 10.0).abs() < 1e-9,
+            "a rejected set_value must not perturb the default run; level should still reach 10, got {}",
+            wasm_slab[last]
+        );
+    }
+
+    /// `clear_values` restores compiled defaults after an override, without
+    /// re-instantiating. Override inflow_rate, run (diverges), then clear, reset,
+    /// run again -- the second run must reproduce the defaults.
+    #[test]
+    fn compile_simulation_clear_values_restores_defaults() {
+        let datamodel = crate::test_common::TestProject::new("clear")
+            .with_sim_time(0.0, 5.0, 1.0)
+            .aux("inflow_rate", "2", None)
+            .stock("level", "0", &["inflow"], &[], None)
+            .flow("inflow", "inflow_rate", None)
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let rate_off = layout_offset(&artifact, "inflow_rate");
+        let level_off = layout_offset(&artifact, "level");
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+
+        let info = validate(&artifact.wasm).expect("module must validate");
+        let mut store = Store::new(());
+        let inst = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        let func = |store: &mut Store<()>, name: &str| {
+            store
+                .instance_export(inst, name)
+                .unwrap()
+                .as_func()
+                .unwrap()
+        };
+
+        // Override -> run -> level reaches 25.
+        let set_value = func(&mut store, "set_value");
+        let rc: i32 = store
+            .invoke_simple_typed::<(i32, f64), i32>(set_value, (rate_off as i32, 5.0))
+            .expect("set_value");
+        assert_eq!(rc, 0);
+        let run = func(&mut store, "run");
+        store.invoke_simple_typed::<(), ()>(run, ()).expect("run");
+
+        // clear_values -> reset -> run -> level back to the default 10.
+        let clear_values = func(&mut store, "clear_values");
+        store
+            .invoke_simple_typed::<(), ()>(clear_values, ())
+            .expect("clear_values");
+        let reset = func(&mut store, "reset");
+        store
+            .invoke_simple_typed::<(), ()>(reset, ())
+            .expect("reset");
+        let run = func(&mut store, "run");
+        store.invoke_simple_typed::<(), ()>(run, ()).expect("run");
+
+        let mem = store
+            .instance_export(inst, "memory")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        let base = artifact.layout.results_offset;
+        let last_addr = base + ((n_chunks - 1) * n_slots + level_off) * 8;
+        let level_last = store.mem_access_mut_slice(mem, |bytes| {
+            f64::from_le_bytes(bytes[last_addr..last_addr + 8].try_into().unwrap())
+        });
+        assert!(
+            (level_last - 10.0).abs() < 1e-9,
+            "after clear_values the default level should reach 10, got {level_last}"
+        );
+    }
+
+    /// The wasm backend's overridable-constant set (`collect_overridable_defaults`,
+    /// which mirrors the VM's `collect_constant_info` recursion to capture each
+    /// default literal) must address EXACTLY the offsets the VM reports overridable
+    /// via `CompiledSimulation::constant_offsets`. If the two diverged, a blob's
+    /// `set_value` would accept/reject a different set than the VM's, or initialize
+    /// the wrong slots -- so this pins them equal over a model with both a top-level
+    /// constant and a nested-module (SMOOTH) constant.
+    #[test]
+    fn wasm_overridable_set_matches_vm_constant_offsets() {
+        let datamodel = crate::test_common::TestProject::new("const_set")
+            .with_sim_time(0.0, 4.0, 0.5)
+            .aux("k", "3", None)
+            .aux("input", "TIME + k", None)
+            // SMTH1 expands to a nested stdlib module carrying its own constants
+            // (the smoothing delay), so the overridable set spans nested modules.
+            .aux("smoothed", "SMTH1(input, 2)", None)
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+
+        let mut wasm_set: Vec<usize> = collect_overridable_defaults(&sim.modules, &sim.root, 0)
+            .into_iter()
+            .map(|(off, _)| off)
+            .collect();
+        wasm_set.sort_unstable();
+        wasm_set.dedup();
+
+        let mut vm_set: Vec<usize> = sim.constant_offsets().collect();
+        vm_set.sort_unstable();
+
+        assert_eq!(
+            wasm_set, vm_set,
+            "the wasm overridable-constant offsets must match the VM's exactly"
+        );
+        assert!(
+            !vm_set.is_empty(),
+            "this model must have at least one overridable constant (k) for the check to be meaningful"
+        );
+
+        // Every overridable offset is in range (so it indexes the n_slots-wide
+        // const region and the validity byte region safely).
+        let n_slots = sim.n_slots();
+        for &off in &vm_set {
+            assert!(
+                off < n_slots,
+                "overridable offset {off} must be < n_slots {n_slots}"
+            );
+        }
+    }
+
+    /// AC5.1 with an override on a constant that feeds an *initial* equation: the
+    /// VM re-applies the override across initials (it mutates the literal at all
+    /// locations), so an overridable constant read during the initials phase must
+    /// also source from the region. Here `seed` is a constant whose value is the
+    /// stock's initial, so overriding `seed` must change the stock's starting
+    /// value -- exercising the initials-phase redirect, not just flows.
+    #[test]
+    fn compile_simulation_set_value_override_in_initials_matches_vm() {
+        let datamodel = crate::test_common::TestProject::new("override_init")
+            .with_sim_time(0.0, 3.0, 1.0)
+            .aux("seed", "5", None)
+            .stock("level", "seed", &["hold"], &[], None)
+            .flow("hold", "0", None)
+            .build_datamodel();
+        let sim = compile_sim(&datamodel, "main");
+        let artifact = compile_simulation(&sim).expect("wasm codegen");
+        let seed_off = layout_offset(&artifact, "seed");
+        assert!(
+            sim.is_constant_offset(seed_off),
+            "seed must be an overridable constant"
+        );
+
+        let wasm_slab = run_artifact_with_overrides(&artifact, &[(seed_off, 42.0)]);
+        let n_slots = artifact.layout.n_slots;
+        let n_chunks = artifact.layout.n_chunks;
+
+        let sim_vm = compile_sim(&datamodel, "main");
+        let (vm_data, vm_step_size, vm_step_count) =
+            vm_results_with_override(sim_vm, seed_off, 42.0);
+        assert_eq!(vm_step_count, n_chunks);
+
+        for (name, wasm_off) in &artifact.layout.var_offsets {
+            let wasm_off = *wasm_off;
+            let ident = Ident::<Canonical>::from_str_unchecked(name);
+            if sim.get_offset(&ident).is_none() {
+                continue;
+            }
+            for c in 0..n_chunks {
+                let vm_val = vm_data[c * vm_step_size + wasm_off];
+                let wasm_val = wasm_slab[c * n_slots + wasm_off];
+                assert!(
+                    (vm_val - wasm_val).abs() < 1e-9,
+                    "{name} mismatch at chunk {c} under initials override: vm={vm_val} wasm={wasm_val}"
+                );
+            }
+        }
+        // seed=42 makes level start (and stay, hold=0) at 42.
+        let level_off = layout_offset(&artifact, "level");
+        assert!(
+            (wasm_slab[level_off] - 42.0).abs() < 1e-9,
+            "level should initialize to the overridden seed=42, got {}",
+            wasm_slab[level_off]
+        );
+    }
+}
diff --git a/src/simlin-engine/src/wasmgen/vector.rs b/src/simlin-engine/src/wasmgen/vector.rs
new file mode 100644
index 000000000..994ee51a1
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/vector.rs
@@ -0,0 +1,1063 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+// pattern: Functional Core
+// Pure transformation: each emitter appends a wasm instruction sequence for one
+// vector-operation opcode, mirroring the matching VM arm element-for-element. No
+// I/O; the only side effect is in `#[cfg(test)]` (which lives in `lower_tests.rs`
+// alongside the rest of the lowering harness).
+
+//! Lowering of the bytecode VM's vector-operation opcodes to WebAssembly
+//! (Phase 6).
+//!
+//! These opcodes operate over the compile-time view stack (`super::views`) and
+//! the operand stack and -- except [`VectorSelect`](emit_vector_select), which
+//! reduces to one scalar -- write their result array to a `write_temp_id` region
+//! of `temp_storage`. Each emitter reproduces the matching VM dispatch arm
+//! element-for-element:
+//!
+//! - [`emit_vector_select`] -- `vm.rs:2444-2502`
+//! - [`emit_vector_elm_map`] -- `crate::vm_vector_elm_map::vector_elm_map`
+//! - [`emit_vector_sort_order`] -- `crate::vm_vector_sort_order::vector_sort_order`
+//! - [`emit_rank`] -- `vm.rs:2540-2584`
+//! - [`emit_lookup_array`] -- `vm.rs:2586-2629`
+//!
+//! ## Runtime loop vs unrolled
+//!
+//! The *stable sort* ([`emit_stable_sort`], backing `VectorSortOrder`/`Rank`) is
+//! a self-contained wasm helper with a **runtime** insertion-sort loop -- never
+//! unrolled, since an unrolled O(n^2) body over a runtime view size would blow
+//! up. Everything else here is a per-element map/gather/scatter over the
+//! *compile-time* view size, so the element addresses fold into wasm constants
+//! and the bodies are unrolled. The caller (`super::lower`) charges the Phase-5
+//! [`EmitState`](super::lower) unroll budget for the view size before invoking
+//! these, so the size cap still bounds an over-large arrayed model.
+//!
+//! ## Invalid input view
+//!
+//! An input view that a dynamic subscript (Phase-5 Task 4) made invalid at
+//! runtime takes the VM's short-circuit: the whole destination temp region is
+//! filled with IEEE `f64::NAN` (NOT the finite `crate::float::NA` sentinel) via
+//! [`super::lower::emit_fill_temp_nan`], while `VectorSelect` pushes a single
+//! NaN. The validity gate is only emitted when an input view actually carries a
+//! runtime validity flag; in the common case (static / temp / full-var views)
+//! every input is statically valid and no runtime check is generated.
+
+use wasm_encoder::{BlockType, Function, Instruction as Ins, ValType};
+
+use crate::bytecode::{GraphicalFunctionId, LookupMode};
+
+use super::WasmGenError;
+use super::lower::{
+    EmitCtx, GF_DIRECTORY_ENTRY_BYTES, SLOT_SIZE, emit_fill_temp_nan, emit_is_truthy,
+    emit_view_element_load, f64_const, i32_memarg, memarg, push_module_relative_base,
+    temp_element_byte_addr,
+};
+use super::views::{ViewBase, ViewDesc};
+
+/// Push `round_half_away(x)` for the f64 already on the wasm stack, reproducing
+/// Rust's `f64::round` (round half AWAY from zero) bit-for-bit -- which is what
+/// the VM uses (`stack.pop().round()`, `offset_val.round()`). This is NOT wasm
+/// `f64.nearest` (round half to EVEN), so the two diverge for half-integer
+/// inputs.
+///
+/// Emits the precision-safe form `t = x.trunc(); if (x - t).abs() >= 0.5 then t
+/// plus-or-minus 1 (sign of x) else t`. The naive `trunc(x + copysign(0.5, x))`
+/// is off-by-one against `f64::round` for two reachable input classes. First:
+/// the largest f64 below 0.5 (`0.49999999999999994` and its negative), where
+/// `x + 0.5` rounds up to exactly 1.0 so `trunc` yields a magnitude of one
+/// though `f64::round` yields zero. Second: already-integer magnitudes in
+/// `[2^52, 2^53)`, where `x + 0.5` rounds up to `x + 1` though `f64::round`
+/// returns `x`. The `(x - t)` fraction here is computed exactly (the operands
+/// are within a factor of two for `|x| < 2^53`, and `t == x` for integer
+/// magnitudes at or above `2^52`), so no rounding can perturb the half-way
+/// test. Verified bit-identical to `f64::round` over 5M random doubles
+/// including sign-of-zero and both boundary classes.
+///
+/// `x_scratch` and `t_scratch` are two free f64 locals (distinct), holding `x`
+/// and `trunc(x)` while each is read more than once.
+pub(crate) fn emit_round_half_away(f: &mut Function, x_scratch: u32, t_scratch: u32) {
+    f.instruction(&Ins::LocalSet(x_scratch)); // x_scratch = x
+    f.instruction(&Ins::LocalGet(x_scratch));
+    f.instruction(&Ins::F64Trunc);
+    f.instruction(&Ins::LocalSet(t_scratch)); // t_scratch = trunc(x)
+
+    // round-up value: t + copysign(1.0, x)  (the deeper Select operand)
+    f.instruction(&Ins::LocalGet(t_scratch));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalGet(x_scratch));
+    f.instruction(&Ins::F64Copysign); // copysign(1.0, x): ±1.0 with x's sign
+    f.instruction(&Ins::F64Add); // t + copysign(1.0, x)
+
+    // keep-trunc value: t  (the shallower Select operand)
+    f.instruction(&Ins::LocalGet(t_scratch));
+
+    // condition: |x - t| >= 0.5  (exact fraction; round half away from zero)
+    f.instruction(&Ins::LocalGet(x_scratch));
+    f.instruction(&Ins::LocalGet(t_scratch));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::F64Abs);
+    f.instruction(&f64_const(0.5));
+    f.instruction(&Ins::F64Ge);
+
+    // select([round_up, t, cond]) == round_up when cond != 0, else t.
+    f.instruction(&Ins::Select);
+}
+
+// ── stable sort helper (VectorSortOrder / Rank) ─────────────────────────────
+
+// `stable_sort(pairs_ptr: i32, n: i32, ascending: i32)` local layout.
+const SS_PTR: u32 = 0; // i32 byte address of pair 0
+const SS_N: u32 = 1; // i32 pair count
+const SS_ASC: u32 = 2; // i32 1 = ascending, else descending
+const SS_I: u32 = 3; // i32 outer index
+const SS_J: u32 = 4; // i32 inner index
+const SS_KEY_VAL: u32 = 5; // f64 key value
+const SS_KEY_IDX: u32 = 6; // f64 key idx payload
+const SS_LEFT_VAL: u32 = 7; // f64 the left neighbour's value
+
+/// Bytes per `(value: f64, idx: f64)` sort pair.
+const PAIR_BYTES: i32 = 16;
+
+/// Build the body of `stable_sort(pairs_ptr: i32, n: i32, ascending: i32) -> ()`,
+/// an in-place **stable** insertion sort of `n` `(value: f64 @ +0, idx: f64 @ +8)`
+/// pairs starting at byte `pairs_ptr`, ordered by `value`.
+///
+/// Reproduces the VM's stable `sort_by(|a, b| a.partial_cmp(b).unwrap_or(Equal))`
+/// (ascending) / the `b.partial_cmp(a)` form (descending). The shift predicate is
+/// a **strict** `f64.lt` (ascending) / `f64.gt` (descending) of the left
+/// neighbour against the key: it is `false` whenever either operand is NaN, so a
+/// NaN never displaces a non-NaN and never reorders relative to another NaN --
+/// i.e. NaN comparisons act as `Equal`, exactly matching `partial_cmp(..)
+/// .unwrap_or(Equal)` under a stable sort. Insertion sort only shifts past
+/// strictly-ordered neighbours, so equal-keyed elements keep their input order
+/// (stability) for free.
+///
+/// A runtime loop (never unrolled): `n` is a runtime view size, so an unrolled
+/// O(n^2) body would be unbounded. n is small for real arrays (the corpus's
+/// largest single dimension is 9), so insertion sort is more than adequate.
+pub(crate) fn emit_stable_sort() -> Function {
+    // Locals after the three i32 params: i32 SS_I/SS_J, f64 SS_KEY_VAL/
+    // SS_KEY_IDX/SS_LEFT_VAL.
+    let mut f = Function::new([(2, ValType::I32), (3, ValType::F64)]);
+
+    // i = 1
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::LocalSet(SS_I));
+
+    f.instruction(&Ins::Block(BlockType::Empty)); // $outer_exit
+    f.instruction(&Ins::Loop(BlockType::Empty)); // $outer
+
+    // while-head: if !(i < n) break $outer_exit  (br depth 1)
+    f.instruction(&Ins::LocalGet(SS_I));
+    f.instruction(&Ins::LocalGet(SS_N));
+    f.instruction(&Ins::I32LtS);
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::BrIf(1));
+
+    // key_val = mem[ptr + 16*i + 0]; key_idx = mem[ptr + 16*i + 8]
+    push_pair_addr(&mut f, SS_I);
+    f.instruction(&Ins::F64Load(memarg(0)));
+    f.instruction(&Ins::LocalSet(SS_KEY_VAL));
+    push_pair_addr(&mut f, SS_I);
+    f.instruction(&Ins::F64Load(memarg(8)));
+    f.instruction(&Ins::LocalSet(SS_KEY_IDX));
+
+    // j = i - 1
+    f.instruction(&Ins::LocalGet(SS_I));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::LocalSet(SS_J));
+
+    f.instruction(&Ins::Block(BlockType::Empty)); // $inner_exit
+    f.instruction(&Ins::Loop(BlockType::Empty)); // $inner
+
+    // while-head: if !(j >= 0) break $inner_exit  (br depth 1)
+    f.instruction(&Ins::LocalGet(SS_J));
+    f.instruction(&Ins::I32Const(0));
+    f.instruction(&Ins::I32GeS);
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::BrIf(1));
+
+    // left_val = mem[ptr + 16*j + 0]
+    push_pair_addr(&mut f, SS_J);
+    f.instruction(&Ins::F64Load(memarg(0)));
+    f.instruction(&Ins::LocalSet(SS_LEFT_VAL));
+
+    // cmp = ascending ? (left_val > key_val) : (left_val < key_val)
+    // Both are strict, hence false for any NaN operand (NaN-as-Equal stability).
+    f.instruction(&Ins::LocalGet(SS_LEFT_VAL));
+    f.instruction(&Ins::LocalGet(SS_KEY_VAL));
+    f.instruction(&Ins::F64Gt); // gt (the ascending predicate)
+    f.instruction(&Ins::LocalGet(SS_LEFT_VAL));
+    f.instruction(&Ins::LocalGet(SS_KEY_VAL));
+    f.instruction(&Ins::F64Lt); // lt (the descending predicate)
+    f.instruction(&Ins::LocalGet(SS_ASC));
+    f.instruction(&Ins::Select); // gt if ascending else lt
+    // if !cmp break $inner_exit  (br depth 1)
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::BrIf(1));
+
+    // mem[ptr + 16*(j+1)] = mem[ptr + 16*j]  (both value and idx)
+    push_pair_addr_plus1(&mut f, SS_J); // dst addr (value)
+    push_pair_addr(&mut f, SS_J);
+    f.instruction(&Ins::F64Load(memarg(0)));
+    f.instruction(&Ins::F64Store(memarg(0)));
+    push_pair_addr_plus1(&mut f, SS_J); // dst addr (idx)
+    push_pair_addr(&mut f, SS_J);
+    f.instruction(&Ins::F64Load(memarg(8)));
+    f.instruction(&Ins::F64Store(memarg(8)));
+
+    // j -= 1 ; continue $inner
+    f.instruction(&Ins::LocalGet(SS_J));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::LocalSet(SS_J));
+    f.instruction(&Ins::Br(0));
+
+    f.instruction(&Ins::End); // end $inner loop
+    f.instruction(&Ins::End); // end $inner_exit block
+
+    // mem[ptr + 16*(j+1)] = (key_val, key_idx)
+    push_pair_addr_plus1(&mut f, SS_J);
+    f.instruction(&Ins::LocalGet(SS_KEY_VAL));
+    f.instruction(&Ins::F64Store(memarg(0)));
+    push_pair_addr_plus1(&mut f, SS_J);
+    f.instruction(&Ins::LocalGet(SS_KEY_IDX));
+    f.instruction(&Ins::F64Store(memarg(8)));
+
+    // i += 1 ; continue $outer
+    f.instruction(&Ins::LocalGet(SS_I));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::LocalSet(SS_I));
+    f.instruction(&Ins::Br(0));
+
+    f.instruction(&Ins::End); // end $outer loop
+    f.instruction(&Ins::End); // end $outer_exit block
+    f.instruction(&Ins::End); // end function
+    f
+}
+
+/// Push the byte address of sort pair `idx_local`: `ptr + 16 * idx`. A following
+/// `f64.load`/`store` reads `value` at `memarg(0)` and `idx` at `memarg(8)`.
+fn push_pair_addr(f: &mut Function, idx_local: u32) {
+    f.instruction(&Ins::LocalGet(SS_PTR));
+    f.instruction(&Ins::LocalGet(idx_local));
+    f.instruction(&Ins::I32Const(PAIR_BYTES));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::I32Add);
+}
+
+/// Push the byte address of sort pair `idx_local + 1`: `ptr + 16 * (idx + 1)`.
+fn push_pair_addr_plus1(f: &mut Function, idx_local: u32) {
+    f.instruction(&Ins::LocalGet(SS_PTR));
+    f.instruction(&Ins::LocalGet(idx_local));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::I32Const(PAIR_BYTES));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::I32Add);
+}
+
+// ── shared input-view helpers ───────────────────────────────────────────────
+
+/// Whether `view` carries a runtime validity flag or runtime offset addend (a
+/// dynamic subscript, Phase-5 Task 4).
+///
+/// This is the *`VectorSelect`-specific* dynamic-view rejection predicate (its
+/// only consumer is [`is_dynamic_select`]). It deliberately keys on *both*
+/// `valid_local` and `runtime_off_local` -- stricter than the temp-writers'
+/// [`emit_with_validity_gate`], which keys on `valid_local` alone. The
+/// difference is by design: `VectorSelect` reads its source via a compile-time-
+/// base path that does NOT fold a runtime offset addend (it has no temp region
+/// to gate and would need to thread the runtime offset into the gather by hand),
+/// so any runtime offset disqualifies it. The temp-writers tolerate a
+/// `runtime_off_local` because their element reads route through
+/// [`emit_view_element_load`], which folds the runtime offset + validity itself.
+fn is_dynamic(view: &ViewDesc) -> bool {
+    view.valid_local.is_some() || view.runtime_off_local.is_some()
+}
+
+/// Push the i32 "all inputs valid" condition for `views`: the bitwise-AND of each
+/// view's `valid_local`, or a constant `1` when no view carries one. Used to gate
+/// the op against the VM's "`!is_valid` -> fill_temp_nan / NaN" short-circuit.
+fn push_all_valid(views: &[&ViewDesc], f: &mut Function) {
+    let valids: Vec<u32> = views.iter().filter_map(|v| v.valid_local).collect();
+    if valids.is_empty() {
+        f.instruction(&Ins::I32Const(1));
+        return;
+    }
+    f.instruction(&Ins::LocalGet(valids[0]));
+    for &v in &valids[1..] {
+        f.instruction(&Ins::LocalGet(v));
+        f.instruction(&Ins::I32And);
+    }
+}
+
+/// The constant base byte address of `view`'s *storage element 0* -- i.e. the
+/// address the VM's `read_view_element(view, flat)` indexes as `base + flat` (the
+/// view's `base_off`, NOT folding in its `offset`, which the caller already folds
+/// into the flat index). For a module-relative var view the runtime `module_off`
+/// addend is signalled via the returned `bool`.
+fn view_storage_base(view: &ViewDesc, ctx: &EmitCtx) -> Result<(u64, bool), WasmGenError> {
+    match view.base {
+        ViewBase::CurrAbsolute => Ok((
+            u64::from(ctx.curr_base) + u64::from(view.base_off) * u64::from(SLOT_SIZE),
+            false,
+        )),
+        ViewBase::CurrModuleRelative => Ok((
+            u64::from(ctx.curr_base) + u64::from(view.base_off) * u64::from(SLOT_SIZE),
+            true,
+        )),
+        ViewBase::Temp => {
+            let temp_off = *ctx
+                .ctx
+                .temp_offsets
+                .get(view.base_off as usize)
+                .ok_or_else(|| {
+                    WasmGenError::Unsupported(
+                        "wasmgen: vector-op source references an out-of-range temp id".to_string(),
+                    )
+                })? as u64;
+            Ok((
+                u64::from(ctx.temp_storage_base) + temp_off * u64::from(SLOT_SIZE),
+                false,
+            ))
+        }
+    }
+}
+
+// ── VectorSelect (vm.rs:2444-2502) ──────────────────────────────────────────
+
+/// Lower `VectorSelect`, mirroring `vm.rs:2444-2502`. The two operands are on the
+/// wasm stack as `[max_value, action]` (`action` on top, matching the VM popping
+/// `action` then `max_value`); the views are `expr_view = top`, `sel_view =
+/// top-1`. Zips the two views to `min(sel.size, expr.size)` with independent
+/// odometers, collects each `expr` value where `is_truthy(sel)`, then for an
+/// empty selection pushes `max_value`, else dispatches the `action` reduction
+/// (1=min, 2=mean, 3=max, 4=product, else sum). The single scalar result is left
+/// on the stack. An invalid input view pushes one NaN.
+///
+/// The gather is unrolled over the (compile-time) zip size; each selected value
+/// is appended to the vector scratch region with a runtime count, and the
+/// reduction is a single runtime pass over the collected values (mirroring the
+/// VM's `selected` Vec). `min`/`max` reproduce Rust's `f64::min`/`f64::max`
+/// (NaN-ignoring), not wasm `f64.min`/`f64.max` (NaN-propagating), so the fold
+/// matches the VM's `fold(±inf, f64::min/max)` exactly.
+pub(crate) fn emit_vector_select(
+    sel_view: &ViewDesc,
+    expr_view: &ViewDesc,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    if is_dynamic_select(sel_view, expr_view) {
+        return Err(WasmGenError::Unsupported(
+            "wasmgen: VectorSelect over a dynamically-subscripted view is not supported"
+                .to_string(),
+        ));
+    }
+
+    let max_value = ctx.apply_locals[0]; // popped second
+    let action = ctx.vector_i32_locals[0];
+    let count = ctx.vector_i32_locals[1];
+    let k = ctx.vector_i32_locals[2];
+    let [acc_sum, acc_prod, acc_min, acc_max, vtmp] = ctx.vector_f64_locals;
+
+    // Pop action (top) -> round-half-away -> i32; then pop max_value. The round
+    // uses `scratch_local` + `apply_locals[0]` as its two f64 temps; both are
+    // free here (`max_value` is parked into `apply_locals[0]` only afterward).
+    emit_round_half_away(f, ctx.scratch_local, ctx.apply_locals[0]);
+    f.instruction(&Ins::I32TruncSatF64S);
+    f.instruction(&Ins::LocalSet(action));
+    f.instruction(&Ins::LocalSet(max_value));
+
+    let size = sel_view.size().min(expr_view.size());
+
+    // count = 0
+    f.instruction(&Ins::I32Const(0));
+    f.instruction(&Ins::LocalSet(count));
+
+    // Gather: for each i in 0..size, if is_truthy(sel[i]) push expr[i] into the
+    // scratch region at scratch[count] and bump count. The two odometers run
+    // independently; element `i` of each view is its row-major iteration index.
+    for i in 0..size {
+        emit_view_element_load(sel_view, i, ctx, f)?;
+        emit_is_truthy(ctx, f);
+        f.instruction(&Ins::If(BlockType::Empty));
+        // scratch[count] = expr[i]. f64.store wants [addr_i32, value_f64];
+        // addr = vector_scratch_base + count*8 (the constant base in memarg).
+        f.instruction(&Ins::LocalGet(count));
+        f.instruction(&Ins::I32Const(SLOT_SIZE as i32));
+        f.instruction(&Ins::I32Mul);
+        emit_view_element_load(expr_view, i, ctx, f)?;
+        f.instruction(&Ins::F64Store(memarg(u64::from(ctx.vector_scratch_base))));
+        // count += 1
+        f.instruction(&Ins::LocalGet(count));
+        f.instruction(&Ins::I32Const(1));
+        f.instruction(&Ins::I32Add);
+        f.instruction(&Ins::LocalSet(count));
+        f.instruction(&Ins::End);
+    }
+
+    // if count == 0 { result = max_value } else { result = reduce(action) }.
+    f.instruction(&Ins::LocalGet(count));
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&Ins::LocalGet(max_value));
+    f.instruction(&Ins::Else);
+
+    // Single pass over scratch[0..count] computing sum/product/min/max; then the
+    // action selects the result. min/max init mirror the VM's
+    // fold(INFINITY, f64::min) / fold(NEG_INFINITY, f64::max).
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::LocalSet(acc_sum));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalSet(acc_prod));
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::LocalSet(acc_min));
+    f.instruction(&f64_const(f64::NEG_INFINITY));
+    f.instruction(&Ins::LocalSet(acc_max));
+    // k = 0
+    f.instruction(&Ins::I32Const(0));
+    f.instruction(&Ins::LocalSet(k));
+
+    f.instruction(&Ins::Block(BlockType::Empty)); // $reduce_exit
+    f.instruction(&Ins::Loop(BlockType::Empty)); // $reduce
+    // if !(k < count) break
+    f.instruction(&Ins::LocalGet(k));
+    f.instruction(&Ins::LocalGet(count));
+    f.instruction(&Ins::I32LtS);
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::BrIf(1));
+    // v = scratch[k]
+    f.instruction(&Ins::LocalGet(k));
+    f.instruction(&Ins::I32Const(SLOT_SIZE as i32));
+    f.instruction(&Ins::I32Mul);
+    f.instruction(&Ins::F64Load(memarg(u64::from(ctx.vector_scratch_base))));
+    f.instruction(&Ins::LocalSet(vtmp));
+    // acc_sum += v
+    f.instruction(&Ins::LocalGet(acc_sum));
+    f.instruction(&Ins::LocalGet(vtmp));
+    f.instruction(&Ins::F64Add);
+    f.instruction(&Ins::LocalSet(acc_sum));
+    // acc_prod *= v
+    f.instruction(&Ins::LocalGet(acc_prod));
+    f.instruction(&Ins::LocalGet(vtmp));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::LocalSet(acc_prod));
+    // acc_min = f64::min(acc_min, v)
+    f.instruction(&Ins::LocalGet(acc_min));
+    f.instruction(&Ins::LocalGet(vtmp));
+    emit_f64_min_rust(ctx, f);
+    f.instruction(&Ins::LocalSet(acc_min));
+    // acc_max = f64::max(acc_max, v)
+    f.instruction(&Ins::LocalGet(acc_max));
+    f.instruction(&Ins::LocalGet(vtmp));
+    emit_f64_max_rust(ctx, f);
+    f.instruction(&Ins::LocalSet(acc_max));
+    // k += 1 ; continue
+    f.instruction(&Ins::LocalGet(k));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::LocalSet(k));
+    f.instruction(&Ins::Br(0));
+    f.instruction(&Ins::End); // end $reduce loop
+    f.instruction(&Ins::End); // end $reduce_exit block
+
+    // result = match action { 1 => min, 2 => sum/count, 3 => max, 4 => prod,
+    //                         _ => sum }. wasm `select` pops [v1, v2, cond] and
+    // yields the DEEPER `v1` when cond != 0, so the running default (`sum`) is the
+    // deeper operand, each override is pushed shallower, and the condition is
+    // `action != n` -- keeping the running value unless `action == n`. (Same
+    // pattern as `math::emit_quadrant_select`.)
+    f.instruction(&Ins::LocalGet(acc_sum)); // default: sum (action 0/5/..)
+    // action == 4 -> product
+    f.instruction(&Ins::LocalGet(acc_prod));
+    push_action_ne(f, action, 4);
+    f.instruction(&Ins::Select);
+    // action == 3 -> max
+    f.instruction(&Ins::LocalGet(acc_max));
+    push_action_ne(f, action, 3);
+    f.instruction(&Ins::Select);
+    // action == 2 -> mean (sum / count)
+    f.instruction(&Ins::LocalGet(acc_sum));
+    f.instruction(&Ins::LocalGet(count));
+    f.instruction(&Ins::F64ConvertI32S);
+    f.instruction(&Ins::F64Div);
+    push_action_ne(f, action, 2);
+    f.instruction(&Ins::Select);
+    // action == 1 -> min
+    f.instruction(&Ins::LocalGet(acc_min));
+    push_action_ne(f, action, 1);
+    f.instruction(&Ins::Select);
+
+    f.instruction(&Ins::End); // end if count == 0
+    Ok(())
+}
+
+/// `VectorSelect`'s dynamic-view rejection. The op reduces to a scalar (no temp
+/// region), so an invalid view would push one NaN; rather than emit that gate
+/// (and the runtime-offset folding the gather would need), a dynamically-
+/// subscripted input falls back to the VM.
+fn is_dynamic_select(sel_view: &ViewDesc, expr_view: &ViewDesc) -> bool {
+    is_dynamic(sel_view) || is_dynamic(expr_view)
+}
+
+/// Push i32 `1` when the i32 in `action_local` does NOT equal `n` -- the "keep
+/// the running default" condition for the `VectorSelect` action-dispatch selects
+/// (the override is taken only when `action == n`).
+fn push_action_ne(f: &mut Function, action_local: u32, n: i32) {
+    f.instruction(&Ins::LocalGet(action_local));
+    f.instruction(&Ins::I32Const(n));
+    f.instruction(&Ins::I32Ne);
+}
+
+/// Push `f64::min(a, b)` for `[a, b]` on the wasm stack, reproducing Rust's
+/// NaN-ignoring `f64::min` (return the non-NaN operand if exactly one is NaN, the
+/// lesser otherwise) rather than wasm `f64.min` (NaN-propagating). Parks both
+/// operands so they can be read for the NaN tests and the `<` compare.
+fn emit_f64_min_rust(ctx: &EmitCtx, f: &mut Function) {
+    emit_f64_minmax_rust(ctx, f, true);
+}
+
+/// Push `f64::max(a, b)` for `[a, b]` on the wasm stack, reproducing Rust's
+/// NaN-ignoring `f64::max`.
+fn emit_f64_max_rust(ctx: &EmitCtx, f: &mut Function) {
+    emit_f64_minmax_rust(ctx, f, false);
+}
+
+/// Shared body of [`emit_f64_min_rust`]/[`emit_f64_max_rust`]. Consumes `[a, b]`
+/// and pushes `f64::min(a,b)` (`want_min`) or `f64::max(a,b)`, matching
+/// `f64::min`/`f64::max`'s "ignore NaN; if both NaN, NaN" contract.
+///
+/// Built as three nested `select`s, each in the wasm "deeper operand wins when
+/// cond != 0" form (`select([v1, v2, cond]) == v1 if cond else v2`):
+/// 1. `core = (a {<,>} b) ? a : b`  -- the non-NaN min/max,
+/// 2. `r = (b is NaN) ? a : core`   -- ignore a NaN `b`,
+/// 3. result `= (a is NaN) ? b : r` -- ignore a NaN `a` (and if both NaN, `b`,
+///    which is NaN, so the all-NaN case yields NaN).
+///
+/// The intermediate must be a *shallower* select operand at each step, so it is
+/// parked in a scratch local rather than left on the stack. The `VectorSelect`
+/// reduction reaches this only inside its `count != 0` branch, where all three
+/// `Apply` scratch f64s are free (`apply_locals[0]`'s `max_value` is dead once
+/// the selection is non-empty); this uses `apply_locals[1]`/`[2]` for `a`/`b` and
+/// `apply_locals[0]` for the running register. (The ±0 tie is left to wasm's
+/// `<`/`>`, acceptable for SD parity -- the VM's reductions never depend on ±0.)
+fn emit_f64_minmax_rust(ctx: &EmitCtx, f: &mut Function, want_min: bool) {
+    let a = ctx.apply_locals[1];
+    let b = ctx.apply_locals[2];
+    let r = ctx.apply_locals[0];
+    // The two operands are on the stack as [a, b] (b on top); park them.
+    f.instruction(&Ins::LocalSet(b));
+    f.instruction(&Ins::LocalSet(a));
+
+    // core = (a {<,>} b) ? a : b  -> r
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::LocalGet(b));
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::LocalGet(b));
+    if want_min {
+        f.instruction(&Ins::F64Lt);
+    } else {
+        f.instruction(&Ins::F64Gt);
+    }
+    f.instruction(&Ins::Select);
+    f.instruction(&Ins::LocalSet(r));
+
+    // r = (b is NaN) ? a : r
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::LocalGet(r));
+    f.instruction(&Ins::LocalGet(b));
+    f.instruction(&Ins::LocalGet(b));
+    f.instruction(&Ins::F64Ne); // b != b  (true iff b is NaN)
+    f.instruction(&Ins::Select);
+    f.instruction(&Ins::LocalSet(r));
+
+    // result = (a is NaN) ? b : r  (left on the stack)
+    f.instruction(&Ins::LocalGet(b));
+    f.instruction(&Ins::LocalGet(r));
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::LocalGet(a));
+    f.instruction(&Ins::F64Ne); // a != a  (true iff a is NaN)
+    f.instruction(&Ins::Select);
+}
+
+// ── VectorElmMap (vm_vector_elm_map.rs:33-116) ──────────────────────────────
+
+/// Lower `VectorElmMap { write_temp_id, full_source_len }`, mirroring
+/// `crate::vm_vector_elm_map::vector_elm_map`. The views are `offset_view = top`,
+/// `source_view = top-1`. For each element `i` of the offset view: `flat_i =
+/// base_i + round(offset[i])` over the source's FULL row-major storage, where
+/// `base_i` is 0 for a full contiguous source else the source's flat offset at
+/// element `i`'s carried-axis projection (the offset-view indices scattered onto
+/// the source axes by dim-id). The result is `NaN` if `offset[i]` is NaN or
+/// `flat_i` is out of `[0, full_source_len)`, else `source[flat_i]`. **No
+/// modulo.** Written to `temp[temp_off + i]`; an invalid input view fills the
+/// whole destination temp region with NaN.
+pub(crate) fn emit_vector_elm_map(
+    source_view: &ViewDesc,
+    offset_view: &ViewDesc,
+    write_temp_id: u8,
+    full_source_len: u32,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    // The source's runtime-indexed read folds a module-relative addend, but a
+    // runtime-offset addend (a dynamic subscript) on the source is NOT folded
+    // into the compile-time `base_i`, so reject a dynamically-subscripted source
+    // (VM fallback). The OFFSET view's reads route through `emit_view_element_load`
+    // (which handles a runtime offset + validity), and an invalid offset view is
+    // caught by the op-level validity gate below, so an offset dynamic subscript
+    // is fine.
+    if source_view.runtime_off_local.is_some() {
+        return Err(WasmGenError::Unsupported(
+            "wasmgen: VectorElmMap over a dynamically-subscripted source view is not supported"
+                .to_string(),
+        ));
+    }
+
+    emit_with_validity_gate(
+        &[source_view, offset_view],
+        write_temp_id,
+        ctx,
+        f,
+        |ctx, f| {
+            emit_vector_elm_map_body(
+                source_view,
+                offset_view,
+                write_temp_id,
+                full_source_len,
+                ctx,
+                f,
+            )
+        },
+    )
+}
+
+/// The valid-input body of [`emit_vector_elm_map`].
+fn emit_vector_elm_map_body(
+    source_view: &ViewDesc,
+    offset_view: &ViewDesc,
+    write_temp_id: u8,
+    full_source_len: u32,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let full_len = full_source_len as usize;
+    let offset_size = offset_view.size();
+
+    // source_is_full_array: the fast path where base_i is hard-coded 0 and the
+    // offset indexes the whole array directly (vm_vector_elm_map.rs:52).
+    let source_is_full_array = source_view.size() == full_len && source_view.is_contiguous();
+
+    // Carried source dim -> offset-view axis of the same dim id, mirroring the
+    // VM's `src_to_off_axis` (vm_vector_elm_map.rs:57-61). Used per element to
+    // project the offset-view indices onto the source axes for `base_i`.
+    let src_to_off_axis: Vec<Option<usize>> = source_view
+        .dim_ids
+        .iter()
+        .map(|sd| offset_view.dim_ids.iter().position(|od| od == sd))
+        .collect();
+
+    let (src_base_byte, src_module_relative) = view_storage_base(source_view, ctx)?;
+
+    let offset_val = ctx.vector_f64_locals[0];
+    let flat_i = ctx.vector_i32_locals[0];
+
+    for i in 0..offset_size {
+        // base_i (compile-time): 0 for a full-array source, else the sliced
+        // view's flat offset at this element's carried-dim projection.
+        let base_i: i64 = if source_is_full_array {
+            0
+        } else {
+            let off_indices = ViewDesc::decompose_iter_index(&offset_view.dims, i);
+            let src_indices: Vec<u16> = src_to_off_axis
+                .iter()
+                .map(|slot| match slot {
+                    Some(p) => off_indices[*p],
+                    None => 0,
+                })
+                .collect();
+            source_view.flat_offset_for_indices(&src_indices) as i64
+        };
+
+        // offset_val = offset_view[i]
+        emit_view_element_load(offset_view, i, ctx, f)?;
+        f.instruction(&Ins::LocalSet(offset_val));
+
+        // result = if offset_val.is_nan() || flat_i<0 || flat_i>=full_len { NaN }
+        //          else source[flat_i]. flat_i = base_i + round(offset_val).
+        // Compute flat_i (i32) once. The round consumes the pushed copy of
+        // `offset_val` and uses `scratch_local` + `apply_locals[0]` as its two
+        // f64 temps -- neither is `vector_f64_locals[0]` (the `offset_val` local,
+        // read again below), and `apply_locals` is otherwise unused in this op.
+        f.instruction(&f64_const(base_i as f64));
+        f.instruction(&Ins::LocalGet(offset_val));
+        emit_round_half_away(f, ctx.scratch_local, ctx.apply_locals[0]);
+        f.instruction(&Ins::F64Add); // base_i + round(offset_val)  (as f64)
+        f.instruction(&Ins::I32TruncSatF64S);
+        f.instruction(&Ins::LocalSet(flat_i));
+
+        // store temp[i] = select(NaN, source[flat_i], oob). oob is true when the
+        // offset is NaN OR flat_i is out of [0, full_len). f64.store wants
+        // [addr_i32, value_f64]; push the temp address first.
+        let temp_addr = temp_element_byte_addr(ctx, write_temp_id, i as u32)?;
+        f.instruction(&Ins::I32Const(0)); // dynamic addr part (const base in memarg)
+
+        // value = read source[flat_i] (faithful even when oob -- the select
+        // discards it; flat_i is sat-clamped so the address stays in range only
+        // when in-bounds, but a read at a clamped OOB index is never used).
+        // Guard the read with the in-bounds branch so an OOB index never loads
+        // out of the source storage.
+        f.instruction(&Ins::LocalGet(offset_val));
+        f.instruction(&Ins::LocalGet(offset_val));
+        f.instruction(&Ins::F64Ne); // offset_val is NaN
+        f.instruction(&Ins::LocalGet(flat_i));
+        f.instruction(&Ins::I32Const(0));
+        f.instruction(&Ins::I32LtS); // flat_i < 0
+        f.instruction(&Ins::I32Or);
+        f.instruction(&Ins::LocalGet(flat_i));
+        f.instruction(&Ins::I32Const(full_len as i32));
+        f.instruction(&Ins::I32GeS); // flat_i >= full_len
+        f.instruction(&Ins::I32Or); // oob
+        f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+        f.instruction(&f64_const(f64::NAN));
+        f.instruction(&Ins::Else);
+        // source[flat_i]: base byte + flat_i*8 (+ module_off*8 if module-relative)
+        emit_storage_indexed_load(src_base_byte, src_module_relative, flat_i, ctx, f);
+        f.instruction(&Ins::End);
+
+        f.instruction(&Ins::F64Store(memarg(temp_addr)));
+    }
+    Ok(())
+}
+
+/// Push `storage[flat_i]` where the storage element-0 byte address is the
+/// constant `base_byte` and `flat_i` (an i32 local) is the runtime slot index:
+/// `f64.load[base_byte + (module_off? )*8 + flat_i*8]`. The constant `base_byte`
+/// rides in the `memarg.offset`; the runtime part is `(module_off + flat_i) * 8`
+/// for a module-relative view, else `flat_i * 8`.
+fn emit_storage_indexed_load(
+    base_byte: u64,
+    module_relative: bool,
+    flat_i: u32,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) {
+    if module_relative {
+        push_module_relative_base(ctx, f); // module_off * 8
+        f.instruction(&Ins::LocalGet(flat_i));
+        f.instruction(&Ins::I32Const(SLOT_SIZE as i32));
+        f.instruction(&Ins::I32Mul);
+        f.instruction(&Ins::I32Add);
+    } else {
+        f.instruction(&Ins::LocalGet(flat_i));
+        f.instruction(&Ins::I32Const(SLOT_SIZE as i32));
+        f.instruction(&Ins::I32Mul);
+    }
+    f.instruction(&Ins::F64Load(memarg(base_byte)));
+}
+
+// ── VectorSortOrder (vm_vector_sort_order.rs:49-101) ─────────────────────────
+
+/// Lower `VectorSortOrder { write_temp_id }`, mirroring
+/// `crate::vm_vector_sort_order::vector_sort_order`. `input_view = top`; the
+/// `direction` operand is popped (`.round() as i32`). The innermost (last)
+/// dimension is the sorted axis; outer dims select independent rows (a scalar/1-D
+/// view is one row of `inner == size`). Per row, the `(value, local_idx 0..inner)`
+/// pairs are staged into the vector scratch region, sorted (ascending if
+/// `direction == 1`, else descending) by the runtime [`emit_stable_sort`] helper,
+/// then `temp[row_base + rank] = local_idx` is written (the 0-based in-row source
+/// index at the sorted position). An invalid input view fills the temp with NaN.
+pub(crate) fn emit_vector_sort_order(
+    input_view: &ViewDesc,
+    write_temp_id: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    // The direction operand is on the stack now; pop it to the `ascending` flag
+    // first (the validity gate's body / fill_temp_nan arms must be
+    // operand-balanced, so the operand is consumed before the gate). A
+    // dynamically-subscripted input is handled by the gate (invalid ->
+    // fill_temp_nan) and `emit_view_element_load` (runtime offset + validity).
+    let ascending = ctx.vector_i32_locals[0];
+    pop_direction_to_ascending(ascending, ctx, f);
+
+    emit_with_validity_gate(&[input_view], write_temp_id, ctx, f, |ctx, f| {
+        emit_vector_sort_order_body(input_view, write_temp_id, ascending, ctx, f)
+    })
+}
+
+/// The valid-input body of [`emit_vector_sort_order`].
+fn emit_vector_sort_order_body(
+    input_view: &ViewDesc,
+    write_temp_id: u8,
+    ascending: u32,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let size = input_view.size();
+    let n_dims = input_view.dims.len();
+    let inner = if n_dims == 0 {
+        size
+    } else {
+        input_view.dims[n_dims - 1] as usize
+    };
+    if inner == 0 {
+        // A zero-length innermost dim yields an empty result; nothing to write.
+        return Ok(());
+    }
+
+    let scratch = u64::from(ctx.vector_scratch_base);
+    // Iterate rows in row-major logical order; each block of `inner` iterations
+    // is one row (mirroring the VM's `increment_indices` walk -- element
+    // `iter_idx` of the view, read row-major, is `flat_element_offset(iter_idx)`).
+    let mut i = 0usize;
+    while i < size {
+        // Gather: pair[local_idx] = (value = input[i + local_idx], idx = local_idx).
+        for local_idx in 0..inner {
+            let pair_val_addr = scratch + (local_idx as u64) * (PAIR_BYTES as u64);
+            // value slot
+            f.instruction(&Ins::I32Const(0));
+            emit_view_element_load(input_view, i + local_idx, ctx, f)?;
+            f.instruction(&Ins::F64Store(memarg(pair_val_addr)));
+            // idx slot (+8)
+            f.instruction(&Ins::I32Const(0));
+            f.instruction(&f64_const(local_idx as f64));
+            f.instruction(&Ins::F64Store(memarg(pair_val_addr + 8)));
+        }
+
+        // stable_sort(scratch, inner, ascending)
+        f.instruction(&Ins::I32Const(ctx.vector_scratch_base as i32));
+        f.instruction(&Ins::I32Const(inner as i32));
+        f.instruction(&Ins::LocalGet(ascending));
+        f.instruction(&Ins::Call(ctx.helpers.stable_sort));
+
+        // Scatter: temp[temp_off + i + rank] = pair[rank].idx.
+        for rank in 0..inner {
+            let pair_idx_addr = scratch + (rank as u64) * (PAIR_BYTES as u64) + 8;
+            let temp_addr = temp_element_byte_addr(ctx, write_temp_id, (i + rank) as u32)?;
+            f.instruction(&Ins::I32Const(0));
+            f.instruction(&Ins::I32Const(0));
+            f.instruction(&Ins::F64Load(memarg(pair_idx_addr)));
+            f.instruction(&Ins::F64Store(memarg(temp_addr)));
+        }
+
+        i += inner;
+    }
+    Ok(())
+}
+
+// ── Rank (vm.rs:2540-2584) ───────────────────────────────────────────────────
+
+/// Lower `Rank { write_temp_id }`, mirroring `vm.rs:2540-2584`. `input_view =
+/// top`; the `direction` operand is popped. Over the WHOLE view, the `(value,
+/// orig_idx 0..size)` pairs are staged into the vector scratch region and sorted
+/// (ascending if `direction == 1`, else descending) by [`emit_stable_sort`], then
+/// `temp[orig_idx] = rank_0based + 1` (1-based, indexed by original position) is
+/// written. An invalid input view fills the temp with NaN.
+pub(crate) fn emit_rank(
+    input_view: &ViewDesc,
+    write_temp_id: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let ascending = ctx.vector_i32_locals[0];
+    pop_direction_to_ascending(ascending, ctx, f);
+
+    emit_with_validity_gate(&[input_view], write_temp_id, ctx, f, |ctx, f| {
+        emit_rank_body(input_view, write_temp_id, ascending, ctx, f)
+    })
+}
+
+/// The valid-input body of [`emit_rank`].
+fn emit_rank_body(
+    input_view: &ViewDesc,
+    write_temp_id: u8,
+    ascending: u32,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let size = input_view.size();
+    if size == 0 {
+        return Ok(());
+    }
+    let scratch = u64::from(ctx.vector_scratch_base);
+    let temp_off = *ctx
+        .ctx
+        .temp_offsets
+        .get(write_temp_id as usize)
+        .ok_or_else(|| {
+            WasmGenError::Unsupported(format!("wasmgen: temp id {write_temp_id} out of range"))
+        })?;
+
+    // Gather: pair[orig_idx] = (value = input[orig_idx], idx = orig_idx).
+    for orig_idx in 0..size {
+        let pair_val_addr = scratch + (orig_idx as u64) * (PAIR_BYTES as u64);
+        f.instruction(&Ins::I32Const(0));
+        emit_view_element_load(input_view, orig_idx, ctx, f)?;
+        f.instruction(&Ins::F64Store(memarg(pair_val_addr)));
+        f.instruction(&Ins::I32Const(0));
+        f.instruction(&f64_const(orig_idx as f64));
+        f.instruction(&Ins::F64Store(memarg(pair_val_addr + 8)));
+    }
+
+    // stable_sort(scratch, size, ascending)
+    f.instruction(&Ins::I32Const(ctx.vector_scratch_base as i32));
+    f.instruction(&Ins::I32Const(size as i32));
+    f.instruction(&Ins::LocalGet(ascending));
+    f.instruction(&Ins::Call(ctx.helpers.stable_sort));
+
+    // Scatter by ORIGINAL position: for each rank, orig_idx = pair[rank].idx
+    // (runtime); temp[temp_off + orig_idx] = rank + 1. The destination slot is
+    // runtime-indexed (it depends on the sorted permutation), so the dynamic
+    // address part is `orig_idx * 8` and the constant `temp_storage_base +
+    // temp_off*8` rides in the `memarg.offset`. f64.store wants
+    // [addr_i32, value_f64], so push the address first, then `rank + 1`.
+    let temp_base_byte =
+        u64::from(ctx.temp_storage_base) + (temp_off as u64) * u64::from(SLOT_SIZE);
+    for rank in 0..size {
+        let pair_idx_addr = scratch + (rank as u64) * (PAIR_BYTES as u64) + 8;
+        // dynamic addr = orig_idx * 8, where orig_idx = trunc(pair[rank].idx).
+        f.instruction(&Ins::I32Const(0));
+        f.instruction(&Ins::F64Load(memarg(pair_idx_addr)));
+        f.instruction(&Ins::I32TruncSatF64S);
+        f.instruction(&Ins::I32Const(SLOT_SIZE as i32));
+        f.instruction(&Ins::I32Mul);
+        // value = rank + 1 (1-based)
+        f.instruction(&f64_const((rank + 1) as f64));
+        f.instruction(&Ins::F64Store(memarg(temp_base_byte)));
+    }
+    Ok(())
+}
+
+/// Pop the `direction` operand off the wasm stack (the VM does `.round() as
+/// i32`), compute `ascending = (round(direction) == 1) as i32`, and store it in
+/// `ascending_local`. Shared by `VectorSortOrder`/`Rank`.
+fn pop_direction_to_ascending(ascending_local: u32, ctx: &EmitCtx, f: &mut Function) {
+    // The round's two f64 temps (`scratch_local` + `apply_locals[0]`) are both
+    // free here -- nothing survives across this direction pop.
+    emit_round_half_away(f, ctx.scratch_local, ctx.apply_locals[0]);
+    f.instruction(&Ins::I32TruncSatF64S);
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Eq);
+    f.instruction(&Ins::LocalSet(ascending_local));
+}
+
+// ── LookupArray (vm.rs:2586-2629) ────────────────────────────────────────────
+
+/// Lower `LookupArray { base_gf, table_count, mode, write_temp_id }`, mirroring
+/// `vm.rs:2586-2629`. The shared `index` is popped; `input_view = top`. For each
+/// element `i`, `elem_off = flat_offset(indices)` (compile-time); if `elem_off >=
+/// table_count` the result is NaN, else the GF directory entry at `base_gf +
+/// elem_off` is read and the mode's Phase-3 helper (`lookup_interp`/`forward`/
+/// `backward`) is `call`ed at `index`. Written to `temp[temp_off + i]` (sequential
+/// index). An invalid input view fills the temp with NaN.
+///
+/// Each element's `elem_off` is compile-time, so the bound check, the GF
+/// directory entry address, and the mode dispatch all resolve at compile time;
+/// only the `index` and the `lookup_*` evaluation are runtime. Unrolled over the
+/// view size (the caller charges the unroll budget).
+pub(crate) fn emit_lookup_array(
+    input_view: &ViewDesc,
+    base_gf: GraphicalFunctionId,
+    table_count: u16,
+    mode: LookupMode,
+    write_temp_id: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    // Pop `index` to a scratch f64 (read once per element). Done before the gate
+    // so both gate arms are operand-balanced. A dynamically-subscripted input is
+    // handled by the gate (invalid -> fill_temp_nan) and `emit_view_element_load`.
+    let index = ctx.scratch_local;
+    f.instruction(&Ins::LocalSet(index));
+
+    emit_with_validity_gate(&[input_view], write_temp_id, ctx, f, |ctx, f| {
+        emit_lookup_array_body(
+            input_view,
+            base_gf,
+            table_count,
+            mode,
+            write_temp_id,
+            index,
+            ctx,
+            f,
+        )
+    })
+}
+
+/// The valid-input body of [`emit_lookup_array`].
+#[allow(clippy::too_many_arguments)]
+fn emit_lookup_array_body(
+    input_view: &ViewDesc,
+    base_gf: GraphicalFunctionId,
+    table_count: u16,
+    mode: LookupMode,
+    write_temp_id: u8,
+    index: u32,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let helper_idx = match mode {
+        LookupMode::Interpolate => ctx.helpers.lookup_interp,
+        LookupMode::Forward => ctx.helpers.lookup_forward,
+        LookupMode::Backward => ctx.helpers.lookup_backward,
+    };
+    let size = input_view.size();
+    for i in 0..size {
+        // elem_off (compile-time) = flat offset of element i over the view.
+        let elem_off = input_view.flat_element_offset(i);
+        let temp_addr = temp_element_byte_addr(ctx, write_temp_id, i as u32)?;
+        f.instruction(&Ins::I32Const(0)); // temp store dynamic addr (const base)
+
+        if elem_off >= table_count as usize {
+            // Out-of-range element offset -> NaN (matching the scalar Lookup
+            // bound; vm.rs:2615).
+            f.instruction(&f64_const(f64::NAN));
+        } else {
+            // table_idx = base_gf + elem_off (compile-time). Read (data_off,
+            // count) from the GF directory at gf_directory_base + table_idx*8,
+            // then call the mode's helper at `index`.
+            let dir_addr = u64::from(ctx.gf_directory_base)
+                + (base_gf as u64 + elem_off as u64) * (GF_DIRECTORY_ENTRY_BYTES as u64);
+            f.instruction(&Ins::I32Const(0));
+            f.instruction(&Ins::I32Load(i32_memarg(dir_addr))); // data_off
+            f.instruction(&Ins::I32Const(0));
+            f.instruction(&Ins::I32Load(i32_memarg(dir_addr + 4))); // count
+            f.instruction(&Ins::LocalGet(index));
+            f.instruction(&Ins::Call(helper_idx));
+        }
+        f.instruction(&Ins::F64Store(memarg(temp_addr)));
+    }
+    Ok(())
+}
+
+// ── validity gate ────────────────────────────────────────────────────────────
+
+/// Emit `body` for the temp-writing vector ops, gated on the VM's "`!is_valid`
+/// -> fill_temp_nan" short-circuit. When no input view carries a runtime validity
+/// flag (the common static/temp/full-var case), `body` is emitted directly with
+/// no runtime check. Otherwise: `if all_valid { body } else { fill_temp_nan }`.
+fn emit_with_validity_gate(
+    views: &[&ViewDesc],
+    write_temp_id: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+    body: impl FnOnce(&EmitCtx, &mut Function) -> Result<(), WasmGenError>,
+) -> Result<(), WasmGenError> {
+    let any_dynamic = views.iter().any(|v| v.valid_local.is_some());
+    if !any_dynamic {
+        return body(ctx, f);
+    }
+    push_all_valid(views, f);
+    f.instruction(&Ins::If(BlockType::Empty));
+    body(ctx, f)?;
+    f.instruction(&Ins::Else);
+    emit_fill_temp_nan(ctx, write_temp_id, f)?;
+    f.instruction(&Ins::End);
+    Ok(())
+}
diff --git a/src/simlin-engine/src/wasmgen/views.rs b/src/simlin-engine/src/wasmgen/views.rs
new file mode 100644
index 000000000..8aef2b9cd
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/views.rs
@@ -0,0 +1,811 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+// pattern: Functional Core
+// Pure compile-time model of the VM's runtime `view_stack`. No I/O; the only
+// state is the `Vec<ViewDesc>` the emitter threads through `emit_bytecode`.
+
+//! Compile-time view descriptors -- the wasm backend's analogue of the VM's
+//! runtime `view_stack` (`crate::vm`).
+//!
+//! The VM resolves every array access through a runtime stack of [`RuntimeView`]s
+//! built and transformed by the `Push*View` / `View*` opcodes. Because every
+//! static view's geometry (base offset, dims, strides, offset, sparsity,
+//! is_temp) is known at compile time, the wasm emitter maintains a *compile-time*
+//! stack of [`ViewDesc`]s instead, mirroring the static parts of `RuntimeView`
+//! field-for-field and reproducing the `RuntimeView::apply_*` transforms in
+//! `apply_*` here. Element addressing then routes through a single source of
+//! truth -- [`ViewDesc::element_addr`] -- so Tasks 2-4 and Phase 6 all address
+//! elements identically to the VM's `flat_offset` / `offset_for_iter_index`.
+//!
+//! [`RuntimeView`]: crate::bytecode::RuntimeView
+
+use crate::bytecode::{ByteCodeContext, StaticArrayView};
+
+/// Where a view's base address lives, mirroring how the VM resolves the base of
+/// a `RuntimeView` element read (`reduce_view` in `vm.rs`).
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub(crate) enum ViewBase {
+    /// `curr[base_off + ..]` at an *absolute* slot base. This is what
+    /// `PushStaticView` produces: `StaticArrayView::to_runtime_view` copies
+    /// `base_off` verbatim (no `module_off` added), so the byte address is
+    /// `curr_base + (base_off + flat) * 8` with no runtime addend.
+    CurrAbsolute,
+    /// `curr[module_off + base_off + ..]`. `PushVarView` / `PushVarViewDirect`
+    /// fold the runtime `module_off` into the base (`vm.rs:1749` / `1784`), so a
+    /// read adds `module_off * 8` to the constant address. In the current
+    /// single-root scope `module_off == 0`, but the distinction is preserved so
+    /// Phase 7 can thread a real `module_off` without changing addressing.
+    CurrModuleRelative,
+    /// `temp_storage[temp_offsets[base_off] + ..]` (`is_temp`): the base is a
+    /// temp id, resolved against the `temp_storage` region via `temp_offsets`.
+    Temp,
+}
+
+/// A single sparse-dimension mapping, mirroring
+/// [`crate::bytecode::RuntimeSparseMapping`]: the view's index along
+/// `dim_index` is remapped through `parent_offsets` before being multiplied by
+/// the stride (`RuntimeView::flat_offset`).
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub(crate) struct SparseDim {
+    pub dim_index: usize,
+    pub parent_offsets: Vec<u16>,
+}
+
+/// Compile-time mirror of the static parts of [`crate::bytecode::RuntimeView`].
+///
+/// Holds exactly the geometry needed to compute an element's byte address:
+/// `base` (where the storage lives), `dims`/`strides`/`offset`/`sparse` (the
+/// flat-offset arithmetic), and `dim_ids` (broadcast matching, used by Phase 5
+/// Task 3's iteration). `runtime_off_local` / `valid_local` are `None` for every
+/// static view; Task 4's dynamic subscripts set them to wasm locals carrying a
+/// runtime offset addend and a validity flag.
+#[derive(Clone, PartialEq, Debug)]
+pub(crate) struct ViewDesc {
+    /// Base slot offset (in `curr`) or temp id (when `base == Temp`).
+    pub base_off: u32,
+    pub base: ViewBase,
+    /// Dimension sizes (`size() == product`).
+    pub dims: Vec<u16>,
+    /// Per-dimension strides (signed: a transposed view has non-row-major,
+    /// still-positive strides; the sign supports future reversed views).
+    pub strides: Vec<i32>,
+    /// Starting flat offset within the base array (folds in collapsed subscripts
+    /// and range starts).
+    pub offset: u32,
+    /// Sparse dimension mappings (empty unless a star-range was applied).
+    pub sparse: Vec<SparseDim>,
+    /// Dimension IDs, for broadcast matching during iteration (Task 3).
+    pub dim_ids: Vec<u16>,
+    /// wasm i32 local holding a runtime offset addend (dynamic subscript, Task
+    /// 4). `None` for static views.
+    pub runtime_off_local: Option<u32>,
+    /// wasm i32 local that is 0 when the view is invalid (out-of-bounds dynamic
+    /// subscript, Task 4). `None` for static views (always valid).
+    pub valid_local: Option<u32>,
+}
+
+impl ViewDesc {
+    /// Build a `ViewDesc` from a baked [`StaticArrayView`] (`PushStaticView`).
+    ///
+    /// `StaticArrayView::to_runtime_view` copies `base_off` verbatim with no
+    /// `module_off`, so the base is [`ViewBase::CurrAbsolute`] for a variable
+    /// view and [`ViewBase::Temp`] when `is_temp`.
+    pub fn from_static(view: &StaticArrayView) -> Self {
+        ViewDesc {
+            base_off: view.base_off,
+            base: if view.is_temp {
+                ViewBase::Temp
+            } else {
+                ViewBase::CurrAbsolute
+            },
+            dims: view.dims.to_vec(),
+            strides: view.strides.to_vec(),
+            offset: view.offset,
+            sparse: view
+                .sparse
+                .iter()
+                .map(|s| SparseDim {
+                    dim_index: s.dim_index as usize,
+                    parent_offsets: s.parent_offsets.to_vec(),
+                })
+                .collect(),
+            dim_ids: view.dim_ids.to_vec(),
+            runtime_off_local: None,
+            valid_local: None,
+        }
+    }
+
+    /// Build a contiguous view over a full variable/temp array from a dim-list
+    /// (the `(n_dims, sizes)` for `PushVarViewDirect`, or dim sizes resolved
+    /// from `ctx.dimensions` for `PushVarView`/`PushTempView`). Strides are
+    /// row-major, built right-to-left, exactly as `RuntimeView::for_var`.
+    pub fn contiguous(base_off: u32, base: ViewBase, dims: Vec<u16>, dim_ids: Vec<u16>) -> Self {
+        let mut strides = Vec::with_capacity(dims.len());
+        let mut stride = 1i32;
+        for &d in dims.iter().rev() {
+            strides.push(stride);
+            stride *= d as i32;
+        }
+        strides.reverse();
+        ViewDesc {
+            base_off,
+            base,
+            dims,
+            strides,
+            offset: 0,
+            sparse: Vec::new(),
+            dim_ids,
+            runtime_off_local: None,
+            valid_local: None,
+        }
+    }
+
+    /// `size() == product of dims` (`RuntimeView::size`). A scalar view (no
+    /// dims) has size 1. The array reducer (Task 2) bounds its unrolled fold by
+    /// this.
+    pub fn size(&self) -> usize {
+        self.dims.iter().map(|&d| d as usize).product()
+    }
+
+    /// Whether the view is contiguous: offset 0, no sparse mappings, and
+    /// row-major strides (`RuntimeView::is_contiguous`).
+    pub fn is_contiguous(&self) -> bool {
+        if self.offset != 0 || !self.sparse.is_empty() {
+            return false;
+        }
+        let mut expected = 1i32;
+        for i in (0..self.dims.len()).rev() {
+            if self.strides[i] != expected {
+                return false;
+            }
+            expected *= self.dims[i] as i32;
+        }
+        true
+    }
+
+    /// Apply a single-element subscript at `dim_idx` (0-based index), dropping
+    /// that dimension. Exactly mirrors `RuntimeView::apply_single_subscript`:
+    /// a sparse dim's index is first remapped through `parent_offsets` (and the
+    /// mapping removed), the resolved index is folded into `offset`, the
+    /// dimension is removed, and later sparse mappings shift down by one.
+    pub fn apply_single_subscript(&mut self, dim_idx: usize, index: u16) {
+        let actual_index =
+            if let Some(pos) = self.sparse.iter().position(|s| s.dim_index == dim_idx) {
+                let parent_idx = self.sparse[pos].parent_offsets[index as usize];
+                self.sparse.remove(pos);
+                parent_idx
+            } else {
+                index
+            };
+
+        self.offset += actual_index as u32 * self.strides[dim_idx] as u32;
+
+        self.dims.remove(dim_idx);
+        self.strides.remove(dim_idx);
+        self.dim_ids.remove(dim_idx);
+
+        for s in &mut self.sparse {
+            if s.dim_index > dim_idx {
+                s.dim_index -= 1;
+            }
+        }
+    }
+
+    /// Remove `dim_idx` for a *dynamic* single subscript (Task 4): drop the
+    /// dimension/stride/dim_id and return that dimension's stride, leaving the
+    /// (runtime) offset contribution to the caller's `runtime_off_local` rather
+    /// than the compile-time `offset`. This is the runtime-index analogue of
+    /// `apply_single_subscript`: the *shape* change (which dim is collapsed) is
+    /// compile-time, only the offset addend is runtime.
+    ///
+    /// Returns `None` if the dim is out of range or sparse. A sparse dynamic
+    /// subscript would need a runtime `parent_offsets` table lookup, but the
+    /// dynamic-subscript base (`PushVarViewDirect`) is always dense, so this
+    /// never arises in practice; rejecting it keeps a wrong module from being
+    /// emitted if it ever did.
+    pub fn apply_single_subscript_dynamic(&mut self, dim_idx: usize) -> Option<i32> {
+        if dim_idx >= self.dims.len() {
+            return None;
+        }
+        if self.sparse.iter().any(|s| s.dim_index == dim_idx) {
+            return None;
+        }
+        let stride = self.strides[dim_idx];
+        self.dims.remove(dim_idx);
+        self.strides.remove(dim_idx);
+        self.dim_ids.remove(dim_idx);
+        for s in &mut self.sparse {
+            if s.dim_index > dim_idx {
+                s.dim_index -= 1;
+            }
+        }
+        Some(stride)
+    }
+
+    /// The stride of `dim_idx` (for a dynamic subscript's runtime offset
+    /// computation), or `None` if out of range.
+    pub fn stride_at(&self, dim_idx: usize) -> Option<i32> {
+        self.strides.get(dim_idx).copied()
+    }
+
+    /// The size of `dim_idx` (the bound a dynamic subscript range-checks
+    /// against), or `None` if out of range.
+    pub fn dim_at(&self, dim_idx: usize) -> Option<u16> {
+        self.dims.get(dim_idx).copied()
+    }
+
+    /// Apply a `[start:end)` range (0-based) to `dim_idx`
+    /// (`RuntimeView::apply_range`): fold the start into `offset` and shrink the
+    /// dimension to `end - start`.
+    pub fn apply_range(&mut self, dim_idx: usize, start: u16, end: u16) {
+        self.offset += start as u32 * self.strides[dim_idx] as u32;
+        self.dims[dim_idx] = end - start;
+    }
+
+    /// Apply a star-range (sparse) at `dim_idx`
+    /// (`RuntimeView::apply_sparse_with_dim_id`): the dimension's size becomes
+    /// the number of parent offsets, a sparse mapping is recorded, and the
+    /// dim id is relabeled to the subdimension for broadcast matching.
+    pub fn apply_sparse(&mut self, dim_idx: usize, parent_offsets: Vec<u16>, new_dim_id: u16) {
+        self.dims[dim_idx] = parent_offsets.len() as u16;
+        self.sparse.push(SparseDim {
+            dim_index: dim_idx,
+            parent_offsets,
+        });
+        self.dim_ids[dim_idx] = new_dim_id;
+    }
+
+    /// Transpose the view (`RuntimeView::transpose`): reverse dims/strides/
+    /// dim_ids and renumber the sparse `dim_index`es to `n-1-dim_index`.
+    pub fn transpose(&mut self) {
+        self.dims.reverse();
+        self.strides.reverse();
+        self.dim_ids.reverse();
+        let n = self.dims.len();
+        for s in &mut self.sparse {
+            s.dim_index = n - 1 - s.dim_index;
+        }
+    }
+
+    /// The flat element offset (within the base array, in slots) for a flat
+    /// iteration index `iter_idx in 0..size()`. Mirrors
+    /// `RuntimeView::offset_for_iter_index` + `flat_offset`: contiguous views
+    /// short-circuit to `offset + iter_idx`; otherwise the flat index is
+    /// decomposed into row-major multi-dim indices and each (sparse-remapped)
+    /// index multiplied by its stride.
+    pub fn flat_element_offset(&self, iter_idx: usize) -> usize {
+        if self.dims.is_empty() {
+            return self.offset as usize;
+        }
+        if self.is_contiguous() {
+            return self.offset as usize + iter_idx;
+        }
+
+        // Decompose iter_idx into per-dimension indices (last dim varies fastest).
+        let n = self.dims.len();
+        let mut indices = vec![0u16; n];
+        let mut remaining = iter_idx;
+        for d in (0..n).rev() {
+            let dim = self.dims[d] as usize;
+            indices[d] = (remaining % dim) as u16;
+            remaining /= dim;
+        }
+
+        let mut flat = self.offset as usize;
+        for (i, &idx) in indices.iter().enumerate() {
+            let actual = if let Some(s) = self.sparse.iter().find(|s| s.dim_index == i) {
+                s.parent_offsets[idx as usize] as usize
+            } else {
+                idx as usize
+            };
+            flat += actual * self.strides[i] as usize;
+        }
+        flat
+    }
+
+    /// The flat element offset (in slots) for an explicit multi-dimensional
+    /// index, mirroring `RuntimeView::flat_offset`: `offset + Σ idx_k *
+    /// strides[k]`, with a sparse dimension's index first remapped through its
+    /// `parent_offsets`. The broadcast paths below build the multi-dim index
+    /// themselves (rather than from a flat iteration index), so they route
+    /// through this rather than [`flat_element_offset`](Self::flat_element_offset).
+    pub fn flat_offset_for_indices(&self, indices: &[u16]) -> usize {
+        let mut flat = self.offset as usize;
+        for (i, &idx) in indices.iter().enumerate() {
+            let actual = if let Some(s) = self.sparse.iter().find(|s| s.dim_index == i) {
+                s.parent_offsets[idx as usize] as usize
+            } else {
+                idx as usize
+            };
+            flat += actual * self.strides[i] as usize;
+        }
+        flat
+    }
+
+    /// Decompose a flat iteration index into per-dimension indices in row-major
+    /// order (last dim varies fastest), mirroring the VM's iteration-index
+    /// decomposition in `LoadIterViewTop` / `reduce_view` / `increment_indices`.
+    ///
+    /// Shared with `vector.rs` (VectorElmMap's sliced-source projection walks the
+    /// same row-major order), so it is `pub(crate)` rather than private.
+    pub(crate) fn decompose_iter_index(dims: &[u16], iter_idx: usize) -> Vec<u16> {
+        let n = dims.len();
+        let mut indices = vec![0u16; n];
+        let mut remaining = iter_idx;
+        for d in (0..n).rev() {
+            let dim = dims[d] as usize;
+            indices[d] = (remaining % dim) as u16;
+            remaining /= dim;
+        }
+        indices
+    }
+
+    /// The flat element offset (in slots) for reading `self` as the *source* of
+    /// an iteration whose output geometry is `iter` at flat index `current`,
+    /// reproducing the VM's `LoadIterViewTop` / `LoadIterViewAt` broadcast
+    /// (`vm.rs:1946-2182`). Returns `None` when the VM would push NaN: a smaller
+    /// source than the iteration, or a dimension that does not match.
+    ///
+    /// Fast path (source dims/dim_ids equal the iteration's): the simple
+    /// `offset_for_iter_index(current)` read, bounds-checked against the source
+    /// size. Otherwise the broadcast path decomposes `current` into the
+    /// iteration's multi-dim indices, matches dimensions through
+    /// [`crate::dimensions::match_dimensions_two_pass`] (exact dim-id match, then
+    /// the indexed size-fallback), and rebuilds the source indices (bounds-checked
+    /// per dimension). `is_indexed` for each dim comes from `ctx.dimensions`,
+    /// exactly as the VM resolves it.
+    pub fn iter_broadcast_offset(
+        &self,
+        iter: &ViewDesc,
+        current: usize,
+        ctx: &ByteCodeContext,
+    ) -> Option<usize> {
+        // Fast path: dims and dim_ids match exactly -> direct iteration-index read
+        // (with the VM's "source smaller than iteration -> NaN" bounds check).
+        if self.dims == iter.dims && self.dim_ids == iter.dim_ids {
+            if current >= self.size() {
+                return None;
+            }
+            return Some(self.flat_element_offset(current));
+        }
+
+        // Broadcast path: decompose `current` into the iteration's indices, then
+        // map each source dimension to an iteration dimension.
+        let iter_indices = Self::decompose_iter_index(&iter.dims, current);
+
+        let dim_indexed = |dim_ids: &[u16]| -> Vec<bool> {
+            dim_ids
+                .iter()
+                .map(|&dim_id| {
+                    ctx.dimensions
+                        .get(dim_id as usize)
+                        .is_some_and(|d| d.is_indexed)
+                })
+                .collect()
+        };
+        let source_is_indexed = dim_indexed(&self.dim_ids);
+        let iter_is_indexed = dim_indexed(&iter.dim_ids);
+
+        let source_to_iter = crate::dimensions::match_dimensions_two_pass(
+            &self.dim_ids,
+            &self.dims,
+            &source_is_indexed,
+            &iter.dim_ids,
+            &iter.dims,
+            &iter_is_indexed,
+        );
+
+        let mut source_indices: Vec<u16> = Vec::with_capacity(self.dims.len());
+        for (src_dim_pos, mapped_iter_pos) in source_to_iter.iter().enumerate() {
+            let iter_pos = (*mapped_iter_pos)?;
+            let idx = iter_indices[iter_pos];
+            if idx >= self.dims[src_dim_pos] {
+                return None;
+            }
+            source_indices.push(idx);
+        }
+        Some(self.flat_offset_for_indices(&source_indices))
+    }
+
+    /// The byte address of view element `iter_idx`, decomposed into the constant
+    /// part (which rides in a `memarg.offset`) and whether a runtime `module_off`
+    /// addend is still required. This is the single source of truth for element
+    /// addressing -- the unrolled reducer (Task 2), the iteration loop (Task 3),
+    /// and Phase 6 all route through it.
+    ///
+    /// - `CurrAbsolute`: `const = curr_base + (base_off + flat) * 8`,
+    ///   `module_relative = false` (static views bake `module_off` in already).
+    /// - `Temp`: `const = temp_storage_base + (temp_offsets[base_off] + flat)*8`,
+    ///   `module_relative = false`.
+    /// - `CurrModuleRelative`: `const = curr_base + (base_off + flat) * 8`,
+    ///   `module_relative = true` (the caller adds `module_off * 8`). The VM
+    ///   folds `module_off` into the base at `PushVarView` time (`vm.rs:1749`);
+    ///   in the single-root scope `module_off == 0`, so the read is the same as
+    ///   `CurrAbsolute` today, but the flag keeps Phase 7 correct.
+    ///
+    /// A dynamically-subscripted view (`runtime_off_local` set, Task 4) carries
+    /// the runtime addend + validity flag in the returned [`ElementAddr`]; static
+    /// views leave both `None`, so the address is fully constant.
+    pub fn element_addr(
+        &self,
+        iter_idx: usize,
+        curr_base: u32,
+        temp_storage_base: u32,
+        ctx: &ByteCodeContext,
+    ) -> Option<ElementAddr> {
+        let flat = self.flat_element_offset(iter_idx);
+        self.element_addr_for_flat(flat, curr_base, temp_storage_base, ctx)
+    }
+
+    /// Like [`element_addr`](Self::element_addr) but for an *already-computed*
+    /// flat slot offset (the broadcast paths build the flat offset themselves via
+    /// [`flat_offset_for_indices`](Self::flat_offset_for_indices), rather than
+    /// from an iteration index). Static-view behaviour is byte-identical to
+    /// `element_addr` for the same flat offset (both `runtime_off_local` /
+    /// `valid_local` are `None`).
+    pub fn element_addr_for_flat(
+        &self,
+        flat: usize,
+        curr_base: u32,
+        temp_storage_base: u32,
+        ctx: &ByteCodeContext,
+    ) -> Option<ElementAddr> {
+        let flat = flat as u64;
+        let (const_byte_offset, module_relative) = match self.base {
+            ViewBase::CurrAbsolute => (
+                u64::from(curr_base) + (u64::from(self.base_off) + flat) * 8,
+                false,
+            ),
+            ViewBase::CurrModuleRelative => (
+                u64::from(curr_base) + (u64::from(self.base_off) + flat) * 8,
+                true,
+            ),
+            ViewBase::Temp => {
+                let temp_off = *ctx.temp_offsets.get(self.base_off as usize)? as u64;
+                (u64::from(temp_storage_base) + (temp_off + flat) * 8, false)
+            }
+        };
+        Some(ElementAddr {
+            const_byte_offset,
+            module_relative,
+            runtime_off_local: self.runtime_off_local,
+            valid_local: self.valid_local,
+        })
+    }
+}
+
+/// The byte address of a view element, split into the compile-time-constant part
+/// (a `memarg.offset`) and the runtime addends a dynamic subscript (Task 4)
+/// requires. Returned by [`ViewDesc::element_addr`].
+///
+/// `module_relative` adds `module_off * 8` (var views; 0 in the single-root
+/// scope). `runtime_off_local` (when `Some`) adds that i32 local's slot offset
+/// times 8 (a dynamic subscript's accumulated `(index-1)*stride`).
+/// `valid_local` (when `Some`) gates the load: 0 means out of bounds, so the
+/// read yields NaN rather than touching memory. Both are `None` for a static
+/// view, leaving the address fully constant.
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub(crate) struct ElementAddr {
+    pub const_byte_offset: u64,
+    pub module_relative: bool,
+    pub runtime_off_local: Option<u32>,
+    pub valid_local: Option<u32>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::bytecode::{RuntimeSparseMapping, RuntimeView};
+    use smallvec::SmallVec;
+
+    /// Build the VM `RuntimeView` equivalent of a `ViewDesc` so the two
+    /// addressing implementations can be cross-checked. Validity/runtime locals
+    /// are not part of the geometry, so a static-shaped `ViewDesc` maps directly.
+    fn to_runtime_view(d: &ViewDesc) -> RuntimeView {
+        RuntimeView {
+            base_off: d.base_off,
+            is_temp: matches!(d.base, ViewBase::Temp),
+            dims: SmallVec::from_slice(&d.dims),
+            strides: SmallVec::from_slice(&d.strides),
+            offset: d.offset,
+            sparse: d
+                .sparse
+                .iter()
+                .map(|s| RuntimeSparseMapping {
+                    dim_index: s.dim_index as u8,
+                    parent_offsets: SmallVec::from_slice(&s.parent_offsets),
+                })
+                .collect(),
+            dim_ids: SmallVec::from_slice(&d.dim_ids),
+            is_valid: true,
+        }
+    }
+
+    /// Assert `ViewDesc::flat_element_offset` agrees with the VM's
+    /// `RuntimeView::offset_for_iter_index` for every element of the view -- the
+    /// addressing oracle Task 1 must match.
+    fn assert_flat_matches_vm(d: &ViewDesc) {
+        let rv = to_runtime_view(d);
+        assert_eq!(d.size(), rv.size(), "size mismatch");
+        assert_eq!(d.is_contiguous(), rv.is_contiguous(), "contiguity mismatch");
+        for i in 0..d.size() {
+            assert_eq!(
+                d.flat_element_offset(i),
+                rv.offset_for_iter_index(i),
+                "flat offset mismatch at element {i}"
+            );
+        }
+    }
+
+    fn dense(base_off: u32, dims: &[u16]) -> ViewDesc {
+        ViewDesc::contiguous(
+            base_off,
+            ViewBase::CurrAbsolute,
+            dims.to_vec(),
+            vec![0u16; dims.len()],
+        )
+    }
+
+    #[test]
+    fn contiguous_1d_addresses_match_vm() {
+        assert_flat_matches_vm(&dense(0, &[5]));
+        assert_flat_matches_vm(&dense(7, &[5]));
+    }
+
+    #[test]
+    fn contiguous_2d_addresses_match_vm() {
+        assert_flat_matches_vm(&dense(0, &[2, 3]));
+        assert_flat_matches_vm(&dense(0, &[3, 4]));
+    }
+
+    #[test]
+    fn subscript_const_drops_dim_like_vm() {
+        // 2x3 matrix; subscript dim 0 to index 1 -> a 1-D row at offset 3.
+        let mut d = dense(0, &[2, 3]);
+        let mut rv = to_runtime_view(&d);
+        d.apply_single_subscript(0, 1);
+        rv.apply_single_subscript(0, 1);
+        assert_eq!(d.offset, rv.offset);
+        assert_eq!(d.dims.as_slice(), rv.dims.as_slice());
+        assert_eq!(d.strides.as_slice(), rv.strides.as_slice());
+        assert_flat_matches_vm(&d);
+    }
+
+    #[test]
+    fn range_matches_vm() {
+        // [1:4) of a 5-element dim: offset 1, dim 3.
+        let mut d = dense(0, &[5]);
+        d.apply_range(0, 1, 4);
+        assert_eq!(d.offset, 1);
+        assert_eq!(d.dims, vec![3]);
+        assert_flat_matches_vm(&d);
+    }
+
+    #[test]
+    fn transpose_matches_vm() {
+        let mut d = dense(0, &[2, 3]);
+        let mut rv = to_runtime_view(&d);
+        d.transpose();
+        rv.transpose();
+        assert_eq!(d.dims.as_slice(), rv.dims.as_slice());
+        assert_eq!(d.strides.as_slice(), rv.strides.as_slice());
+        assert!(
+            !d.is_contiguous(),
+            "a transposed 2x3 view is non-contiguous"
+        );
+        assert_flat_matches_vm(&d);
+    }
+
+    #[test]
+    fn star_range_sparse_matches_vm() {
+        // A 1-D dim of 4, star-ranged to parent offsets [1, 3].
+        let mut d = dense(0, &[4]);
+        let mut rv = to_runtime_view(&d);
+        d.apply_sparse(0, vec![1, 3], 1);
+        rv.apply_sparse_with_dim_id(0, SmallVec::from_slice(&[1, 3]), 1);
+        assert_eq!(d.dims, vec![2]);
+        assert_flat_matches_vm(&d);
+        // The two selected elements map to parent flat offsets 1 and 3.
+        assert_eq!(d.flat_element_offset(0), 1);
+        assert_eq!(d.flat_element_offset(1), 3);
+    }
+
+    #[test]
+    fn subscript_then_renumbers_sparse_like_vm() {
+        // A 2-D view [3,4] with a sparse mapping on dim 1; subscript dim 0 must
+        // shift the sparse dim_index down to 0, matching the VM.
+        let mut d = dense(0, &[3, 4]);
+        d.apply_sparse(1, vec![0, 2], 5); // sparse on dim 1 -> dim 1 size 2
+        let mut rv = to_runtime_view(&d);
+        d.apply_single_subscript(0, 1);
+        rv.apply_single_subscript(0, 1);
+        assert_eq!(d.sparse.len(), 1);
+        assert_eq!(d.sparse[0].dim_index, rv.sparse[0].dim_index as usize);
+        assert_flat_matches_vm(&d);
+    }
+
+    #[test]
+    fn element_addr_curr_absolute_const() {
+        let d = dense(2, &[3]);
+        let ctx = ByteCodeContext::default();
+        // element 1 at curr_base=0: (base_off 2 + flat 1) * 8 = 24.
+        let a = d.element_addr(1, 0, 0, &ctx).unwrap();
+        assert_eq!(a.const_byte_offset, 24);
+        assert!(!a.module_relative);
+        // A static view carries no runtime addend or validity gate.
+        assert_eq!(a.runtime_off_local, None);
+        assert_eq!(a.valid_local, None);
+    }
+
+    #[test]
+    fn element_addr_curr_module_relative_flag() {
+        let d = ViewDesc::contiguous(2, ViewBase::CurrModuleRelative, vec![3], vec![0]);
+        let ctx = ByteCodeContext::default();
+        let a = d.element_addr(1, 0, 0, &ctx).unwrap();
+        assert_eq!(a.const_byte_offset, 24);
+        assert!(
+            a.module_relative,
+            "var views carry a runtime module_off addend"
+        );
+    }
+
+    #[test]
+    fn element_addr_temp_uses_offset_table() {
+        let mut ctx = ByteCodeContext::default();
+        ctx.set_temp_info(vec![0, 4], 8);
+        let d = ViewDesc::contiguous(1, ViewBase::Temp, vec![2], vec![0]);
+        // temp_storage_base = 1000; temp 1 offset = 4; element 1 -> (4+1)*8 = 40.
+        let a = d.element_addr(1, 0, 1000, &ctx).unwrap();
+        assert_eq!(a.const_byte_offset, 1000 + 40);
+        assert!(!a.module_relative);
+    }
+
+    #[test]
+    fn element_addr_dynamic_view_carries_runtime_locals() {
+        // A view with a runtime offset addend + validity flag (Task 4) returns
+        // the constant base plus the locals the caller must add/guard.
+        let mut d = dense(0, &[3]);
+        d.runtime_off_local = Some(9);
+        d.valid_local = Some(7);
+        let ctx = ByteCodeContext::default();
+        let a = d.element_addr(0, 0, 0, &ctx).unwrap();
+        // Element 0: const base is just curr_base + base_off*8 = 0; the runtime
+        // index offset rides in local 9, the validity in local 7.
+        assert_eq!(a.const_byte_offset, 0);
+        assert_eq!(a.runtime_off_local, Some(9));
+        assert_eq!(a.valid_local, Some(7));
+    }
+
+    // ── iter_broadcast_offset (Task 3): cross-check against the VM ─────────
+
+    /// A `ByteCodeContext` whose dimension table makes the dims with the given
+    /// ids indexed (so `match_dimensions_two_pass`'s size-fallback can fire), all
+    /// of `size`. Used only so `iter_broadcast_offset` can resolve `is_indexed`.
+    fn ctx_indexed_dims(n: usize, size: u16) -> ByteCodeContext {
+        let mut ctx = ByteCodeContext::default();
+        for _ in 0..n {
+            let nid = ctx.intern_name("D");
+            ctx.add_dimension(crate::bytecode::DimensionInfo::indexed(nid, size));
+        }
+        ctx
+    }
+
+    /// Build a `ViewDesc` with explicit dims/dim_ids (row-major contiguous).
+    fn view_with_dim_ids(dims: &[u16], dim_ids: &[u16]) -> ViewDesc {
+        ViewDesc::contiguous(0, ViewBase::CurrAbsolute, dims.to_vec(), dim_ids.to_vec())
+    }
+
+    #[test]
+    fn iter_broadcast_offset_matches_fast_path() {
+        // Source dims == iter dims: every element reads its own offset.
+        let ctx = ctx_indexed_dims(2, 3);
+        let iter = view_with_dim_ids(&[2, 3], &[0, 1]);
+        let src = view_with_dim_ids(&[2, 3], &[0, 1]);
+        for current in 0..iter.size() {
+            assert_eq!(
+                src.iter_broadcast_offset(&iter, current, &ctx),
+                Some(current),
+                "fast-path element {current}"
+            );
+        }
+    }
+
+    #[test]
+    fn iter_broadcast_offset_broadcasts_smaller_source() {
+        // iter is 2-D [DimA(2), DimB(3)]; source is 1-D [DimA(2)] (dim_id 0). The
+        // VM broadcasts the source along the missing DimB, so result element
+        // (a, b) reads source[a]. dim_ids: iter [0,1], source [0].
+        let ctx = ctx_indexed_dims(2, 3);
+        let iter = view_with_dim_ids(&[2, 3], &[0, 1]);
+        let src = view_with_dim_ids(&[2], &[0]);
+        for a in 0..2u16 {
+            for b in 0..3u16 {
+                let current = (a as usize) * 3 + b as usize;
+                // Result element (a,b) -> source index [a] -> flat offset a.
+                assert_eq!(
+                    src.iter_broadcast_offset(&iter, current, &ctx),
+                    Some(a as usize),
+                    "broadcast element ({a},{b})"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn iter_broadcast_offset_smaller_source_same_shape_is_nan() {
+        // Same dims/dim_ids fast path, but the source is genuinely shorter than
+        // the iteration: the VM returns NaN past the source size.
+        let ctx = ctx_indexed_dims(1, 5);
+        let iter = view_with_dim_ids(&[5], &[0]);
+        let src = view_with_dim_ids(&[3], &[0]);
+        assert_eq!(src.iter_broadcast_offset(&iter, 2, &ctx), Some(2));
+        assert_eq!(
+            src.iter_broadcast_offset(&iter, 3, &ctx),
+            None,
+            "element past the source size must be NaN"
+        );
+    }
+
+    #[test]
+    fn iter_broadcast_offset_unmatched_dim_is_nan() {
+        // Source dim_id 7 has no counterpart in the iteration (dim_ids [0,1]) and
+        // is named (not indexed), so the size-fallback cannot match it either:
+        // the VM returns NaN.
+        let mut ctx = ByteCodeContext::default();
+        let n0 = ctx.intern_name("A");
+        ctx.add_dimension(crate::bytecode::DimensionInfo::indexed(n0, 2)); // id 0
+        let n1 = ctx.intern_name("B");
+        ctx.add_dimension(crate::bytecode::DimensionInfo::indexed(n1, 3)); // id 1
+        // A named (non-indexed) dim id 2 used only by the source.
+        let n2 = ctx.intern_name("C");
+        ctx.add_dimension(crate::bytecode::DimensionInfo::named(
+            n2,
+            SmallVec::from_slice(&[n0, n1]),
+        )); // id 2, size 2, named
+        let iter = view_with_dim_ids(&[2, 3], &[0, 1]);
+        let src = view_with_dim_ids(&[2], &[2]);
+        assert_eq!(src.iter_broadcast_offset(&iter, 0, &ctx), None);
+    }
+
+    /// Cross-check `iter_broadcast_offset` against a from-scratch reimplementation
+    /// of the VM's `LoadIterViewTop` broadcast over a `RuntimeView`, for a
+    /// transpose-broadcast case (iter [DimA,DimB], source [DimB] -- the source's
+    /// single dim matches the iteration's *second* axis by dim-id).
+    #[test]
+    fn iter_broadcast_offset_matches_vm_loaditerviewtop() {
+        let ctx = ctx_indexed_dims(2, 0); // sizes overwritten below
+        // Rebuild with distinct sizes: DimA=2 (id 0), DimB=4 (id 1).
+        let mut ctx2 = ByteCodeContext::default();
+        let na = ctx2.intern_name("A");
+        ctx2.add_dimension(crate::bytecode::DimensionInfo::indexed(na, 2));
+        let nb = ctx2.intern_name("B");
+        ctx2.add_dimension(crate::bytecode::DimensionInfo::indexed(nb, 4));
+        let _ = ctx;
+
+        let iter = view_with_dim_ids(&[2, 4], &[0, 1]);
+        let src = view_with_dim_ids(&[4], &[1]); // only DimB
+        let iter_rv = to_runtime_view(&iter);
+        let src_rv = to_runtime_view(&src);
+
+        for current in 0..iter.size() {
+            // VM reference: decompose current into iter indices, match dims by id
+            // (DimB is id 1 in both), read source[that DimB index].
+            let n = iter_rv.dims.len();
+            let mut idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; n];
+            let mut rem = current;
+            for d in (0..n).rev() {
+                idx[d] = (rem % iter_rv.dims[d] as usize) as u16;
+                rem /= iter_rv.dims[d] as usize;
+            }
+            // DimB is iteration axis 1.
+            let want = src_rv.flat_offset(&[idx[1]]);
+            assert_eq!(
+                src.iter_broadcast_offset(&iter, current, &ctx2),
+                Some(want),
+                "element {current}"
+            );
+        }
+    }
+}
diff --git a/src/simlin-engine/tests/simulate.rs b/src/simlin-engine/tests/simulate.rs
index 2d9ada4ff..958d17dc8 100644
--- a/src/simlin-engine/tests/simulate.rs
+++ b/src/simlin-engine/tests/simulate.rs
@@ -15,7 +15,9 @@ use simlin_engine::serde::{deserialize, serialize};
 use simlin_engine::{Method, Results, SimSpecs as Specs, Vm, project_io};
 use simlin_engine::{load_csv, load_dat, open_vensim, open_vensim_with_data, xmile};
 
-use test_helpers::{ensure_results, ensure_results_excluding};
+use test_helpers::{
+    WasmRunOutcome, ensure_results, ensure_results_excluding, ensure_wasm_matches, wasm_results_for,
+};
 
 const OUTPUT_FILES: &[(&str, u8)] = &[("output.csv", b','), ("output.tab", b'\t')];
 
@@ -100,6 +102,86 @@ static TEST_MODELS: &[&str] = &[
     "test/test-models/tests/unicode_characters/unicode_test_model.xmile",
 ];
 
+/// End-state wasm parity gate (wasm-backend AC3.2 / AC3.3): EVERY corpus model
+/// in `TEST_MODELS` must run through the wasm backend to VM parity -- zero may
+/// return `WasmGenError::Unsupported`. `expected` is the VM's own output (the
+/// parse + `compile_vm` + run path), so this is a direct wasm-vs-VM check
+/// independent of the on-disk reference files; the per-model inline hook
+/// (`wasm_parity_hook`) separately checks every model against its on-disk
+/// `expected` and likewise hard-fails on `Unsupported`.
+///
+/// This replaces the Phase 1-7 monotonic floor (a `ran >= FLOOR` count). The
+/// backend now covers the full core-simulation surface -- scalar + every
+/// `Apply` builtin + arrays/reducers/iteration + vector ops + allocation +
+/// scalar/array lookups + Euler/RK2/RK4 + PREVIOUS/INIT + nested modules -- so
+/// the end state is total coverage, and any regression that makes a previously
+/// supported model `Unsupported` fails here (AC3.3) with the offending model and
+/// reason. The genuinely out-of-scope constructs (a runtime view range
+/// `arr[lo:hi]` with non-literal bounds -> `ViewRangeDynamic`, or array
+/// unrolling past the per-function budget) are not reached by any `TEST_MODELS`
+/// member; they are pinned by the inline `wasmgen` unit tests and
+/// `ensure_wasm_matches_skips_unsupported_model`. The heavy `#[ignore]`-class
+/// models (C-LEARN) have their own `#[ignore]`d wasm twins so this gate stays
+/// within the default suite's 3-minute wall-clock cap.
+///
+/// Iterating the full `TEST_MODELS` list under the un-JITed DLR-FT interpreter
+/// stays well within that cap (the corpus is small/medium scalar/arrayed
+/// models), so the gate covers the whole list rather than a subset.
+#[test]
+fn wasm_parity_floor() {
+    let mut unsupported: Vec<(String, String)> = Vec::new();
+    for &path in TEST_MODELS {
+        let file_path = format!("../../{path}");
+        if let WasmRunOutcome::Skipped(msg) = wasm_parity_outcome_for_path(&file_path) {
+            unsupported.push((path.to_string(), msg));
+        }
+    }
+    eprintln!(
+        "wasm_parity_floor: {} of {} corpus models ran to VM parity ({} unsupported)",
+        TEST_MODELS.len() - unsupported.len(),
+        TEST_MODELS.len(),
+        unsupported.len()
+    );
+    assert!(
+        unsupported.is_empty(),
+        "wasm parity gate (AC3.2/AC3.3): every corpus model must run through the \
+         wasm backend, but {} of {} returned Unsupported -- a regression that \
+         dropped a previously-supported model, or a new feature whose lowering is \
+         missing:\n{}",
+        unsupported.len(),
+        TEST_MODELS.len(),
+        unsupported
+            .iter()
+            .map(|(p, m)| format!("  {p}: {m}"))
+            .collect::<Vec<_>>()
+            .join("\n")
+    );
+}
+
+/// Parse the XMILE/STMX model at `path`, run it through the VM for an `expected`
+/// baseline, and return whether the wasm backend reproduces it (`Ran`) or
+/// returns `Unsupported` (`Skipped`). Used only by `wasm_parity_floor`, which
+/// turns any `Skipped` into a hard failure. A parse or VM failure is surfaced as
+/// `Skipped` (the VM corpus tests gate those paths directly; this gate only
+/// checks wasm-vs-VM parity, never re-litigates VM correctness), so an
+/// upstream parse/VM break would also trip the gate -- intended, since a model
+/// that no longer VM-simulates can't establish wasm parity either.
+fn wasm_parity_outcome_for_path(path: &str) -> WasmRunOutcome {
+    let datamodel = {
+        let Ok(f) = File::open(path) else {
+            return WasmRunOutcome::Skipped(format!("could not open {path}"));
+        };
+        let mut f = BufReader::new(f);
+        match xmile::project_from_reader(&mut f) {
+            Ok(p) => p,
+            Err(e) => return WasmRunOutcome::Skipped(format!("parse failed: {e}")),
+        }
+    };
+
+    let expected = vm_results(&datamodel);
+    ensure_wasm_matches(&datamodel, "main", &expected, &[])
+}
+
 /// Compile a datamodel project to a VM simulation using the incremental
 /// salsa-backed path.
 fn compile_vm(
@@ -821,6 +903,80 @@ fn run_vacuous_comparison_scenarios() {
     asserts_panic("near-zero but meaningful divergence", &expected_nz, &sim_nz);
 }
 
+/// Run the named model of `datamodel` through the VM and return its
+/// `Results`, used as the `expected` baseline both the focused
+/// `ensure_wasm_matches` tests and `wasm_parity_floor` compare wasm output
+/// against. Mirrors the corpus VM path (`compile_vm` -> `Vm::new` ->
+/// `run_to_end`).
+fn vm_results(datamodel: &simlin_engine::datamodel::Project) -> Results {
+    let compiled = compile_vm(datamodel);
+    let mut vm = Vm::new(compiled).unwrap();
+    vm.run_to_end().unwrap();
+    vm.into_results()
+}
+
+/// AC1.1: a scalar Euler model the wasm backend supports runs through
+/// `ensure_wasm_matches` and clears the same `ensure_results` comparator the VM
+/// clears (the helper panics internally on any divergence), so the outcome is
+/// `Ran`.
+#[test]
+fn ensure_wasm_matches_runs_supported_scalar_model() {
+    let datamodel = simlin_engine::test_common::TestProject::new("simple")
+        .with_sim_time(0.0, 10.0, 1.0)
+        .aux("inflow_rate", "2", None)
+        .stock("level", "0", &["inflow"], &[], None)
+        .flow("inflow", "inflow_rate", None)
+        .build_datamodel();
+
+    let expected = vm_results(&datamodel);
+    let outcome = ensure_wasm_matches(&datamodel, "main", &expected, &[]);
+    assert!(
+        matches!(outcome, WasmRunOutcome::Ran),
+        "a supported scalar model must run through the wasm backend, got {outcome:?}"
+    );
+}
+
+/// AC3.1: a model using a not-yet-supported construct is SKIPPED, not failed --
+/// `compile_simulation` returns `WasmGenError::Unsupported` and the helper
+/// surfaces it as `Skipped(msg)` carrying that message.
+///
+/// The example construct has migrated as the backend's coverage grew: `^`
+/// (`Op2::Exp`) became supported in Phase 2 Task 3, RK4 in Phase 4, and *modules*
+/// (so `SMTH1`/`DELAY3` stdlib expansions) in Phase 7. The stable still-
+/// unsupported construct is now a *true runtime range* `arr[lo:hi]` with
+/// non-literal bounds, which lowers to `Opcode::ViewRangeDynamic` -- a runtime
+/// view *size* the fully-unrolled emitter cannot express (`wasmgen.rs`'s
+/// `ViewRangeDynamic` arm returns `Unsupported`). A literal range is
+/// constant-folded into a static view, so the bounds must be variables.
+#[test]
+fn ensure_wasm_matches_skips_unsupported_model() {
+    let datamodel = simlin_engine::test_common::TestProject::new("unsupported")
+        .with_sim_time(0.0, 5.0, 1.0)
+        .indexed_dimension("A", 5)
+        .array_aux("source[A]", "A")
+        .scalar_aux("lo", "2")
+        .scalar_aux("hi", "4")
+        // SUM over a runtime range (variable bounds) -> ViewRangeDynamic, which
+        // the wasm backend cannot express (a runtime view size in a fully-
+        // unrolled emitter), so the whole model is Skipped.
+        .scalar_aux("total", "SUM(source[lo:hi])")
+        .build_datamodel();
+
+    let expected = vm_results(&datamodel);
+    let outcome = ensure_wasm_matches(&datamodel, "main", &expected, &[]);
+    match outcome {
+        WasmRunOutcome::Skipped(msg) => {
+            assert!(
+                msg.contains("ViewRangeDynamic"),
+                "expected the runtime-range rejection message, got: {msg}"
+            );
+        }
+        WasmRunOutcome::Ran => {
+            panic!("a model using a runtime-range construct must be Skipped, not Ran")
+        }
+    }
+}
+
 type CompileFn = fn(&simlin_engine::datamodel::Project) -> simlin_engine::CompiledSimulation;
 
 fn simulate_path(xmile_path: &str) {
@@ -912,6 +1068,39 @@ fn simulate_path_with_excluding(xmile_path: &str, compile: CompileFn, excluded:
     // byte-for-byte identical (we aren't losing any information)
     let serialized_xmile2 = xmile::project_to_xmile(&roundtripped_project).unwrap();
     assert_eq!(&serialized_xmile, &serialized_xmile2);
+
+    // wasm-backend parity: after the VM comparisons pass, run the model through
+    // the wasm backend once and assert it clears the SAME comparator against the
+    // same `expected`. A supported model that diverges panics inside the helper;
+    // an `Unsupported` outcome for this VM-simulated model is now a HARD FAILURE
+    // (the corpus gate, AC3.2). See AC1.1 / AC3.2.
+    wasm_parity_hook(&datamodel_project, &expected, excluded);
+}
+
+/// Run one already-parsed, VM-simulated model through the wasm backend and
+/// assert parity. This is reached only from the `simulate_path`/`simulate_mdl`
+/// helpers, i.e. AFTER the VM has simulated the model, so a `Skipped`
+/// (`WasmGenError::Unsupported`) here means a model the VM handles is NOT
+/// covered by the wasm backend -- a hard failure (AC3.2: every core-simulation
+/// model runs through both backends). A model the VM itself cannot simulate
+/// (DELAY FIXED, GET DATA) is `#[ignore]`d and never reaches this hook, so it
+/// stays out of scope. A supported-but-divergent model panics inside
+/// `ensure_wasm_matches`.
+fn wasm_parity_hook(
+    datamodel: &simlin_engine::datamodel::Project,
+    expected: &Results,
+    excluded: &[&str],
+) {
+    if let WasmRunOutcome::Skipped(msg) = ensure_wasm_matches(datamodel, "main", expected, excluded)
+    {
+        panic!(
+            "wasm parity gate: a VM-simulated model returned Unsupported from the \
+             wasm backend -- every core-simulation model must run through both \
+             backends (AC3.2). Close the lowering gap or, if this is a genuinely \
+             VM-unsupported feature, the test should be #[ignore]d so it never \
+             reaches this hook. Reason: {msg}"
+        );
+    }
 }
 
 fn load_expected_results_for_mdl(mdl_path: &str) -> Option<Results> {
@@ -957,6 +1146,8 @@ fn simulate_mdl_path(mdl_path: &str) {
     let expected = load_expected_results_for_mdl(mdl_path)
         .unwrap_or_else(|| panic!("no reference data found for {mdl_path}"));
     ensure_results(&expected, &results);
+
+    wasm_parity_hook(&datamodel_project, &expected, &[]);
 }
 
 /// Simulate a Vensim MDL file that references external data files.
@@ -987,6 +1178,8 @@ fn simulate_mdl_path_with_data(mdl_path: &str) {
     let expected = load_expected_results_for_mdl(mdl_path)
         .unwrap_or_else(|| panic!("no reference data found for {mdl_path}"));
     ensure_results(&expected, &results);
+
+    wasm_parity_hook(&datamodel_project, &expected, &[]);
 }
 
 #[test]
@@ -1714,6 +1907,48 @@ fn simulates_wrld3_03() {
     assert_eq!(vdf_results.step_count, results.step_count);
 }
 
+/// WORLD3 wasm parity twin (wasm-backend.AC1.1, heavy-model scale check): WORLD3
+/// is a large model, so its wasm blob exercises the backend well beyond the
+/// small/medium default corpus. The VM test above only smoke-checks the VDF
+/// decoder (no series comparison), so this twin asserts the wasm output matches
+/// the VM output element-for-element via `ensure_results` -- the strongest
+/// available parity check for this model (both backends consume the same
+/// `CompiledSimulation`, so any divergence is a wasm lowering bug). A
+/// `WasmGenError::Unsupported` would be a hard failure: WORLD3 is a
+/// core-simulation model the VM handles. `#[ignore]`d for runtime class, like
+/// the other heavy models.
+///
+/// Run with: cargo test --release -- --ignored simulates_wrld3_03_wasm
+#[test]
+#[ignore]
+fn simulates_wrld3_03_wasm() {
+    let mdl_path = "../../test/metasd/WRLD3-03/wrld3-03.mdl";
+
+    eprintln!("model (vensim mdl): {mdl_path}");
+
+    let contents = std::fs::read_to_string(mdl_path)
+        .unwrap_or_else(|e| panic!("failed to read {mdl_path}: {e}"));
+
+    let datamodel_project =
+        open_vensim(&contents).unwrap_or_else(|e| panic!("failed to parse {mdl_path}: {e}"));
+
+    // VM reference run.
+    let compiled = compile_vm(&datamodel_project);
+    let mut vm =
+        Vm::new(compiled).unwrap_or_else(|e| panic!("VM creation failed for {mdl_path}: {e}"));
+    vm.run_to_end()
+        .unwrap_or_else(|e| panic!("VM run failed for {mdl_path}: {e}"));
+    let vm_results = vm.into_results();
+
+    // wasm twin: compile through the backend, run under the interpreter, and
+    // match the VM element-for-element.
+    let wasm_results = wasm_results_for(&datamodel_project, "main").unwrap_or_else(|msg| {
+        panic!("WORLD3 must compile to wasm (a core-simulation model the VM handles): {msg}")
+    });
+
+    ensure_results(&vm_results, &wasm_results);
+}
+
 /// Known-residual C-LEARN base-variable names excluded from the
 /// `simulates_clearn` VDF gate. C-LEARN compiles via the incremental path,
 /// runs to FINAL TIME, and matches `Ref.vdf` within the 1% cross-simulator
@@ -1838,13 +2073,11 @@ fn simulates_clearn() {
     ensure_vdf_results_excluding(&vdf_results, &results, EXPECTED_VDF_RESIDUAL);
 }
 
-/// Compile and run C-LEARN end-to-end and parse `Ref.vdf`, returning
-/// `(vdf_results, results)`. Shared by `simulates_clearn` (the 1% gate) and
-/// `clearn_residual_exactness` (the exclusion-exactness guard) so both exercise
-/// the byte-identical `open_vensim` -> `compile_vm` -> `run_to_end` -> parse-VDF
-/// path and compare the same data. Heavy (C-LEARN is ~53k lines / 1.4 MB,
-/// ~5s just to parse on release), so every caller is `#[ignore]`d.
-fn run_clearn_vs_vdf() -> (Results, Results) {
+/// Read and parse the C-LEARN `.mdl` into a datamodel project. Shared by the VM
+/// path ([`run_clearn_vs_vdf`]) and the wasm twin ([`simulates_clearn_wasm`]) so
+/// both compile the byte-identical model. Heavy (C-LEARN is ~53k lines / 1.4 MB,
+/// ~5s just to parse on release).
+fn clearn_datamodel() -> simlin_engine::datamodel::Project {
     let mdl_path = "../../test/xmutil_test_models/C-LEARN v77 for Vensim.mdl";
 
     eprintln!("model (vensim mdl): {mdl_path}");
@@ -1852,28 +2085,72 @@ fn run_clearn_vs_vdf() -> (Results, Results) {
     let contents = std::fs::read_to_string(mdl_path)
         .unwrap_or_else(|e| panic!("failed to read {mdl_path}: {e}"));
 
-    let datamodel_project =
-        open_vensim(&contents).unwrap_or_else(|e| panic!("failed to parse {mdl_path}: {e}"));
-
-    let compiled = compile_vm(&datamodel_project);
-    let mut vm =
-        Vm::new(compiled).unwrap_or_else(|e| panic!("VM creation failed for {mdl_path}: {e}"));
-    vm.run_to_end()
-        .unwrap_or_else(|e| panic!("VM run failed for {mdl_path}: {e}"));
-    let results = vm.into_results();
+    open_vensim(&contents).unwrap_or_else(|e| panic!("failed to parse {mdl_path}: {e}"))
+}
 
+/// Parse the C-LEARN `Ref.vdf` genuine-Vensim reference output into `Results`.
+/// Shared by every C-LEARN comparison path so they assert against identical data.
+fn clearn_vdf_results() -> Results {
     let vdf_path = "../../test/xmutil_test_models/Ref.vdf";
     let vdf_data_bytes =
         std::fs::read(vdf_path).unwrap_or_else(|e| panic!("failed to read {vdf_path}: {e}"));
     let vdf_file = simlin_engine::vdf::VdfFile::parse(vdf_data_bytes)
         .unwrap_or_else(|e| panic!("failed to parse VDF {vdf_path}: {e}"));
-    let vdf_results = vdf_file
+    vdf_file
         .to_results_via_records()
-        .unwrap_or_else(|e| panic!("VDF to_results_via_records failed: {e}"));
+        .unwrap_or_else(|e| panic!("VDF to_results_via_records failed: {e}"))
+}
+
+/// Compile and run C-LEARN end-to-end through the VM and parse `Ref.vdf`,
+/// returning `(vdf_results, results)`. Shared by `simulates_clearn` (the 1% gate)
+/// and `clearn_residual_exactness` (the exclusion-exactness guard) so both
+/// exercise the byte-identical `open_vensim` -> `compile_vm` -> `run_to_end` ->
+/// parse-VDF path and compare the same data. Heavy, so every caller is
+/// `#[ignore]`d.
+fn run_clearn_vs_vdf() -> (Results, Results) {
+    let datamodel_project = clearn_datamodel();
+
+    let compiled = compile_vm(&datamodel_project);
+    let mut vm =
+        Vm::new(compiled).unwrap_or_else(|e| panic!("VM creation failed for C-LEARN: {e}"));
+    vm.run_to_end()
+        .unwrap_or_else(|e| panic!("VM run failed for C-LEARN: {e}"));
+    let results = vm.into_results();
+
+    let vdf_results = clearn_vdf_results();
 
     (vdf_results, results)
 }
 
+/// C-LEARN wasm parity twin (wasm-backend.AC1.3): compile C-LEARN through the
+/// wasm backend, run it under the DLR-FT interpreter, and assert its output
+/// clears the SAME hard 1% VDF gate + `EXPECTED_VDF_RESIDUAL` carve-out that
+/// `simulates_clearn` applies to the VM. Both backends consume the same
+/// `CompiledSimulation` produced by `compile_project_incremental`, so the wasm
+/// output must clear the gate exactly as the VM does (a divergence is a wasm
+/// lowering bug); the residual carve-out is identical because it is a property
+/// of the model + reference data, not the execution engine. `#[ignore]`d for
+/// runtime class -- C-LEARN under the non-JIT interpreter is slow -- exactly
+/// like `simulates_clearn`.
+///
+/// A `WasmGenError::Unsupported` here would be a hard failure: C-LEARN is a
+/// core-simulation model the VM handles, so the wasm backend must too.
+///
+/// Run with: cargo test --release -- --ignored simulates_clearn_wasm
+#[test]
+#[ignore]
+fn simulates_clearn_wasm() {
+    let datamodel_project = clearn_datamodel();
+
+    let wasm_results = wasm_results_for(&datamodel_project, "main").unwrap_or_else(|msg| {
+        panic!("C-LEARN must compile to wasm (a core-simulation model the VM handles): {msg}")
+    });
+
+    let vdf_results = clearn_vdf_results();
+
+    ensure_vdf_results_excluding(&vdf_results, &wasm_results, EXPECTED_VDF_RESIDUAL);
+}
+
 /// Committed regression guard that `EXPECTED_VDF_RESIDUAL` stays EXACT: it is
 /// the precise set of C-LEARN base variables that the live `classify_vdf_ident`
 /// comparator flags, neither over- nor under-broad. Runs C-LEARN through the
diff --git a/src/simlin-engine/tests/simulate_systems.rs b/src/simlin-engine/tests/simulate_systems.rs
index 25cb432c3..26722c444 100644
--- a/src/simlin-engine/tests/simulate_systems.rs
+++ b/src/simlin-engine/tests/simulate_systems.rs
@@ -16,7 +16,7 @@ use simlin_engine::db::{
 };
 use simlin_engine::load_csv;
 
-use test_helpers::ensure_results;
+use test_helpers::{WasmRunOutcome, ensure_results, ensure_wasm_matches};
 
 /// All valid systems format test models.
 const ALL_VALID_MODELS: &[&str] = &[
@@ -61,6 +61,98 @@ fn simulate_systems_file(txt_path: &str, csv_path: &str, rounds: u64) {
         .unwrap_or_else(|e| panic!("VM execution failed for {txt_path}: {e}"));
     let results = vm.into_results();
     ensure_results(&expected, &results);
+
+    // wasm-backend parity (AC3.2): a systems-format model translates to
+    // stdlib-module instances (`systems_rate`/`systems_leak`/`systems_conversion`),
+    // so this exercises the wasm backend's module path end-to-end. Every
+    // VM-simulated systems model must run through the wasm backend and clear the
+    // SAME comparator against `expected`; an `Unsupported` outcome here is a hard
+    // failure (this model VM-simulated, so the wasm backend must cover it).
+    // `ensure_wasm_matches` panics internally on a supported-but-wrong model.
+    if let WasmRunOutcome::Skipped(msg) =
+        ensure_wasm_matches(&datamodel_project, "main", &expected, &[])
+    {
+        panic!(
+            "wasm parity gate: systems model {txt_path} VM-simulated but the wasm \
+             backend returned Unsupported (AC3.2 -- every core-simulation model \
+             must run through both backends): {msg}"
+        );
+    }
+}
+
+/// Parse + translate the systems model at `path` (a fixed `rounds`), run it
+/// through the VM for an `expected` baseline, and return whether the wasm backend
+/// reproduces it (`Ran`) or returns `Unsupported` (`Skipped`). A parse/
+/// translate/VM failure is surfaced as `Skipped` (those paths are gated by the
+/// per-model simulation tests; this gate only checks wasm-vs-VM parity).
+fn wasm_systems_outcome_for_path(path: &str, rounds: u64) -> WasmRunOutcome {
+    let Ok(contents) = std::fs::read_to_string(path) else {
+        return WasmRunOutcome::Skipped(format!("could not read {path}"));
+    };
+    let systems_model = match simlin_engine::systems::parse(&contents) {
+        Ok(m) => m,
+        Err(e) => return WasmRunOutcome::Skipped(format!("parse failed: {e}")),
+    };
+    let datamodel = match simlin_engine::systems::translate::translate(&systems_model, rounds) {
+        Ok(p) => p,
+        Err(e) => return WasmRunOutcome::Skipped(format!("translate failed: {e}")),
+    };
+
+    let mut db = SimlinDb::default();
+    let sync = sync_from_datamodel_incremental(&mut db, &datamodel, None);
+    let compiled = match compile_project_incremental(&db, sync.project, "main") {
+        Ok(c) => c,
+        Err(e) => return WasmRunOutcome::Skipped(format!("VM compile failed: {e:?}")),
+    };
+    let mut vm = match Vm::new(compiled) {
+        Ok(vm) => vm,
+        Err(e) => return WasmRunOutcome::Skipped(format!("VM creation failed: {e}")),
+    };
+    if let Err(e) = vm.run_to_end() {
+        return WasmRunOutcome::Skipped(format!("VM run failed: {e}"));
+    }
+    let expected = vm.into_results();
+    ensure_wasm_matches(&datamodel, "main", &expected, &[])
+}
+
+/// End-state wasm parity gate (AC3.2 / AC3.3): EVERY systems-format model must
+/// run through the wasm backend to VM parity -- zero may return
+/// `WasmGenError::Unsupported`. Systems-format models translate to stdlib-module
+/// instances (`systems_rate`/`systems_leak`/`systems_conversion`), so they
+/// exercise the wasm backend's `EvalModule`/`LoadModuleInput` path end-to-end.
+/// This is a direct wasm-vs-VM check (the VM's own output is the baseline),
+/// independent of the on-disk CSV fixtures. The per-model simulation tests
+/// additionally run the inline `ensure_wasm_matches` hook against their
+/// CSV-cleared `expected` and likewise hard-fail on `Unsupported`. A regression
+/// that drops a previously-supported systems model fails here with the offender.
+#[test]
+fn wasm_systems_parity_floor() {
+    let mut unsupported: Vec<(String, String)> = Vec::new();
+    for &path in ALL_VALID_MODELS {
+        // A fixed `rounds` like the compile-only gate; the wasm-vs-VM parity does
+        // not depend on the exact horizon, only that both backends agree on it.
+        if let WasmRunOutcome::Skipped(msg) = wasm_systems_outcome_for_path(path, 5) {
+            unsupported.push((path.to_string(), msg));
+        }
+    }
+    eprintln!(
+        "wasm_systems_parity_floor: {} of {} systems models ran to VM parity ({} unsupported)",
+        ALL_VALID_MODELS.len() - unsupported.len(),
+        ALL_VALID_MODELS.len(),
+        unsupported.len()
+    );
+    assert!(
+        unsupported.is_empty(),
+        "wasm systems parity gate (AC3.2/AC3.3): every systems model must run \
+         through the wasm backend, but {} of {} returned Unsupported:\n{}",
+        unsupported.len(),
+        ALL_VALID_MODELS.len(),
+        unsupported
+            .iter()
+            .map(|(p, m)| format!("  {p}: {m}"))
+            .collect::<Vec<_>>()
+            .join("\n")
+    );
 }
 
 #[test]
diff --git a/src/simlin-engine/tests/test_helpers.rs b/src/simlin-engine/tests/test_helpers.rs
index bd6bb9da8..e4537bc8d 100644
--- a/src/simlin-engine/tests/test_helpers.rs
+++ b/src/simlin-engine/tests/test_helpers.rs
@@ -6,10 +6,23 @@
 //!
 //! Extracted from `simulate.rs` so that multiple integration test files
 //! (simulate.rs, simulate_systems.rs, etc.) can share the comparison logic.
+//!
+//! pattern: Mixed (unavoidable)
+//! Reason: `ensure_results*` is a pure comparator (Functional Core), while
+//! `ensure_wasm_matches` is an Imperative Shell (it drives the salsa compile
+//! pipeline and executes the emitted wasm under the DLR-FT interpreter). They
+//! live together because this is the single shared test-helper module the
+//! implementation plan centralizes comparison logic in, and the wasm shell's
+//! only job is to feed the pure comparator. The slab -> `Results` conversion is
+//! extracted as a pure function (`wasm_results_from_slab`) to keep the I/O
+//! boundary explicit.
 
+use checked::Store;
 use float_cmp::approx_eq;
-use simlin_engine::Results;
 use simlin_engine::common::{Canonical, Ident};
+use simlin_engine::wasmgen::{WasmGenError, WasmLayout, compile_simulation};
+use simlin_engine::{Results, SimSpecs};
+use wasm::validate;
 
 /// Columns that are vendor-specific or otherwise not important for
 /// simulation correctness.
@@ -128,3 +141,176 @@ pub fn ensure_results_excluding(expected: &Results, results: &Results, excluded:
             .contains_key(&Ident::<Canonical>::from_str_unchecked("UNKNOWN"))
     );
 }
+
+// The wasm-parity helpers below are consumed only by the `simulate` corpus
+// binary; the other test binaries that share this module (`simulate_systems`,
+// `systems_roundtrip`, `metasd_macros`) include the file but do not run wasm
+// parity, so each item is `#[allow(dead_code)]` to stay clean under
+// `cargo clippy --all-targets -- -D warnings` (the same shared-helper idiom as
+// `SimTier` in `metasd_macros.rs`).
+
+/// Outcome of running a model through the wasm backend via
+/// [`ensure_wasm_matches`].
+///
+/// `Ran` means the model was within the wasm backend's supported feature set,
+/// executed under the interpreter, and CLEARED the parity comparator (the
+/// helper panics internally on any divergence -- a supported-but-wrong model is
+/// a hard failure, never a `Ran`). `Skipped` means `compile_simulation`
+/// returned [`WasmGenError::Unsupported`] (an out-of-scope construct); the
+/// message is carried so the caller decides whether that is a failure.
+///
+/// Phase 8 closed the corpus gate: for a model the VM SIMULATED in the default
+/// suite, a `Skipped` outcome is now a HARD FAILURE -- the corpus callers
+/// (`wasm_parity_hook`, the parity-floor gates, the systems harness) panic on
+/// it (wasm-backend AC3.2: every core-simulation model runs through both
+/// backends). The variant survives only so the `ensure_wasm_matches_skips_*`
+/// unit test can still observe a *genuinely* out-of-scope construct returning a
+/// clean `Unsupported` (AC1.4) -- never a panic or a silently wrong result --
+/// rather than reaching the hook.
+#[allow(dead_code)]
+#[derive(Debug)]
+pub enum WasmRunOutcome {
+    Ran,
+    Skipped(String),
+}
+
+/// Build a `Results` from a wasm backend's step-major results slab.
+///
+/// The slab is `layout.n_chunks * layout.n_slots` f64 laid out row-major by
+/// saved step (the same step-major order the bytecode VM's `Results` uses), so
+/// `step_size = n_slots` and `step_count = n_chunks` make `Results::iter` yield
+/// one chunk per saved step. Each canonical variable name in `layout` maps back
+/// to its slot offset within a chunk. `is_vensim = false`: a wasm-emitted run is
+/// a Simlin computation, so it takes the absolute-tolerance branch of the
+/// comparator (never the Vensim relative-tolerance branch).
+///
+/// Pure: no I/O, no global state -- it only reshapes already-read data, so it is
+/// the Functional Core boundary of [`ensure_wasm_matches`].
+#[allow(dead_code)]
+fn wasm_results_from_slab(layout: &WasmLayout, slab: Vec<f64>, specs: SimSpecs) -> Results {
+    let offsets = layout
+        .var_offsets
+        .iter()
+        // The names came from `CompiledSimulation::offsets`, whose keys are
+        // already `Ident<Canonical>`, so they round-trip without re-canonicalizing.
+        .map(|(name, off)| (Ident::<Canonical>::from_str_unchecked(name), *off))
+        .collect();
+
+    Results {
+        offsets,
+        data: slab.into_boxed_slice(),
+        step_size: layout.n_slots,
+        step_count: layout.n_chunks,
+        specs,
+        is_vensim: false,
+    }
+}
+
+/// Compile `model_name` of `datamodel` to wasm, run it under the DLR-FT
+/// interpreter, and reshape the results slab into a [`Results`] — or return the
+/// `Unsupported` message if the model is outside the wasm backend's feature set.
+///
+/// Builds the `CompiledSimulation` exactly as the corpus VM path does
+/// (simulate.rs `compile_vm`), so the wasm blob is the twin of the VM's run. An
+/// incremental-compile error (a VM-side issue gated elsewhere) and an
+/// `Unsupported` codegen result both return `Err(msg)`; the caller decides
+/// whether that is a skip or a hard failure.
+///
+/// Imperative Shell: drives the salsa compile pipeline and the wasm interpreter
+/// (side effects), delegating the reshape to the pure [`wasm_results_from_slab`].
+/// Shared by [`ensure_wasm_matches`] (the corpus `.dat`/CSV comparator) and the
+/// C-LEARN wasm twin (which compares against `Ref.vdf` instead).
+#[allow(dead_code)]
+pub fn wasm_results_for(
+    datamodel: &simlin_engine::datamodel::Project,
+    model_name: &str,
+) -> Result<Results, String> {
+    use simlin_engine::db::{
+        SimlinDb, compile_project_incremental, sync_from_datamodel_incremental,
+    };
+
+    let mut db = SimlinDb::default();
+    let sync = sync_from_datamodel_incremental(&mut db, datamodel, None);
+    let sim = compile_project_incremental(&db, sync.project, model_name)
+        .map_err(|e| format!("incremental compile failed: {e:?}"))?;
+
+    let artifact = match compile_simulation(&sim) {
+        Ok(artifact) => artifact,
+        Err(WasmGenError::Unsupported(msg)) => return Err(msg),
+    };
+
+    let slab = run_wasm_results(&artifact.wasm, &artifact.layout);
+    let specs = SimSpecs::from(&datamodel.sim_specs);
+    Ok(wasm_results_from_slab(&artifact.layout, slab, specs))
+}
+
+/// Compile `model_name` of `datamodel` to wasm, run it under the DLR-FT
+/// interpreter, and assert its results clear the SAME `ensure_results_excluding`
+/// comparator the VM clears against `expected`.
+///
+/// There is no separate, tighter wasm-vs-VM threshold (per the design's
+/// validation bar): "wasm-vs-VM parity" is established because both backends
+/// clear the identical comparator against the identical expected outputs. A
+/// model outside the wasm backend's supported feature set returns
+/// [`WasmRunOutcome::Skipped`] (never a failure); a supported model whose wasm
+/// output diverges panics inside `ensure_results_excluding`.
+///
+/// Imperative Shell: it drives the salsa compile pipeline and the wasm
+/// interpreter (side effects), delegating the reshape to the pure
+/// [`wasm_results_from_slab`] and the comparison to the pure
+/// [`ensure_results_excluding`].
+#[allow(dead_code)]
+pub fn ensure_wasm_matches(
+    datamodel: &simlin_engine::datamodel::Project,
+    model_name: &str,
+    expected: &Results,
+    excluded: &[&str],
+) -> WasmRunOutcome {
+    let wasm_results = match wasm_results_for(datamodel, model_name) {
+        Ok(results) => results,
+        Err(msg) => return WasmRunOutcome::Skipped(msg),
+    };
+
+    // The same comparator the VM clears: panics loudly on any divergence, so a
+    // supported-but-wrong wasm module fails here rather than reporting Ran.
+    ensure_results_excluding(expected, &wasm_results, excluded);
+    WasmRunOutcome::Ran
+}
+
+/// Instantiate `wasm` under the DLR-FT `checked::Store`, invoke the exported
+/// `run`, and copy `n_chunks * n_slots` f64 out of the results region (located
+/// via `layout.results_offset`). This is the wasm-execution side effect of
+/// [`ensure_wasm_matches`]; the bytes it returns are consumed purely afterward.
+#[allow(dead_code)]
+fn run_wasm_results(wasm: &[u8], layout: &WasmLayout) -> Vec<f64> {
+    let info = validate(wasm).expect("generated wasm module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate wasm module")
+        .module_addr;
+    let run = store
+        .instance_export(inst, "run")
+        .expect("run export must exist")
+        .as_func()
+        .expect("run export must be a function");
+    store
+        .invoke_simple_typed::<(), ()>(run, ())
+        .expect("run wasm");
+    let mem = store
+        .instance_export(inst, "memory")
+        .expect("memory export must exist")
+        .as_mem()
+        .expect("memory export must be a memory");
+
+    let n = layout.n_chunks * layout.n_slots;
+    let base = layout.results_offset;
+    store.mem_access_mut_slice(mem, |bytes| {
+        (0..n)
+            .map(|i| {
+                let a = base + i * 8;
+                f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
+            })
+            .collect()
+    })
+}