diff --git a/CLAUDE.md b/CLAUDE.md index 6a38b1dea..095264d43 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -95,6 +95,7 @@ IMPORTANT: If feedback seems non-actionable, it means you need comments explaini - Public Rust items and non-trivial internal functions should have concise rustdoc describing purpose, key assumptions, and side effects. - When behavior changes, update nearby comments in the same commit so docs and code stay aligned. - If you intentionally remove a comment block, replace it with an updated equivalent when the context is still non-obvious. +- NEVER add a "Last updated" (or "Last verified") line to a `CLAUDE.md`: it is a perpetual rebase/merge-conflict magnet and goes stale immediately. Describe current state in prose; rely on `git log` / `git blame` for history. ## Development Standards diff --git a/Cargo.lock b/Cargo.lock index de6d8942e..84f27515e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -414,6 +414,16 @@ dependencies = [ "rand_core 0.10.1", ] +[[package]] +name = "checked" +version = "0.2.0" +source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a" +dependencies = [ + "interop", + "linker", + "wasm-interpreter", +] + [[package]] name = "chrono" version = "0.4.44" @@ -1864,6 +1874,14 @@ dependencies = [ "generic-array", ] +[[package]] +name = "interop" +version = "0.2.0" +source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a" +dependencies = [ + "wasm-interpreter", +] + [[package]] name = "intrusive-collections" version = "0.9.7" @@ -2072,9 +2090,9 @@ checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f" [[package]] name = "libm" -version = "0.2.16" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +checksum = 
"4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "libmimalloc-sys" @@ -2085,6 +2103,14 @@ dependencies = [ "cc", ] +[[package]] +name = "linker" +version = "0.2.0" +source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a" +dependencies = [ + "wasm-interpreter", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -2112,6 +2138,11 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "log_wrapper" +version = "0.1.0" +source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a" + [[package]] name = "loom" version = "0.7.2" @@ -3786,11 +3817,13 @@ name = "simlin" version = "0.1.0" dependencies = [ "anyhow", + "checked", "mimalloc", "prost", "serde", "serde_json", "simlin-engine", + "wasm-interpreter", ] [[package]] @@ -3812,6 +3845,7 @@ dependencies = [ "base64", "bumpalo", "calamine", + "checked", "criterion", "csv", "ed25519", @@ -3836,6 +3870,8 @@ dependencies = [ "tempfile", "test-generator", "unicode-xid", + "wasm-encoder", + "wasm-interpreter", "xmutil", ] @@ -4783,6 +4819,15 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "wasm-interpreter" +version = "0.2.0" +source = "git+https://github.com/DLR-FT/wasm-interpreter.git?rev=64cedbba603edfd64cbb6b5a19f5fa34530bb03a#64cedbba603edfd64cbb6b5a19f5fa34530bb03a" +dependencies = [ + "libm", + "log_wrapper", +] + [[package]] name = "wasm-metadata" version = "0.244.0" diff --git a/docs/README.md b/docs/README.md index ea9715bb8..c0e7a573f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -30,8 +30,10 @@ - [design-plans/2026-05-11-ltm-arrays-hardening.md](design-plans/2026-05-11-ltm-arrays-hardening.md) -- Arrayed/cross-element LTM hardening: unify the 
reference-site walkers behind one classification IR (#520), then layer eight fixes (#487, #511, #510, #514, #515, #483, #502, #492) - [design-plans/2026-05-13-macros.md](design-plans/2026-05-13-macros.md) -- Vensim macro support: macros as a data-driven generalization of the stdlib module mechanism, persisted via a `MacroSpec` marker on `Model`; 7 implementation phases - [design-plans/2026-05-19-clearn-residual.md](design-plans/2026-05-19-clearn-residual.md) -- Close C-LEARN's residual (#590/#591) as general Vensim import/simulation primitives: arrayed inline graphical functions, import-time macro shadowing, user-macro INITIAL recurrence, residual attribution; 5 phases + - [design-plans/2026-05-20-wasm-backend.md](design-plans/2026-05-20-wasm-backend.md) -- WebAssembly code-generation backend: compile a model to one self-contained wasm module as an alternative to the bytecode VM (for fast interactive re-simulation), validated to full VM parity; 8 phases - [plans/](plans/README.md) -- Implementation plans (active and completed) - [test-plans/](test-plans/) -- Human verification plans for completed features + - [test-plans/2026-05-20-wasm-backend.md](test-plans/2026-05-20-wasm-backend.md) -- Manual verification for the WebAssembly simulation backend: the heavy `#[ignore]`d parity twins (C-LEARN vs `Ref.vdf`, WORLD3), driving the libsimlin FFI from a real host, and the AC3.3 deliberate-regression check (the bytecode VM is the automated oracle for everything else) - `implementation-plans/` -- Detailed phase-by-phase implementation plans, created during plan execution ## Security diff --git a/docs/architecture.md b/docs/architecture.md index 9772c6b9d..4693a7a61 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -14,6 +14,7 @@ Core simulation engine. 
Compiles, type-checks, unit-checks, and simulates SD mod - Primary compilation path is `db::compile_project_incremental()` using salsa tracked functions for fine-grained incrementality (`db.rs`, `db_analysis.rs`, `db_ltm.rs`, `db_ltm_ir.rs`) - Equation text is parsed via recursive descent parser (`parser/mod.rs`) - Simulations run on a stack-based bytecode VM (`vm.rs`) with `PREVIOUS`/`INIT` intrinsic opcodes +- An alternative WebAssembly code-generation backend (`wasmgen/`) lowers a compiled model to one self-contained wasm module (no host imports) for fast repeated re-simulation; the VM stays the correctness oracle (every emitted module is checked against it). Surfaced through libsimlin `simlin_model_compile_to_wasm` - `builtins.rs` defines builtin functions (including `PREVIOUS`, `INIT`); stateful module functions (TREND, SMOOTH3) are model definitions in `stdlib/*.stmx`, generated into `stdlib.gen.rs` - Native Vensim MDL parser in `mdl/` (replaces C++ xmutil); see [docs/design/mdl-parser.md](/docs/design/mdl-parser.md) diff --git a/docs/design-plans/2026-05-20-wasm-backend.md b/docs/design-plans/2026-05-20-wasm-backend.md index 8cdb899f5..43f7cd420 100644 --- a/docs/design-plans/2026-05-20-wasm-backend.md +++ b/docs/design-plans/2026-05-20-wasm-backend.md @@ -125,7 +125,7 @@ Turn the validated proof-of-concept (branch `wasm-backend-poc`) into a full, cor ## Architecture -The backend translates the engine's compiled bytecode into an equivalent WebAssembly module, mirroring the bytecode VM (`src/simlin-engine/src/vm.rs`) opcode-for-opcode. It consumes the public salsa output `compile_project_incremental(db, project, model) -> CompiledSimulation` (`vm.rs:134`) — the same value `Vm::new` consumes — so no salsa-internal queries are touched and all engine assembly (dependency ordering, model-global offset resolution, recurrence-SCC handling, graphical-function layout, module instantiation, implicit SMOOTH/DELAY variables) is inherited unchanged. 
+The backend translates the engine's compiled bytecode into an equivalent WebAssembly module, mirroring the bytecode VM (`src/simlin-engine/src/vm.rs`) opcode-for-opcode. It consumes the public salsa output `compile_project_incremental(db, project, model) -> CompiledSimulation` (`db.rs:5886`, returning the `CompiledSimulation` defined at `vm.rs:134`) — the same value `Vm::new` consumes — so no salsa-internal queries are touched and all engine assembly (dependency ordering, model-global offset resolution, recurrence-SCC handling, graphical-function layout, module instantiation, implicit SMOOTH/DELAY variables) is inherited unchanged. `CompiledSimulation` is `{ modules: HashMap<ModuleKey, CompiledModule>, specs: Specs, root: ModuleKey, offsets: HashMap<Ident<Canonical>, usize> }`. Each `CompiledModule` (`bytecode.rs:4616`) holds three opcode programs (`compiled_initials`, `compiled_flows`, `compiled_stocks`), per-program `literals`, and a shared `ByteCodeContext` (`bytecode.rs:1585`: graphical-function tables, module declarations, dimensions, temp-array sizes, static array views). It is the *un-fused* form — the 3-address `fuse_three_address` pass runs later in `Vm::new` — so the backend translates the plain opcode set only. diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_01.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_01.md new file mode 100644 index 000000000..e21a68276 --- /dev/null +++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_01.md @@ -0,0 +1,330 @@ +# WebAssembly Simulation Backend — Phase 1: Bytecode-to-wasm scalar core + parity harness + +**Goal:** Restructure the `wasmgen` proof-of-concept so it consumes the salsa-compiled bytecode (`CompiledSimulation`) instead of the monolithic `compiler::Module`/`Expr` IR, lower the scalar-core opcode set + the Euler integration loop to a self-contained wasm module, and stand up the dual VM-vs-wasm parity gate in `tests/simulate.rs`. 
+ +**Architecture:** The bytecode VM (`src/simlin-engine/src/vm.rs`) is a stack machine over a flat f64 "slab" in linear memory; wasm is also a stack machine over linear memory, so each `Opcode` lowers to a short, mostly 1:1 wasm instruction sequence operating on the wasm operand stack. The backend walks the un-fused opcode programs of each `CompiledModule` (`compiled_initials`/`compiled_flows`/`compiled_stocks`) and emits three wasm functions, then a `run` function that seeds the reserved globals + initials and drives the Euler loop, writing step-major snapshots into a results region. The module exports `memory`, `run`, and three i32 geometry globals (`n_slots`, `n_chunks`, `results_offset`); a `WasmLayout` (variable-name→slot-offset map) is returned alongside the bytes for host-side by-name reads. + +**Tech Stack:** Rust; `wasm-encoder` 0.244 (module emission); the DLR-FT `wasm-interpreter` (`wasm::validate`) + `checked::Store` (host run) as the in-test execution oracle; the existing `compile_project_incremental` salsa pipeline; the `tests/simulate.rs` corpus harness. + +**Scope:** Phase 1 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`. + +**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`). + +--- + +## Acceptance Criteria Coverage + +This phase implements and tests: + +### wasm-backend.AC1: The wasm backend reproduces the VM's simulation results +- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` against the model's expected outputs at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.) *(Phase 1 covers scalar, Euler models; later phases widen the supported set.)* +- **wasm-backend.AC1.4 Failure:** A model using a not-yet-supported construct returns `WasmGenError::Unsupported` — a clean error, never a panic or a silently wrong result. 
+- **wasm-backend.AC1.5 Edge:** Empty-view reducers, out-of-bounds subscripts, and division-by-zero produce the same NaN / finite-`:NA:` / Inf values the VM produces. *(Phase 1 covers the division-by-zero portion — raw `Op2::Div`; the empty-reducer/OOB and finite-`:NA:`-vs-NaN portions complete in Phases 5 and 2.)* + +### wasm-backend.AC2: The backend consumes the salsa compiled bytecode +- **wasm-backend.AC2.1 Success:** The wasm module is produced from `compile_project_incremental(...) -> CompiledSimulation`, not from the `Expr` IR or the monolithic `compiler::Module`. +- **wasm-backend.AC2.2 Success:** The POC's `#[cfg(test)]` un-gating of the monolithic builder is reverted; the crate builds with `Module::new`/`build_metadata`/`calc_n_slots`/`calc_module_model_map` test-only again. + +### wasm-backend.AC3: simulate.rs runs the corpus through both backends +- **wasm-backend.AC3.1 Success:** During rollout, each corpus model runs through the VM and (when supported) the wasm backend, comparing wasm-vs-VM; unsupported models are skipped (not failed) and counted against a monotonically rising floor. + +### wasm-backend.AC4: Self-describing results + efficient by-name retrieval +- **wasm-backend.AC4.1 Success:** The blob exports `n_slots`/`n_chunks`/`results_offset` and writes step-major snapshots; a host locates and strides the results with no external metadata. + +### wasm-backend.AC7: Numeric-parity specifics +- **wasm-backend.AC7.4 Success:** Euler, RK2, and RK4 each match the VM's saved samples (cadence and values); `PREVIOUS`/`INIT` match via the snapshot regions. 
*(Phase 1 establishes the Euler cadence/values portion only; RK2/RK4 + PREVIOUS/INIT complete this AC in Phase 4.)* + +### wasm-backend.AC8: Engineering quality (cross-cutting) +- **wasm-backend.AC8.1 / AC8.2** are satisfied cross-cuttingly across every phase rather than headered per-phase: each functionality task is TDD with inline `#[cfg(test)]` unit tests that execute emitted wasm under the DLR-FT interpreter, each opcode/feature group is individually tested toward ≥95% coverage, and every phase ends with passing tests for the ACs it claims (its "Done When"). + +--- + +## Notes for the implementer (read first) + +- **The VM is the executable spec.** Every opcode's wasm lowering must reproduce the matching arm of `vm.rs`. Cite-and-mirror, do not invent. Key references confirmed during planning: + - The `Opcode` enum: `src/simlin-engine/src/bytecode.rs:561`. The scalar-core variants are `Op2 { op: Op2 }`, `Not {}`, `LoadConstant { id: LiteralId }`, `LoadVar { off: VariableOffset }`, `LoadGlobalVar { off: VariableOffset }`, `SetCond {}`, `If {}`, `AssignCurr { off }`, `AssignNext { off }`, `Ret`. (`LiteralId`/`VariableOffset` are `u16`.) + - `Op2` enum: `bytecode.rs:526` — `Add, Sub, Exp, Mul, Div, Mod, Gt, Gte, Lt, Lte, Eq, And, Or`. **There is no `Neq`** (the AST `Neq` lowers to `Eq` then `Not`). The VM's `eval_op2` is `vm.rs:94-111`. + - `is_truthy(n) = !crate::float::approx_eq(n, 0.0)` — `vm.rs:89`. + - The Euler loop and the `save_advance!` macro — `vm.rs:631-711` (Euler arm `vm.rs:698-711`; `save_advance!` `vm.rs:675-695`). + - Reserved global slots `TIME_OFF=0`, `DT_OFF=1`, `INITIAL_TIME_OFF=2`, `FINAL_TIME_OFF=3`, `IMPLICIT_VAR_COUNT=4` — `vm.rs:83-87`. 
+- **`CompiledSimulation` shape** (`vm.rs:132-140`), all fields `pub(crate)` (the in-crate `wasmgen` module reads them directly): + - `modules: HashMap<ModuleKey, CompiledModule>`, `specs: Specs`, `root: ModuleKey`, `offsets: HashMap<Ident<Canonical>, usize>` (the global var-name→slot map — this becomes `WasmLayout.var_offsets`), plus a private `cached_constant_info` (ignore until Phase 7). + - `ModuleKey = (Ident<Canonical>, BTreeSet<Ident<Canonical>>)` (`vm.rs:24`). + - `CompiledModule` (`bytecode.rs:4616`): `ident`, `n_slots: usize`, `context: Arc<ByteCodeContext>`, `compiled_initials: Arc<Vec<CompiledInitial>>`, `compiled_flows: Arc<ByteCode>`, `compiled_stocks: Arc<ByteCode>`. + - `ByteCode { literals: Vec<f64>, code: Vec<Opcode> }` (`bytecode.rs:1702`). **`literals` live inside each `ByteCode`**, not on `CompiledModule`. `CompiledInitial { ident, offsets: Vec<usize>, bytecode: ByteCode }` (`bytecode.rs:4603`) — initials are a **vector of per-variable programs**, each its own `ByteCode`. + - `Specs` (`results.rs:22`): `start`, `stop`, `dt`, `save_step`, `method: Method`, `n_chunks: usize`. `Method` is `Euler | RungeKutta2 | RungeKutta4`. +- **The opcode programs are un-fused.** `fuse_three_address` runs inside `Vm::new` (`vm.rs:397`), *after* `CompiledSimulation` is produced, on the VM's private execution copy. A `CompiledSimulation` consumer only ever sees the plain opcode set above — never `BinVarVar`, `AssignConstCurr`, etc. The emitter does not need to handle the fused/superinstruction opcodes; if one is ever encountered, return `WasmGenError::Unsupported`. 
+- **DLR-FT oracle pattern** (used by every wasm-executing test), confirmed verbatim at `wasmgen/module.rs:392-422`: + ```rust + use checked::Store; + use wasm::validate; + let info = validate(&wasm_bytes).expect("module must validate"); + let mut store = Store::new(()); + let inst = store.module_instantiate(&info, Vec::new(), None).expect("instantiate").module_addr; + let run = store.instance_export(inst, "run").unwrap().as_func().unwrap(); + store.invoke_simple_typed::<(), ()>(run, ()).expect("run wasm"); + let mem = store.instance_export(inst, "memory").unwrap().as_mem().unwrap(); + let data: Vec<f64> = store.mem_access_mut_slice(mem, |bytes| { /* read f64 LE at byte offsets */ }); + ``` +- **Visibility latitude (per the repo owner):** widen any engine item to `pub(crate)` — or `pub` where the `tests/` parity harness (a crate-external target) needs it — wherever it produces a cleaner backend. The repo has no external API consumers; breaking changes are fine if tests pass. Do not contort the design to avoid touching visibility. (`compile_project_incremental`, `db::sync_from_datamodel_incremental`, `SimlinDb`, `Results`, and the new `compile_simulation`/`WasmArtifact`/`WasmLayout` must be reachable from `tests/`; make them `pub`.) +- **TDD, 95%+ coverage, inline `#[cfg(test)] mod tests`.** Each unit test that executes wasm builds a tiny module, runs it under the DLR-FT interpreter, and asserts on memory/return values. Keep each test < 2s (the suite runs under a 3-minute wall-clock cap; `docs/dev/rust.md:13-17`). Run the engine tests with `cargo test -p simlin-engine --features file_io` (the corpus tests are gated on `file_io`; bare `cargo test`/`cargo test --workspace` also activate it via workspace feature unification). +- **Addressing scheme (uniform across all phases, module-ready).** The per-program wasm functions take a single `i32` parameter `module_off` (slot base of this module instance within a chunk; `0` for the root in Phase 1). 
A module-relative slot `off` resolves to byte address `chunk_base + (module_off + off) * 8`, emitted as: push the dynamic part `local.get module_off; i32.const 8; i32.mul`, then `f64.load`/`f64.store` with `memarg.offset = chunk_base + off*8` (a compile-time constant) and `memarg.align = 3`. An **absolute global** slot (`LoadGlobalVar`, slots 0..4) ignores `module_off`: `i32.const 0; f64.load memarg{offset: chunk_base + off*8}`. Using `module_off` from Phase 1 (always 0 for the root) avoids a Phase 7 rewrite. `chunk_base` is `curr_base` for `LoadVar`/`LoadGlobalVar`/`AssignCurr`, `next_base` for `AssignNext`. + +--- + + + + +### Task 1: Scalar-core opcode emitter (`wasmgen/lower.rs`) + +**Verifies:** wasm-backend.AC2.1 (consumes bytecode opcodes, not `Expr`); wasm-backend.AC1.4 (unsupported opcodes return a clean `WasmGenError::Unsupported`); wasm-backend.AC1.5 (raw `Op2::Div` by zero). + +**Files:** +- Create: `src/simlin-engine/src/wasmgen/lower.rs` +- Modify: `src/simlin-engine/src/wasmgen/mod.rs` (add `mod lower;`) +- Test: inline `#[cfg(test)] mod tests` in `wasmgen/lower.rs` + +**Implementation:** +Create the per-opcode emitter that walks a `&crate::bytecode::ByteCode` and appends wasm instructions to a `wasm_encoder::Function`, mirroring `eval_bytecode` (`vm.rs:1257+`). Reuse the POC's `EmitCtx`/`memarg`/`f64_const` helpers (currently in `wasmgen/expr.rs`) but generalize `EmitCtx` to carry `module_off` handling per the addressing scheme above. + +Define: +```rust +pub(crate) struct EmitCtx { + pub curr_base: u32, // byte offset of slot 0 of the curr chunk + pub next_base: u32, // byte offset of slot 0 of the next chunk + pub dt: f64, + pub start_time: f64, + pub final_time: f64, + pub module_off_local: u32, // wasm local index holding this instance's module_off (i32) +} +``` + +`pub(crate) fn emit_bytecode(bc: &ByteCode, ctx: &EmitCtx, f: &mut Function) -> Result<(), WasmGenError>`: +walk `bc.code` in order; for each `Opcode` emit wasm. 
A scratch f64 local (reserved by the caller; pass its index in `EmitCtx` or as an arg) is needed for `AssignCurr`/`AssignNext` (the value is already on the wasm stack and the store address must be pushed under it). + +Per-opcode lowering (Phase 1 supported set; everything else → `WasmGenError::Unsupported(format!(...))`): + +| Opcode | wasm emitted | +|---|---| +| `LoadConstant { id }` | `f64.const bc.literals[id as usize]` | +| `LoadVar { off }` | address(`curr_base`, `off`, dynamic `module_off`); `f64.load` | +| `LoadGlobalVar { off }` | `i32.const 0; f64.load memarg{curr_base + off*8}` (absolute, no `module_off`) | +| `Op2 { op }` | operands already on stack. `Add/Sub/Mul/Div` → `f64.add/sub/mul/div`. `Gt/Gte/Lt/Lte` → `f64.gt/ge/lt/le` then convert the i32 0/1 to f64 (`f64.convert_i32_u`) so booleans stay f64 1.0/0.0 like the VM. `Eq/And/Or/Mod/Exp` → `Unsupported` (Phase 2). | +| `Not {}` | operand on stack; truthiness-negate. Phase 1 uses simple `value == 0.0` (`f64.const 0.0; f64.eq; f64.convert_i32_u`), matching the POC; Phase 2 routes through the `approx_eq` helper. | +| `SetCond {}` | pop the f64 condition; reduce to i32 truthiness (Phase 1: `f64.const 0.0; f64.ne` → i32) and `local.set` into a reserved i32 "condition" local. | +| `If {}` | the two arm values (`t` then `f`) are already on the wasm stack from preceding opcodes; emit `local.get <cond_local>; select`, where `<cond_local>` is the i32 condition local written by the matching `SetCond`. wasm `select` pops `[t, f, cond_i32]` and yields `t` if `cond != 0` else `f` — exactly the VM's `If` (`push(if condition { t } else { f })`). | +| `AssignCurr { off }` | pop value into the scratch f64 local; emit address(`curr_base`, `off`, `module_off`); `local.get scratch`; `f64.store`. | +| `AssignNext { off }` | same as `AssignCurr` but `next_base`. | +| `Ret` | emit nothing (the wasm function's `End` is emitted by the caller). 
| + +**Critical correctness notes** (all confirmed against the VM): +- `SetCond` is a *separate opcode* that sets a condition register read by `If`; they are always emitted adjacently by codegen but the emitter must reserve a dedicated i32 local for the condition. Nesting: an inner `If` can occur between an outer `SetCond` and its `If`, so use a **stack of condition locals** (push on `SetCond`, pop on `If`) rather than a single local, to be safe — confirm against `compiler/codegen.rs:1153-1159` that emission is well-nested; if codegen guarantees `SetCond` immediately precedes its `If` with no intervening `SetCond`, a single local suffices. Default to the local-stack to be robust. +- `Op2` operand order: the VM pops `r` then `l` and computes `l op r`; wasm leaves them in push order `[l, r]` on the stack, so `f64.sub`/`f64.div` (non-commutative) are already correct. +- Comparisons must yield f64 `1.0`/`0.0` (not raw i32), because downstream opcodes consume them as f64. + +**Testing:** +Hand-build small `ByteCode` values (`ByteCode { literals, code }` — fields are `pub(crate)`, reachable in-crate) wrapping each opcode/sequence, wrap in a one-function test module that exports `eval`/`mem` (mirror the harness in the current `wasmgen/expr.rs:300-396`), execute under the DLR-FT interpreter, and assert. Cover: +- wasm-backend.AC2.1: each scalar-core opcode (`LoadConstant`, `LoadVar`, `LoadGlobalVar`, every supported `Op2`, `Not`, `SetCond`+`If` true/false, `AssignCurr`, `AssignNext`) lowers and produces the value/store the VM's `eval_op2`/handler produces. +- `If` selecting the correct arm for truthy and zero conditions; nested `If`. +- wasm-backend.AC1.5: raw `Op2::Div` by zero matches the VM (`x/0` → ±Inf, `0/0` → NaN — IEEE-identical, since wasm `f64.div` matches Rust `f64`). +- wasm-backend.AC1.4: unsupported opcodes (`Op2::Eq`, `Op2::Mod`, `Apply`, `Lookup`, an array opcode) return `WasmGenError::Unsupported` (a clean error, never a panic). 
+ +**Verification:** +Run: `cargo test -p simlin-engine --features file_io wasmgen::lower` +Expected: all new tests pass. + +**Commit:** `engine: wasmgen scalar-core opcode emitter over bytecode` + + + +### Task 2: `compile_simulation` — whole-model assembly (root, Euler) + +**Verifies:** wasm-backend.AC2.1, wasm-backend.AC4.1, wasm-backend.AC7.4 (Euler portion). + +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/module.rs` (add the new `compile_simulation` path + `WasmArtifact`/`WasmLayout`; the old `compile_module(&Module, &Specs)` is removed in Task 3) +- Modify: `src/simlin-engine/src/wasmgen/mod.rs` (export the new types/fn) +- Test: inline `#[cfg(test)] mod tests` in `wasmgen/module.rs` + +**Implementation:** +Add the public contract types and entry point. Place the types in `mod.rs` (or `module.rs` and re-export); make them `pub`: +```rust +pub struct WasmArtifact { + pub wasm: Vec<u8>, + pub layout: WasmLayout, +} + +pub struct WasmLayout { + pub n_slots: usize, + pub n_chunks: usize, + pub results_offset: usize, // byte offset of the results region + pub var_offsets: Vec<(String, usize)>, // canonical variable name -> slot offset +} + +pub fn compile_simulation(sim: &CompiledSimulation) -> Result<WasmArtifact, WasmGenError>; +``` + +`compile_simulation` (Phase 1 supports the root module only, Euler only): +1. Look up the root `CompiledModule` via `sim.modules.get(&sim.root)`. Return `Unsupported` if `sim.specs.method != Method::Euler`. Return `Unsupported` if the root has any nested modules (`root.context.modules` non-empty) — modules land in Phase 7. +2. Compute layout: `n_slots = root.n_slots`, `n_chunks = sim.specs.n_chunks`, `stride = n_slots*8`, `curr_base = 0`, `next_base = stride`, `results_base = 2*stride`, `pages = ceil((results_base + n_chunks*stride)/65536)`. (Mirror the POC's `compile_module`, `module.rs:72-85`.) `save_every = max(1, round(save_step/dt))`. +3. 
Emit three wasm functions over the shared linear memory, each `(module_off: i32) -> ()`: + - **initials**: for each `CompiledInitial` in `root.compiled_initials`, `emit_bytecode(&ci.bytecode, ...)` in order. + - **flows**: `emit_bytecode(&root.compiled_flows, ...)`. + - **stocks**: `emit_bytecode(&root.compiled_stocks, ...)`. + Each function reserves the scratch f64 local + condition i32 local(s) the emitter needs. +4. Emit the **`run`** function (`() -> ()`): seed `curr[TIME_OFF]=start`, `curr[DT_OFF]=dt`, `curr[INITIAL_TIME_OFF]=start`, `curr[FINAL_TIME_OFF]=stop`; `call initials(0)`; then the Euler loop mirroring `vm.rs:698-711` + `save_advance!` (`vm.rs:675-695`): each step call `flows(0)` then `stocks(0)`, force-save the t=start sample then every `save_every` steps, write the full `curr` row (all `n_slots`) into `results[saved]`, advance stocks `next→curr` and `time += dt`, stop after `n_chunks` saves or when `time > stop`. The POC's `emit_run` (`module.rs:172-286`) is a correct reference for this control-flow shape — adapt it to call the three opcode-emitted functions instead of inlining `Expr` lowering, and to derive the stock copy-back offsets from the `AssignNext` opcodes in `root.compiled_stocks` (collect their `off`, analogous to the POC's `collect_assign_next_offsets`, `module.rs:139-147`). +5. Assemble the module (Type/Function/Memory/Global/Export/Code sections per the POC's `assemble`, `module.rs:293-338`): export `memory`, `run`, and three immutable i32 globals `n_slots`/`n_chunks`/`results_offset` (= `results_base`). With multiple functions, emit a type section entry for `(i32)->()` and `()->()`, a function section indexing them, and export `run` by its function index. +6. Build `WasmLayout`: `var_offsets = sim.offsets.iter().map(|(k,v)| (k.as_str().to_string(), *v)).collect()`; `n_slots`, `n_chunks`, `results_offset = results_base`. 
+ +**Testing:** +- wasm-backend.AC2.1 + AC7.4(Euler): build a `CompiledSimulation` for a small scalar Euler model via `compile_project_incremental` (mirror `wasmgen/module.rs:367-373`) — e.g. the `default_projects/population/model.xmile` already used by the POC test, and 1-2 hand-built scalar models via `TestProject` (`src/simlin-engine/src/test_common.rs`). Run the blob under DLR-FT, read the step-major slab, and assert every shared variable's full series matches `Vm::new(sim).run_to_end().into_results()` (reuse the comparison shape from `module.rs:425-457`). Assert `step_count == n_chunks` and the saved cadence matches. +- wasm-backend.AC4.1: a dedicated test reads the three exported i32 globals from the instantiated module (via the `checked` crate's `instance_export(inst, "n_slots").as_global()` accessor) and verifies they equal the `WasmLayout` values; then uses `results_offset`/`n_slots`/`n_chunks` (read from the module, no external metadata) to stride to one variable's series and confirm it matches the VM. + +**Verification:** +Run: `cargo test -p simlin-engine --features file_io wasmgen::module` +Expected: all new tests pass. + +**Commit:** `engine: wasmgen compile_simulation (root, Euler) over CompiledSimulation` + + + +### Task 3: Reroute the datamodel entry point; remove the `Expr`-based path + +**Verifies:** wasm-backend.AC2.1. 
+ +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/module.rs` (replace `compile_datamodel_to_wasm` body; remove `compile_module(&Module, &Specs)` and the `collect_assign_next_offsets(&[Expr])`/`store_curr_const`/`emit_run` helpers that consumed `Expr`/`compiler::Module`) +- Delete: `src/simlin-engine/src/wasmgen/expr.rs` +- Modify: `src/simlin-engine/src/wasmgen/mod.rs` (remove `mod expr;`, update `pub use`) + +**Implementation:** +Rewrite `compile_datamodel_to_wasm(datamodel, model_name) -> Result<Vec<u8>, WasmGenError>` to go through the salsa pipeline and the new entry point (this is what makes AC2.1 true end-to-end and removes the only production use of `compiler::Module`): +```rust +pub fn compile_datamodel_to_wasm(datamodel: &crate::datamodel::Project, model_name: &str) + -> Result<Vec<u8>, WasmGenError> +{ + let mut db = crate::db::SimlinDb::default(); + let sync = crate::db::sync_from_datamodel_incremental(&mut db, datamodel, None); + let sim = crate::db::compile_project_incremental(&db, sync.project, model_name) + .map_err(|e| WasmGenError::Unsupported(format!("wasmgen: incremental compile failed: {e:?}")))?; + Ok(compile_simulation(&sim)?.wasm) +} +``` +(The `WasmLayout` is dropped here; Phase 7 changes the FFI to surface it. Keep this function's signature stable so `libsimlin` and the `wasm-backend-poc.mjs` exploratory script keep building.) + +Delete `wasmgen/expr.rs` entirely (its `Expr`-tree lowering is replaced by `lower.rs`'s opcode emitter). Move the still-needed shared helpers (`memarg`, `f64_const`) into `lower.rs` if not already there. Replace the old `population_wasm_matches_vm` test so it builds the wasm via `compile_simulation(&compiled)` (the same `compiled` it already produces for the VM golden at `module.rs:369-373`) rather than `compile_module(&module, &specs)`; drop the monolithic `compiler::Module::new` usage from the test. + +**Testing:** +- The rerouted `population_wasm_matches_vm` (now compiling via `compile_simulation`) passes. 
+- Add a test that `compile_datamodel_to_wasm` returns a non-empty blob for the population model and that the blob validates under `wasm::validate`. + +**Verification:** +Run: `cargo test -p simlin-engine --features file_io wasmgen` +Expected: all wasmgen tests pass; `wasmgen/expr.rs` no longer exists; no references to `crate::compiler::Module` remain in `wasmgen/`. + +**Commit:** `engine: route wasmgen through compile_simulation; drop Expr path` + + + +### Task 4: Revert the monolithic-compiler `#[cfg(test)]` un-gating + +**Verifies:** wasm-backend.AC2.2. + +**Files:** +- Modify: `src/simlin-engine/src/compiler/mod.rs` + +**Implementation:** +The POC removed `#[cfg(test)]` from the monolithic builder so the `Expr`-based wasmgen could use it in production. Now that wasmgen consumes `CompiledSimulation`, re-gate it (restoring `main`'s state). Re-add `#[cfg(test)]` to: +- the four imports the POC un-gated at `compiler/mod.rs:16-29` (`use crate::common::{Error, ErrorCode, ErrorKind};`, `use crate::model::ModelStage1;`, `use crate::project::Project;`, `use crate::vm::IMPLICIT_VAR_COUNT;` — confirm exact set against `git diff main -- src/simlin-engine/src/compiler/mod.rs`), +- `calc_module_model_map` (`mod.rs:2660`, currently `pub(crate) fn`), +- `build_metadata` (`mod.rs:2694`, currently `pub(crate) fn`), +- `calc_n_slots` (`mod.rs:2830`, currently bare-private `fn`), +- the `impl Module { fn new }` block (`mod.rs:2849`, `pub(crate) fn new`). + +Use `git diff main -- src/simlin-engine/src/compiler/mod.rs` to see precisely what the POC changed and invert exactly that diff (do **not** touch the separate pre-existing `#[cfg(test)] impl Module` test-helper block at `mod.rs:3046`, nor the non-test `impl Module { pub fn compile() }` at `mod.rs:2839`). + +**Testing:** +This is a visibility/gating revert verified operationally (no new behavior; **Verifies: AC2.2** via build state). The existing `#[cfg(test)]` users of `Module::new` (and the test suite) continue to compile. 
+ +**Verification:** +Run: `cargo build -p simlin-engine` — builds with the four items test-only again (a non-test build no longer references them). +Run: `cargo test -p simlin-engine --features file_io` — compiles and passes (test code still reaches the now-`#[cfg(test)]` builder). +Run: `git diff main -- src/simlin-engine/src/compiler/mod.rs` — shows only the re-gating (the POC's un-gating is fully inverted). + +**Commit:** `engine: re-gate monolithic compiler builder to test-only` + + + + + + +### Task 5: `ensure_wasm_matches` parity helper + +**Verifies:** wasm-backend.AC1.1, wasm-backend.AC3.1. + +**Files:** +- Modify: `src/simlin-engine/tests/test_helpers.rs` (add the helper + a `WasmRunOutcome` type; add the `checked`/`wasm` imports) +- (If `compile_simulation`/`WasmArtifact`/`WasmLayout`/`sync_from_datamodel_incremental`/`compile_project_incremental`/`SimlinDb` are not `pub`, widen them to `pub` so this `tests/` target can call them.) + +**Implementation:** +Add a helper that compiles a model to wasm, runs it under the DLR-FT interpreter, builds a `Results` from the step-major slab, and compares it to the model's expected outputs with the **existing** comparator (`ensure_results_excluding`, `test_helpers.rs:62`) — the same check the VM passes. There is no separate wasm-vs-VM threshold (per the design's validation bar); "wasm-vs-VM parity" is achieved because both clear the same comparator against the same expected outputs. + +```rust +pub enum WasmRunOutcome { Ran, Skipped(String) } // Skipped carries the Unsupported message + +pub fn ensure_wasm_matches( + datamodel: &simlin_engine::datamodel::Project, + model_name: &str, + expected: &simlin_engine::Results, + excluded: &[&str], +) -> WasmRunOutcome +``` +Steps: +1. Build `CompiledSimulation` exactly as the VM corpus path does (`simulate.rs:105-111` `compile_vm`): `SimlinDb::default()` → `sync_from_datamodel_incremental` → `compile_project_incremental(&db, sync.project, model_name)`. 
(If the incremental compile itself errors, that is a VM-side issue already covered elsewhere — return `Skipped` with the message rather than failing here.) +2. `let artifact = match simlin_engine::wasmgen::compile_simulation(&sim) { Ok(a) => a, Err(WasmGenError::Unsupported(m)) => return WasmRunOutcome::Skipped(m) };` +3. Instantiate `artifact.wasm` under `checked::Store`, invoke `run`, and read the results region. Read geometry from `artifact.layout` (`n_slots`, `n_chunks`, `results_offset`) — copy `n_chunks * n_slots` f64 from `results_offset`. +4. Build a `simlin_engine::Results`: `offsets` from `artifact.layout.var_offsets` (map each `String` back to `Ident` via the canonicalizing constructor), `data` = the slab (boxed), `step_size = n_slots`, `step_count = n_chunks`, `specs = sim.specs.clone()`, `is_vensim = false`. +5. `ensure_results_excluding(expected, &wasm_results, excluded);` (panics on mismatch — a supported model producing wrong wasm fails loudly). Return `WasmRunOutcome::Ran`. + +**Testing:** +This helper is exercised by Task 6's corpus wiring and by a focused unit test here: call `ensure_wasm_matches` on one tiny scalar model (build its `expected` from the VM) and assert it returns `Ran`; call it on a model using an unsupported construct (e.g. a builtin/`Apply`) and assert it returns `Skipped`. (AC1.1: a supported model clears `ensure_results`; AC3.1: an unsupported model is skipped, not failed.) + +**Verification:** +Run: `cargo test -p simlin-engine --features file_io --test simulate ensure_wasm_matches` +Expected: helper unit tests pass. + +**Commit:** `engine: add ensure_wasm_matches parity helper` + + + +### Task 6: Wire the corpus through both backends + the rising floor gate + +**Verifies:** wasm-backend.AC1.1, wasm-backend.AC3.1, wasm-backend.AC4.1. + +**Files:** +- Modify: `src/simlin-engine/tests/simulate.rs` + +**Implementation:** +1. 
**Inline hook:** in `simulate_path_with_excluding` (`simulate.rs:843-915`), after the existing VM `ensure_results_excluding` comparisons pass, call `ensure_wasm_matches(&datamodel, "main", &expected, excluded)` once per model. A `Ran` outcome means the wasm output already cleared `ensure_results` inside the helper (a supported-but-wrong model panics there); a `Skipped` outcome is recorded, not failed. Do the same in the `.mdl` path (`simulate_mdl_path*`). Do **not** add the hook to `run_clearn_vs_vdf`/`simulates_clearn` or other `#[ignore]` heavy-model paths — those get `#[ignore]`d wasm twins in Phase 8 (the DLR-FT interpreter is slow; keep the default suite under the 3-minute cap). +2. **Floor gate:** add `const WASM_SUPPORTED_FLOOR: usize = <N>;` (where `<N>` is the pinned count, see below) and a `#[test] fn wasm_parity_floor()` that iterates the small/medium corpus list (`TEST_MODELS`, `simulate.rs:22-101`, skipping any entry that is itself `#[ignore]`-class/heavy), runs each through `ensure_wasm_matches` (building `expected` from the VM via the existing parse+`compile_vm`+run path), counts `Ran`, and asserts `ran >= WASM_SUPPORTED_FLOOR`. Set `WASM_SUPPORTED_FLOOR` to the count Phase 1 actually achieves (run the test once, observe, pin it). Document with a comment that each subsequent phase raises this floor and that dropping below it is a regression (AC3.1 / AC3.3). Keep the gate's total runtime within budget — if iterating all of `TEST_MODELS` under the interpreter is too slow, restrict the gate to a representative scalar subset and note it; the per-model inline hook still covers the rest functionally. + +**Testing:** +The gate test *is* the test. Also confirm (manually, noted in the commit) that at least one scalar model reports `Ran` and that introducing a deliberate `Unsupported` (temporarily) lowers the count — i.e. the floor would catch a regression. 
+ +**Verification:** +Run: `cargo test -p simlin-engine --features file_io --test simulate wasm_parity_floor` +Expected: passes with `ran >= WASM_SUPPORTED_FLOOR`. +Run: `cargo test -p simlin-engine --features file_io --test simulate` +Expected: the full corpus passes (VM unchanged; supported models also clear wasm; unsupported models skip). + +**Commit:** `engine: run corpus through wasm backend with rising floor gate` + + + +--- + +## Phase 1 Done When +- Scalar Euler corpus models match the VM through wasm (clearing the existing `ensure_results` comparator); unsupported models skip cleanly via `WasmGenError::Unsupported`. +- The floor gate (`wasm_parity_floor`) is active and pinned. +- The monolithic builder is re-gated to `#[cfg(test)]`; `cargo build -p simlin-engine` and `cargo test -p simlin-engine --features file_io` both pass. +- The blob is self-describing (exports `n_slots`/`n_chunks`/`results_offset`, step-major results) and a test reads geometry from the module to stride results (AC4.1). diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_02.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_02.md new file mode 100644 index 000000000..36f50fa19 --- /dev/null +++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_02.md @@ -0,0 +1,185 @@ +# WebAssembly Simulation Backend — Phase 2: Full scalar builtins + numeric parity + +**Goal:** Bring every scalar `BuiltinId` and `Op2` to VM parity: open-code the transcendentals wasm lacks, route equality/truthiness through a wasm `approx_eq` helper that matches `crate::float::approx_eq` exactly, and lower `Mod`/`Exp` and the composed builtins (`Step`/`Pulse`/`Ramp`/`Sshape`/`Sign`/`Quantum`/`SafeDiv`) to faithful f64 sequences. + +**Architecture:** Builds on Phase 1's opcode emitter (`wasmgen/lower.rs`). 
Math wasm provides natively (`f64.abs`/`sqrt`/`floor`/`min`/`max`/arithmetic/compares) maps to the instruction directly; the transcendentals (`sin`/`cos`/`tan`/`asin`/`acos`/`atan`/`exp`/`ln`/`log10`/`pow`) are emitted once each as self-contained wasm helper functions (range reduction + polynomial) and called by name — the blob needs no math imports. Equality and truthiness route through a single emitted `approx_eq` helper so the backend takes the same branch the VM takes. + +**Tech Stack:** Rust; `wasm-encoder` (multi-function modules, `call`); the DLR-FT interpreter oracle; `crate::float::approx_eq` (`float_cmp` 0.10) as the equality reference; the VM's `apply()` (`vm.rs:2938-3012`) and `eval_op2` (`vm.rs:94-111`) as the builtin/operator spec. + +**Scope:** Phase 2 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`. + +**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`). + +--- + +## Acceptance Criteria Coverage + +### wasm-backend.AC1 +- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` against the model's expected outputs at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.) +- **wasm-backend.AC1.5 Edge:** Empty-view reducers, out-of-bounds subscripts, and division-by-zero produce the same NaN / finite-`:NA:` / Inf values the VM produces. 
*(Phase 2 covers the finite-`:NA:`-sentinel-vs-genuine-NaN distinction via the `approx_eq` helper; the empty-reducer/OOB portions complete in Phase 5.)* + +### wasm-backend.AC7 +- **wasm-backend.AC7.1 Success:** Math wasm provides natively (`sqrt`, `abs`, `floor`/`ceil`/`trunc`/`nearest`, `min`/`max`, arithmetic) uses wasm instructions; the transcendentals wasm lacks (`sin`/`cos`/`tan`/`asin`/`acos`/`atan`/`exp`/`ln`/`log10`/`pow`) and the allocation `erfc` are open-coded as self-contained wasm helper functions (range reduction + polynomial). Each open-coded helper has a unit test comparing its output to Rust `f64` over a sampled range; results need not be bit-identical to the VM's libm — only close enough that the existing tests pass. *(The allocation `erfc`/`normal_cdf` helpers land in Phase 6; Phase 2 covers the scalar transcendentals.)* +- **wasm-backend.AC7.2 Success:** Equality and truthiness (`Eq`/`Neq`/`And`/`Or`/`If` condition) use ULP-based `approx_eq` matching the VM. +- **wasm-backend.AC7.3 Edge:** `Mod` matches the VM's `rem_euclid` semantics (computed via wasm `floor`). `Max`/`Min` use the wasm `f64.max`/`f64.min` instructions; if a corpus test surfaces a NaN/±0 difference from the VM's compare-based form, fall back to explicit compare-and-select for that case. + +--- + +## Notes for the implementer (read first) + +- **Confirmed enums.** `Op2` (`bytecode.rs:527`): `Add, Sub, Exp, Mul, Div, Mod, Gt, Gte, Lt, Lte, Eq, And, Or` (no `Neq`). `BuiltinId` (`bytecode.rs:500`): `Abs, Arccos, Arcsin, Arctan, Cos, Exp, Inf, Int, Ln, Log10, Max, Min, Pi, Pulse, Quantum, Ramp, SafeDiv, Sign, Sin, Sshape, Sqrt, Step, Tan`. **There is no `Mean` and no `IsModuleInput` `BuiltinId`** — scalar `MEAN(a,b,…)` is lowered by codegen to `(0+a+b+…)/N` using `Op2::Add`/`Op2::Div` (already handled by Phase 1/2), single-arg `MEAN(array)` becomes `ArrayMean` (Phase 5), and `IsModuleInput` is resolved to a `LoadConstant 1.0/0.0` at codegen. 
So the backend never sees a `Mean`/`IsModuleInput` opcode. +- **`Apply` always pops exactly 3 operands** (codegen pads: 1-arg builtins with two `LoadConstant 0.0`; 2-arg with one; `Ramp` pads its end-time with `LoadGlobalVar{FINAL_TIME_OFF}`). So lower `Apply{func}` by popping the 3 stack values into three scratch f64 locals `a`, `b`, `c` (top is `c`), reading `time = curr[TIME_OFF]`/`dt = curr[DT_OFF]` from memory when the builtin needs them, computing per `apply()` (`vm.rs:2938-3012`), and pushing the result. +- **`apply()` exact sequences** (mirror verbatim, `vm.rs:2938-3012`): `Abs=a.abs()`, `Sqrt=a.sqrt()`, `Int=a.floor()` (**floor, not trunc**), `Min={if a<b {a} else {b}}`, `Max={if a>b {a} else {b}}`, `Sign={if a>0 {1} else if a<0 {-1} else {0}}`, `Quantum={if b==0.0 {a} else {(a/b).trunc()*b}}`, `SafeDiv={if b != 0.0 {a/b} else {c}}` (**exact `!= 0.0`, not approx**), `Sshape=b + (c-b)/(1.0 + (-4.0*(2.0*a-1.0)).exp())`, `Exp=a.exp()`, `Ln=a.ln()`, `Log10=a.log10()`, `Sin/Cos/Tan/Arcsin/Arccos/Arctan` = the libm calls, `Inf=f64::INFINITY`, `Pi=PI`, `Step=step(time,dt,a,b)`, `Pulse=pulse(time,dt,a,b,c)`, `Ramp=ramp(time,a,b,Some(c))`. Helper bodies: `step` (`vm.rs:3027`): `if time + dt/2.0 > step_time {height} else {0.0}`; `ramp` (`vm.rs:3014`): `if time > start {if end.is_some() && time>=end {slope*(end-start)} else {slope*(time-start)}} else {0.0}`; `pulse` (`vm.rs:3036`): a `while` loop — emit it as a wasm helper function with a loop. +- **`eval_op2`** (`vm.rs:94-111`): `Exp=l.powf(r)`, `Mod=l.rem_euclid(r)`, `Eq=approx_eq(l,r) as f64`, `And=(is_truthy(l)&&is_truthy(r)) as f64`, `Or=(is_truthy(l)||is_truthy(r)) as f64`. The rest (`Add/Sub/Mul/Div/Gt/Gte/Lt/Lte`) are Phase 1. +- **`approx_eq` is `float_cmp::approx_eq!(f64, a, b)`** with `float-cmp` 0.10.0 defaults `epsilon = f64::EPSILON`, `ulps = 4`. 
Exact algorithm (must be reproduced bit-faithfully in wasm; confirmed by reading the crate): + - `a == b` → true (handles ±inf and exact equality), OR + - `(a-b).abs() <= f64::EPSILON` → true, OR + - `|ulps_diff(a,b)| <= 4` → true, + where `ulps_diff(a,b) = ordered(a).wrapping_sub(ordered(b))` as `i64` (then `saturating_abs`), and `ordered(f) = { let bits = f.to_bits() as i64; if (bits as u64) & (1<<63) != 0 { !bits ... } else { bits ^ (1<<63) } }` — i.e. map the sign-magnitude bit pattern to a monotonic ordered integer. Consequence: **`approx_eq(NaN, NaN) == true`** (identical bits → 0 ulps), and the finite `:NA:` sentinel (`crate::float::NA = -2^109`) compares unequal to ordinary values (its exponent is far from theirs). `is_truthy(n) = !approx_eq(n, 0.0)` (`vm.rs:89`). +- **`pub(crate)`/`pub` latitude** (per the repo owner): widen visibility freely. Reuse the Rust `crate::float::approx_eq` in unit tests as the oracle for the wasm helper. +- **TDD, inline `#[cfg(test)] mod tests`, < 2s per test.** Run: `cargo test -p simlin-engine --features file_io wasmgen`. + +--- + + + +### Task 1: `approx_eq` wasm helper + equality/truthiness routing + +**Verifies:** wasm-backend.AC7.2, wasm-backend.AC1.5 (the finite `:NA:` sentinel vs genuine NaN — `approx_eq` keeps them distinct). + +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (the emitter) and the module-assembly code so the helper function is emitted once and callable. +- Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +1. Emit one wasm helper function `approx_eq(a: f64, b: f64) -> i32` (returns 1/0) reproducing the algorithm above using `i64.reinterpret_f64`, `i64` arithmetic (`wrapping_sub` is plain `i64.sub`; replicate `saturating_abs` and the `ordered` bit map), `f64.eq`, `f64.sub`/`f64.abs`, and `f64.const f64::EPSILON`. Reserve a function index for it (it joins the module's function table; later phases reuse it). 
Provide a small `pub(crate)` helper in the emitter that pushes two f64 operands and emits `call approx_eq`. +2. Replace Phase 1's placeholder truthiness everywhere it matters: + - `Not {}`: `call approx_eq(value, 0.0)` → i32 `is_false` (its logical-not, `i32.eqz`, would be `is_truthy`); convert `is_false` directly to f64 1.0/0.0. (i.e. `Not` pushes `(!is_truthy) as f64` = `is_false as f64`; mirror `vm.rs` `Not` = `(!is_truthy(pop)) as f64`.) + - `SetCond {}`: `is_truthy(pop) = approx_eq(pop, 0.0) == 0` → store the i32 into the condition local. + - `Op2::Eq`: `call approx_eq(l, r)` → i32 → `f64.convert_i32_u` (f64 1.0/0.0). + - `Op2::And`: `is_truthy(l) & is_truthy(r)` → f64; `Op2::Or`: `is_truthy(l) | is_truthy(r)` → f64. (Both operands are on the stack; compute `is_truthy` of each via `approx_eq(·,0.0); i32.eqz`, combine with `i32.and`/`i32.or`, convert to f64.) + - `If {}` condition: unchanged structurally (reads the condition local set by `SetCond`), but the local now holds the `approx_eq`-based truthiness. + `Neq` is not an `Op2` (codegen lowers it to `Eq`+`Not`), so routing `Eq` through `approx_eq` automatically makes `Neq` correct. + +**Testing:** +- A unit test that emits a tiny module exporting `eq(a,b)->i32` wired to the `approx_eq` helper, runs it under DLR-FT for a curated + randomized sample of f64 pairs, and asserts the wasm result equals `crate::float::approx_eq(a,b)` for every pair. Sample must include: exact equal, far apart, 1–4 ULP apart, `f64::EPSILON`-apart around 1.0, around-zero (subnormals), `(NaN,NaN)`, `(NaN,1.0)`, `(NA, NA)`, `(NA, 0.0)`, `(+0.0,-0.0)`, `(±inf, ±inf)`. +- Tests that `Op2::Eq`, `Op2::And`, `Op2::Or`, `Not`, and `SetCond`+`If` now match the VM's `eval_op2`/`is_truthy` for near-zero / ULP-adjacent operands where raw `==`/`!=0.0` would diverge. + +**Verification:** +Run: `cargo test -p simlin-engine --features file_io wasmgen::lower` +Expected: the `approx_eq`-parity tests pass. 
+ +**Commit:** `engine: wasmgen approx_eq helper + equality/truthiness routing` + + + + + +### Task 2: Open-coded transcendental helpers + +**Verifies:** wasm-backend.AC7.1. + +**Files:** +- Create: `src/simlin-engine/src/wasmgen/math.rs` (the transcendental helper emitters) — or add to `lower.rs`; prefer a dedicated module for clarity. +- Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +Emit one self-contained wasm helper function per transcendental, each `(f64) -> f64` (or `(f64,f64)->f64` for `pow`), using range reduction + a polynomial/rational approximation. The blob imports no host math. There is no external library to integrate — this is standard numerical method work, validated against Rust `f64`. Recommended kernels (refine only if a corpus model needs more accuracy — the bar is the `simulate.rs` tolerances, abs `2e-3` / rel `5e-6` / VDF 1%): +- `exp(x)`: reduce `x = k·ln2 + r`, `|r| <= ln2/2`; `exp(x) = 2^k · exp(r)` (poly in `r`); assemble `2^k` by composing the exponent bits (`i64`→`f64` via `f64.reinterpret_i64`). Handle overflow→`+inf`, underflow→`0`, `NaN`→`NaN`. +- `ln(x)`: split `x = m · 2^e` with `m ∈ [1,2)` (decompose the f64 bits); `ln(x) = e·ln2 + ln(m)` (poly/`atanh` series in `(m-1)/(m+1)`). `x<0`→`NaN`, `x==0`→`-inf`. +- `sin(x)`/`cos(x)`: reduce modulo `π/2` (Cody–Waite or a simple `k = round(x/(π/2))` with extended-precision subtraction), choose the kernel poly by `k mod 4`. +- `atan(x)`: reduce using `atan(x) = π/2 - atan(1/x)` for `|x|>1` and a small-argument poly; sign symmetry. +- Composed: `tan = sin/cos`; `pow(x,y) = exp(y·ln x)` (matches `powf` for `x>0`; **negative-base integer powers diverge** — note this as a known limitation, refine only if a corpus model uses it); `log10(x) = ln(x)·(1/ln10)`; `asin(x) = atan(x / sqrt(1-x²))` (with domain clamping at `|x|=1`); `acos(x) = π/2 - asin(x)`. + +Wire each `BuiltinId` transcendental in the `Apply` lowering (Task 4) to `call` the matching helper. 
Emit each helper at most once per module (lazily, recording its function index). + +**Testing:** +Per AC7.1, **each helper gets a unit test comparing the emitted wasm output to Rust `f64` over a sampled range**: emit a module exporting the helper, run it under DLR-FT for a dense sample across the function's domain (and edge cases: 0, ±large, near asymptotes, the `asin`/`acos` endpoints, negative args for `ln`/`sqrt`/even roots), and assert `|wasm(x) - rust_f64(x)| <= tol` with a tol comfortably inside the `simulate.rs` tolerances (e.g. rel `1e-9`..`1e-6` depending on the function; document the chosen tol per helper and why it suffices). Include NaN/inf propagation assertions. + +**Verification:** +Run: `cargo test -p simlin-engine --features file_io wasmgen::math` +Expected: every transcendental helper's accuracy test passes. + +**Commit:** `engine: open-coded wasm transcendental helpers (exp/ln/sin/cos/atan + composed)` + + + +### Task 3: `Op2::Exp` and `Op2::Mod` + +**Verifies:** wasm-backend.AC7.3 (Mod), wasm-backend.AC1.1. + +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (extend the `Op2` arm). + +**Implementation:** +- `Op2::Exp`: operands `[l, r]` on stack → `call pow` (the Task 2 helper). Matches `l.powf(r)` for positive base. +- `Op2::Mod`: compute `rem_euclid(l, r)` faithfully (do **not** use a plain truncated remainder). `r0 = l - r * (l / r).trunc()` (the `%` result, via `f64.div`, `f64.trunc`, `f64.mul`, `f64.sub`); then `if r0 < 0.0 { r0 + r.abs() } else { r0 }` (via `f64.lt`, `f64.abs`, `f64.add`, `select`). This reproduces Rust's `f64::rem_euclid` exactly (a result in `[0, |r|)`). (The design's "via floor" phrasing is approximate; the trunc-then-adjust form matches `rem_euclid` for negative divisors too.) + +**Testing:** +- `Op2::Exp`: assert wasm matches `l.powf(r)` (via the VM) for a sample of positive bases and assorted exponents (integer, fractional, negative). 
+- `Op2::Mod`: assert wasm matches `l.rem_euclid(r)` for the four sign combinations of `(l, r)` and non-integer operands; assert the result is always in `[0, |r|)`. + +**Verification:** +Run: `cargo test -p simlin-engine --features file_io wasmgen::lower` +Expected: Exp/Mod parity tests pass. + +**Commit:** `engine: wasmgen Op2 Exp (pow) and Mod (rem_euclid)` + + + +### Task 4: `Apply` lowering for the full `BuiltinId` set + +**Verifies:** wasm-backend.AC1.1, wasm-backend.AC7.1, wasm-backend.AC7.3 (Min/Max). + +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (add the `Apply { func }` arm). + +**Implementation:** +Add the `Apply { func }` arm: pop the 3 operands into scratch f64 locals `a`/`b`/`c` (top is `c`), then emit per `func`, reading `time`/`dt` from `curr[TIME_OFF]`/`curr[DT_OFF]` where needed (mirror `apply()` exactly): +- Native f64 instr: `Abs`→`f64.abs(a)`, `Sqrt`→`f64.sqrt(a)`, `Int`→`f64.floor(a)`, `Max`→`f64.max(a,b)`, `Min`→`f64.min(a,b)`. + - **AC7.3 Min/Max note:** `f64.min`/`f64.max` differ from the VM's compare form (`if a>b {a} else {b}`) on NaN and ±0. Use the wasm instructions first; if a corpus test surfaces a NaN/±0 divergence, switch *that* op to the compare-and-select form `(a>b)?a:b` / `(a<b)?a:b`. +- Composed: `Sign` (`a>0`→1, `a<0`→-1, else 0 via compares+selects), `Quantum` (`b==0.0`→`a` else `(a/b).trunc()*b` — exact `==`), `SafeDiv` (`b != 0.0`→`a/b` else `c` — exact `!=`), `Sshape` (`b + (c-b)/(1.0 + exp(-4.0*(2.0*a-1.0)))`, calling the `exp` helper). +- Transcendental: `Exp/Ln/Log10/Sin/Cos/Tan/Arcsin/Arccos/Arctan` → `call` the Task 2 helpers on `a`. +- Time-driven helpers: `Step` (`time + dt/2 > b ? a : 0`), `Ramp` (the `ramp(time, a, b, Some(c))` branch logic), `Pulse` (emit/`call` a `pulse(time, dt, volume, first, interval)` wasm helper containing the VM's `while` loop, `vm.rs:3036-3053`). +- Constants: `Inf`→`f64.const INFINITY`, `Pi`→`f64.const PI`. (Codegen usually emits these as `LoadConstant`, but handle the `Apply` form too.) 
+ +**Testing:** +- Per-builtin unit tests: emit each `Apply{func}` over hand-built operand sequences, run under DLR-FT, assert equality with the VM's `apply(func, time, dt, a, b, c)` over representative inputs (including the edge values: `Int` of negatives (floor vs trunc), `Quantum` with `b==0`, `SafeDiv` with `b==0` and `b`=subnormal, `Sign(0)`, `Step`/`Ramp` across their breakpoints, `Pulse` across multiple intervals, `Sshape` across `[0,1]`). +- AC7.1: the transcendental `Apply` arms produce values within the documented tolerance of Rust `f64`. + +**Verification:** +Run: `cargo test -p simlin-engine --features file_io wasmgen::lower` +Expected: all builtin parity tests pass. + +**Commit:** `engine: wasmgen Apply lowering for full scalar builtin set` + + + + +### Task 5: Raise the floor; scalar-only corpus parity + +**Verifies:** wasm-backend.AC1.1. + +**Files:** +- Modify: `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`). + +**Implementation:** +With all scalar builtins/operators supported, more corpus models now run through wasm. Re-run the `wasm_parity_floor` gate, observe the new `Ran` count, and raise `WASM_SUPPORTED_FLOOR` to it. Any model that is purely scalar (no arrays, lookups, modules, RK, PREVIOUS/INIT) should now `Ran` and clear `ensure_results`. Models still using unsupported constructs (graphical functions, arrays, modules, RK2/RK4, PREVIOUS/INIT) remain `Skipped` until their phases land. + +**Testing:** +The raised floor gate is the test. Confirm (note in the commit) that scalar models which were `Skipped` in Phase 1 due to `Eq`/builtins now `Ran`. + +**Verification:** +Run: `cargo test -p simlin-engine --features file_io --test simulate` +Expected: full corpus passes; `wasm_parity_floor` passes at the raised floor. + +**Commit:** `engine: raise wasm parity floor after full scalar builtins` + + +--- + +## Phase 2 Done When +- All scalar-only corpus models match the VM through wasm (clearing `ensure_results`). 
+- Unit tests cover each builtin, each transcendental helper (vs Rust `f64`), and the `approx_eq`/NaN/`:NA:` edge cases. +- `Mod`=`rem_euclid`, `Exp`=`pow`, equality/truthiness via `approx_eq`; `Min`/`Max` via `f64.min`/`f64.max` (compare-fallback noted). +- The floor gate is raised to the new supported count. diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_03.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_03.md new file mode 100644 index 000000000..cb05d7334 --- /dev/null +++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_03.md @@ -0,0 +1,119 @@ +# WebAssembly Simulation Backend — Phase 3: Graphical functions (lookups) + +**Goal:** Bring the scalar `Lookup` opcode (Interpolate / Forward / Backward modes) to VM parity by laying the graphical-function tables into the blob's linear memory and emitting a shared lookup helper that mirrors the VM's three lookup functions exactly. + +**Architecture:** The `ByteCodeContext.graphical_functions` (a `Vec<Vec<(f64, f64)>>`) is serialized into a read-only region of the module's linear memory via an active wasm data segment, alongside a per-table directory (byte offset + point count). Three wasm helper functions — `lookup_interp`, `lookup_forward`, `lookup_backward` — reproduce `vm.rs`'s `lookup`/`lookup_forward`/`lookup_backward` (`vm.rs:3055-3186`) over a `(data_offset, count, index)` interface. The `Lookup { base_gf, table_count, mode }` opcode lowers to a runtime element-offset bounds check + a directory lookup + a `call` to the mode's helper. The interpolate kernel reuses Phase 2's `approx_eq` helper for the at-knot exact-hit test. + +**Tech Stack:** `wasm-encoder` `DataSection` (active data); the Phase 2 `approx_eq` helper; the VM lookup functions as spec. + +**Scope:** Phase 3 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`. + +**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`). 
+ +--- + +## Acceptance Criteria Coverage + +### wasm-backend.AC1 +- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` against the model's expected outputs at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.) + +### wasm-backend.AC7 +- **wasm-backend.AC7.1 Success:** Math wasm provides natively uses wasm instructions; the transcendentals … are open-coded as self-contained wasm helper functions … Each open-coded helper has a unit test comparing its output to Rust `f64` over a sampled range. *(For Phase 3 the relevant helpers are the lookup kernels; tested against the VM's `lookup`/`lookup_forward`/`lookup_backward`.)* + +--- + +## Notes for the implementer (read first) + +- **Opcode** (`bytecode.rs:626-638`): `Lookup { base_gf: GraphicalFunctionId, table_count: u16, mode: LookupMode }`. `GraphicalFunctionId = u8` (`bytecode.rs:21`, so ≤256 tables/module). `LookupMode` (`bytecode.rs:45-55`): `Interpolate = 0`, `Forward = 1`, `Backward = 2`. Stack effect `(2,1)`. +- **Stack discipline** (`vm.rs:1710-1731`): the opcode pops `lookup_index` first, then `element_offset` (so the producing opcodes pushed `element_offset` then `lookup_index`). Bounds check: `if element_offset < 0.0 || element_offset >= table_count as f64 { push NaN } else { gf_idx = base_gf + element_offset; dispatch mode }`. For the common scalar case codegen emits `LoadConstant 0.0` for `element_offset` (so it is 0), but **the lowering must handle a runtime element_offset** (arrayed scalar-`Lookup` selects a per-element table). +- **Tables** (`bytecode.rs:1588`): `graphical_functions: Vec<Vec<(f64, f64)>>`; the table used is `graphical_functions[base_gf + element_offset]`, a list of `(x,y)` knots in x-ascending order. 
+- **The three VM lookup functions are NOT one function — they differ in three ways** (confirmed; this is the key parity risk): + - `lookup` (Interpolate, `vm.rs:3055-3102`): empty→NaN; NaN index→NaN; `index < x[0]` (**strict**)→`y[0]`; `index > x[n-1]` (**strict**)→`y[n-1]`; lower-bound binary search (`while low < high`: `if x[mid] < index {low=mid+1} else {high=mid}`); at `i=low`, if `approx_eq(x[i], index)` return `y[i]`, else linearly interpolate between knots `i-1` and `i`. + - `lookup_forward` (in `vm.rs`, between `lookup` and `lookup_backward`): empty/NaN→NaN; `index <= x[0]`→`y[0]`; `index >= x[n-1]`→`y[n-1]`; **same lower-bound** search; return `y[low]`. **No approx_eq, no interpolation.** + - `lookup_backward` (`vm.rs:3144-3186`): empty/NaN→NaN; `index <= x[0]`→`y[0]`; `index >= x[n-1]`→`y[n-1]`; **upper-bound** search (`if x[mid] <= index {low=mid+1} else {high=mid}`); return `y[low-1]` (last knot with `x <= index`; for duplicate x, the LAST). **No approx_eq, no interpolation.** +- The `context.graphical_functions[gf_idx]` access is a safe bounds-checked index in the VM; the element_offset/table_count check guarantees it's in range. +- **Memory-layout convention (extended each phase).** Phase 1 used `[curr][next][results]`. Phase 3 appends two regions after the results region: a **GF directory** (per global table index: byte offset of its data + point count) and the **GF data** (all tables' `(x,y)` pairs as f64). Compute these region bases in `compile_simulation`, grow `pages` accordingly, and initialize them with an active `DataSection`. `results_offset` (exported) is unchanged. (Phases 4/5 append RK-scratch / temp regions similarly.) +- `pub(crate)`/`pub` latitude per the repo owner. TDD, inline `#[cfg(test)] mod tests`, `cargo test -p simlin-engine --features file_io wasmgen`. 
+ +--- + + + +### Task 1: Emit GF tables + directory into linear memory + +**Verifies:** wasm-backend.AC1.1 (prerequisite for lookups). + +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/module.rs` (layout + `DataSection` emission), `src/simlin-engine/src/wasmgen/lower.rs` (carry the GF region bases in `EmitCtx`). +- Test: inline `#[cfg(test)] mod tests`. 
+ +**Implementation:** +In `compile_simulation`, after computing the results region, lay out: +- **GF data region:** concatenate every table in `root.context.graphical_functions` in order; each table's knots as consecutive f64 LE pairs `x0,y0,x1,y1,…`. Record each table's byte offset and point count. +- **GF directory region:** an array indexed by global table index `t` (0..`graphical_functions.len()`), each entry `(data_byte_offset: i32, n_points: i32)` — so the runtime can map `base_gf + element_offset` → its table. Store as two i32 per entry (or i32 pairs). +Emit both regions with an active `DataSection` (a data segment whose `ConstExpr` offset is the region base) so they're initialized at instantiation. Grow `pages` to cover them. Thread the directory base + data base into `EmitCtx`. + +(Modules in Phase 7 each have their own `ByteCodeContext.graphical_functions`; for Phase 3 only the root's tables exist. Phase 7 generalizes the directory to cover all instances' tables.) + +**Testing:** +- A test that builds a model with one graphical function, compiles it, and verifies (by reading the blob's GF data region from memory after instantiation) that the table's `(x,y)` pairs are present at the directory-indicated offset with the right count. + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen::module` + +**Commit:** `engine: emit graphical-function tables + directory into wasm memory` + + + +### Task 2: The three lookup helper functions + +**Verifies:** wasm-backend.AC1.1, wasm-backend.AC7.1. + +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (or a `wasmgen/lookup.rs`) — emit `lookup_interp`, `lookup_forward`, `lookup_backward`. +- Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +Emit three wasm helper functions, each `(data_off: i32, count: i32, index: f64) -> f64`, reading `x = f64.load[data_off + 16*k]`, `y = f64.load[data_off + 16*k + 8]` for knot `k`. 
Reproduce the VM functions exactly: +- `lookup_interp`: the empty/NaN guards, **strict** edge clamps, lower-bound binary search, then at `i=low` `call approx_eq(x[i], index)` (Phase 2 helper) → if true return `y[i]`, else the linear-interp formula. +- `lookup_forward`: NaN/empty guards, **inclusive** edge clamps, lower-bound search, return `y[low]`. +- `lookup_backward`: NaN/empty guards, inclusive edge clamps, **upper-bound** search, return `y[low-1]`. +Implement the binary search with i32 locals (`low`, `high`, `mid`) and `f64.load` of `x[mid]`. (`count == 0` → return NaN; `index` NaN via `f64.ne(index,index)` → NaN.) + +**Testing:** +- Emit each helper over hand-placed tables in memory and assert, under DLR-FT, that it matches the VM's `lookup`/`lookup_forward`/`lookup_backward` for: below-range, above-range, exact-knot hits, between-knots, a single-point table, duplicate-x tables (Backward's last-duplicate rule), and a NaN index. Compare directly against calling the VM functions (expose them `pub(crate)` if needed). + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasm lookup_interp/forward/backward helpers matching the VM` + + + +### Task 3: `Lookup` opcode lowering + corpus parity + +**Verifies:** wasm-backend.AC1.1. + +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/lower.rs` (add the `Lookup` arm). +- Modify: `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`). + +**Implementation:** +Add the `Lookup { base_gf, table_count, mode }` arm. Stack has `[element_offset, index]` (top = index). 
Emit: pop `index` and `element_offset` into f64 locals; bounds-check `element_offset < 0.0 || element_offset >= table_count as f64` → push NaN; else compute `table_idx = base_gf + (element_offset as i32)`, load `(data_off, count)` from the GF directory at `directory_base + table_idx*8`, and `call` the mode-specific helper (`mode` is compile-time, so emit a static `call` to `lookup_interp`/`lookup_forward`/`lookup_backward`). Push the result. Match the VM's `as usize`/`as f64` cast chain for the bounds compare. + +Then raise the floor: corpus models using graphical functions now run through wasm. Re-observe and raise `WASM_SUPPORTED_FLOOR`. + +**Testing:** +- Unit: a model with a `LOOKUP`/graphical-function variable in Interpolate, Forward, and Backward modes; assert wasm matches the VM across the table's domain (below/above/at-knot/between) and for an out-of-range `element_offset` (→NaN). +- Corpus: at least one `simulate.rs` model that uses a graphical function now `Ran` and clears `ensure_results`. + +**Verification:** `cargo test -p simlin-engine --features file_io --test simulate` + +**Commit:** `engine: wasmgen Lookup opcode lowering + GF corpus parity` + + + +--- + +## Phase 3 Done When +- Corpus models using graphical functions match the VM through wasm. +- Unit tests cover interpolate / forward / backward, edge clamping, exact-knot hits, duplicate-x (Backward), and out-of-range element_offset → NaN. +- The floor gate is raised. 
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_04.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_04.md new file mode 100644 index 000000000..f43de711e --- /dev/null +++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_04.md @@ -0,0 +1,118 @@ +# WebAssembly Simulation Backend — Phase 4: RK2/RK4 integration + PREVIOUS/INIT + +**Goal:** Generate the RK2 (Heun) and RK4 multi-stage integration loops, and serve `PREVIOUS`/`INIT` via `prev_values`/`initial_values` snapshot regions captured at the same loop points the VM uses. + +**Architecture:** `compile_simulation` selects the run-loop shape from `sim.specs.method` (Euler from Phase 1; RK2/RK4 added here). The RK loops mirror `vm.rs:712-838`: per-stock scratch (`saved`/`accum` in a linear-memory region), trial-point mutation of `curr`, time juggling across stages, a final flows-only re-evaluation with restored state, then the `prev_values` snapshot. `LoadPrev`/`LoadInitial` read the two snapshot regions; the `use_prev_fallback` gate is a mutable wasm global (not a time comparison). Because the emitter knows which program it is lowering, `LoadInitial`'s "during Initials read `curr`, else read `initial_values`" branch is resolved at compile time. + +**Tech Stack:** `wasm-encoder` (loops/blocks, mutable global, multi-region memory); the VM integration loops + `run_initials` + `LoadPrev`/`LoadInitial` arms as spec. + +**Scope:** Phase 4 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`. + +**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`). + +--- + +## Acceptance Criteria Coverage + +### wasm-backend.AC1 +- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.) 
+ +### wasm-backend.AC7 +- **wasm-backend.AC7.4 Success:** Euler, RK2, and RK4 each match the VM's saved samples (cadence and values); `PREVIOUS`/`INIT` match via the snapshot regions. *(Phase 1 established Euler; Phase 4 completes RK2/RK4 and PREVIOUS/INIT.)* + +--- + +## Notes for the implementer (read first) + +- **Reserved globals**: `TIME_OFF=0`, `DT_OFF=1`, `INITIAL_TIME_OFF=2`, `FINAL_TIME_OFF=3` (`vm.rs:83-86`). `LoadGlobalVar` reads these absolutely (no `module_off`). +- **Stock offsets**: the set of stock data-buffer offsets is the `AssignNext { off }` targets in `root.compiled_stocks` (Phase 1 already collects these for the Euler copy-back). They are module-relative `off` (root `module_off=0`, so absolute here). The VM's `stock_offsets` (`vm.rs:265`) are absolute and include submodule stocks via `EvalModule` recursion — Phase 4 is root-only; Phase 7 generalizes. +- **RK4 loop** (`vm.rs:712-787`), reproduce per timestep: + - `saved_time = curr[TIME_OFF]`. + - Stage 1: `eval_step` (flows then stocks). For each stock `off`: `s1 = next[off]-curr[off]; saved[i]=curr[off]; accum[i]=s1; curr[off]=saved[i]+s1*0.5`. Then `curr[TIME_OFF]=saved_time+dt*0.5`. + - Stage 2: `eval_step`. `s2=next[off]-curr[off]; accum[i]+=2*s2; curr[off]=saved[i]+s2*0.5`. + - Stage 3: `eval_step`. `s3=next[off]-curr[off]; accum[i]+=2*s3; curr[off]=saved[i]+s3`. Then `curr[TIME_OFF]=saved_time+dt`. + - Stage 4: `eval_step`. `s4=next[off]-curr[off]; accum[i]+=s4; next[off]=saved[i]+accum[i]/6.0; curr[off]=saved[i]`. + - `curr[TIME_OFF]=saved_time; next[TIME_OFF]=saved_time+dt`. + - **Final flows-only re-eval** with restored `curr` (`eval(StepPart::Flows)`), so `curr`'s aux/flow slots hold time-`t` values (stages 2-4 clobbered them). **Load-bearing** for both saved output and PREVIOUS. + - `prev_values := curr`; `use_prev_fallback := 0`; `save_advance!`. 
+- **RK2 (Heun) loop** (`vm.rs:788-838`): Stage 1 `eval_step`, `s1=next-curr; saved=curr; accum=s1; curr=saved+s1`, `curr[TIME]=saved_time+dt`. Stage 2 `eval_step`, `s2=next-curr; accum+=s2; next=saved+accum/2.0; curr=saved`. `curr[TIME]=saved_time; next[TIME]=saved_time+dt`. Final flows re-eval; `prev_values:=curr`; `use_prev_fallback:=0`; `save_advance!`. +- **`eval_step` = flows() then stocks()**; the stocks program writes `next[off]` via `AssignNext`. So per stage: `call flows(0); call stocks(0)`; then read `next[off]`/`curr[off]`. The final re-eval calls **only** `flows(0)`. +- **`run_initials`** (`vm.rs:1066-1135`): seed `curr[TIME/DT/INITIAL_TIME/FINAL_TIME]`, set `use_prev_fallback=1`, run initials once, then **capture `initial_values := curr` (whole `n_slots` chunk)** exactly once. (`prev_values` is not written during initials.) +- **`prev_values`/`initial_values`** are each `n_slots` wide (`vm.rs:617-618`). Address with `module_off + off` (root: `module_off=0`). +- **`LoadPrev { off }`** (`vm.rs:1320-1328`): pops a fallback; pushes `if use_prev_fallback { fallback } else { prev_values[module_off+off] }`. **Gate on the flag, never a `TIME==INITIAL_TIME` check** (RK moves TIME to trial points). +- **`LoadInitial { off }`** (`vm.rs:1332-1340`): `if part==Initials { curr[module_off+off] } else { initial_values[module_off+off] }`. Since the emitter knows the program (`StepPart`), pick the branch at compile time: in the initials function emit a `curr` read, in flows/stocks emit an `initial_values` read. +- **Memory layout additions:** `prev_values` (n_slots), `initial_values` (n_slots), and (RK only) `rk_scratch` = `saved`(n_stocks)+`accum`(n_stocks). Append after the Phase-3 GF region; grow `pages`. Add a mutable i32 global `use_prev_fallback` (init 1). +- `pub(crate)`/`pub` latitude per the repo owner. TDD, inline `#[cfg(test)] mod tests`. 
+ +--- + + + +### Task 1: PREVIOUS/INIT snapshot regions + LoadPrev/LoadInitial + +**Verifies:** wasm-backend.AC7.4 (PREVIOUS/INIT), wasm-backend.AC1.1. + +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/module.rs` (layout: prev/initial regions + `use_prev_fallback` global; `run_initials` capture; Euler-loop `prev_values` snapshot), `src/simlin-engine/src/wasmgen/lower.rs` (`LoadPrev`/`LoadInitial` arms). +- Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +1. Reserve `initial_values` + `prev_values` regions (each `n_slots*8` bytes) and a mutable i32 global `use_prev_fallback` (init 1). Thread their bases into `EmitCtx` + the `StepPart` being emitted. +2. In the `run`/initials sequence: after seeding globals and calling the initials function, copy the `curr` chunk into `initial_values` (an unrolled per-slot copy or a small copy loop). Leave `use_prev_fallback=1`. +3. In the Euler loop (Phase 1's loop), after `flows`+`stocks` and before advancing time, copy `curr → prev_values` and set `use_prev_fallback=0` (mirroring `vm.rs:705-707`). +4. `LoadPrev { off }`: pop fallback into a scratch local; `global.get use_prev_fallback`; `if` → push fallback, `else` → push `prev_values[module_off+off]` (use `select` after loading both, or an `if/else` producing f64). +5. `LoadInitial { off }`: in the **initials** program emit `curr[module_off+off]`; in **flows/stocks** programs emit `initial_values[module_off+off]`. + +**Testing:** +- Euler models using `PREVIOUS(x)` and `INIT(x)` (build via `TestProject`/XMILE), assert wasm matches the VM series. Include: `PREVIOUS` at t0 (returns the fallback), `PREVIOUS` after the first step, `INIT(x)` referenced from a flow (reads `initial_values`), and `INIT(x)` referenced from another initial equation (reads `curr` during Initials). 
+ +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen PREVIOUS/INIT snapshot regions + LoadPrev/LoadInitial` + + + +### Task 2: RK2 + RK4 run-loop generation + +**Verifies:** wasm-backend.AC7.4 (RK2/RK4), wasm-backend.AC1.1. + +**Files:** +- Modify: `src/simlin-engine/src/wasmgen/module.rs` (method dispatch in `compile_simulation`; emit RK2/RK4 loops). +- Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +Remove Phase 1's `Method::Euler`-only guard; dispatch on `sim.specs.method`. Emit the RK4 and RK2 loops per the Notes above, unrolling the per-stock stage math over the compile-time-known stock offsets, using the `rk_scratch` region for `saved[i]`/`accum[i]`. Each stage does `call flows(0); call stocks(0)` then the per-stock arithmetic; the end-of-step does a **flows-only** `call flows(0)` with restored `curr`, then the `prev_values` snapshot (Task 1), then `save_advance!`. Mind the time juggling (`curr[TIME_OFF]` set to `saved_time + dt*0.5`, `+dt`, restored to `saved_time`; `next[TIME_OFF]=saved_time+dt`). + +**Testing:** +- RK2 and RK4 scalar models (e.g. a logistic-growth or SIR model run under each method): assert wasm matches the VM's saved samples (cadence and values). Include a model with `PREVIOUS`/`INIT` under RK to confirm the snapshot timing (prev captured after the final flows re-eval). + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen RK2/RK4 integration loops` + + + +### Task 3: Raise floor; RK + PREVIOUS/INIT corpus parity + +**Verifies:** wasm-backend.AC1.1, wasm-backend.AC7.4. + +**Files:** +- Modify: `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`). + +**Implementation:** +Corpus models using RK2/RK4 and/or PREVIOUS/INIT now run through wasm. Re-observe the `Ran` count and raise `WASM_SUPPORTED_FLOOR`. 
+ +**Testing:** the raised floor gate; note in the commit which RK/PREVIOUS models flipped from `Skipped` to `Ran`. + +**Verification:** `cargo test -p simlin-engine --features file_io --test simulate` + +**Commit:** `engine: raise wasm parity floor after RK + PREVIOUS/INIT` + + + +--- + +## Phase 4 Done When +- RK2/RK4 models and PREVIOUS/INIT models match the VM through wasm. +- Unit tests cover each integration method and the snapshot timing (Euler post-step; RK after the end-of-step flows re-eval; initial_values once after initials; the `use_prev_fallback` gate). +- The floor gate is raised. diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_05.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_05.md new file mode 100644 index 000000000..3cfb22f17 --- /dev/null +++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_05.md @@ -0,0 +1,175 @@ +# WebAssembly Simulation Backend — Phase 5: Arrays — subscripts, iteration, reducers + +**Goal:** Lower the core array machinery — the view-stack opcodes, the `BeginIter…NextIterOrJump…EndIter` iteration loop, the `Array{Sum,Max,Min,Mean,Stddev,Size}` reducers, the temp-array region, and dynamic subscripting — to wasm, matching the VM element-for-element including out-of-bounds→NaN and empty-view semantics. + +**Architecture:** The VM resolves array access through a runtime `view_stack` of `RuntimeView`s. 
Because every view's geometry (base offset, dims, strides, offset, sparsity, is_temp) is known at compile time, the wasm emitter maintains a **compile-time view-descriptor stack** instead: `Push*View`/`ViewSubscript*`/`ViewRange*`/`ViewWildcard`/`ViewTranspose`/`PopView`/`DupView` push/transform/pop descriptors; `BeginIter…EndIter` becomes a wasm bounded loop with a loop-index local and compile-time stride arithmetic (or a precomputed flat-offset table for non-contiguous views); reducers loop over the top descriptor's elements; dynamic subscripts (`ViewSubscriptDynamic`/`ViewRangeDynamic`, legacy `PushSubscriptIndex`/`LoadSubscript`) carry a runtime offset + validity flag so OOB yields NaN exactly as the VM does. **Apply-to-all (A2A) variables are unrolled to scalar bytecode by the compiler — they need no array opcodes — so this phase targets array-producing builtins, reducer arguments, and explicit subscripting.** + +**Tech Stack:** `wasm-encoder` (loops/blocks, data segments for precomputed offset tables); `StaticArrayView`/`RuntimeView`/`DimensionInfo`/`SubdimensionRelation` (`bytecode.rs`); the VM array dispatch arms + `reduce_view` + `flat_offset` + `match_dimensions_two_pass` as spec. + +**Scope:** Phase 5 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`. + +**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`). + +--- + +## Acceptance Criteria Coverage + +### wasm-backend.AC1 +- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` at those tests' existing tolerances. +- **wasm-backend.AC1.2 Success:** Arrayed/subscripted models (apply-to-all, subscripts, vector operations) match the VM element-for-element. 
*(Phase 5 covers A2A/subscript/reducer; vector ops complete it in Phase 6.)* +- **wasm-backend.AC1.5 Edge:** Empty-view reducers, out-of-bounds subscripts, and division-by-zero produce the same NaN / finite-`:NA:` / Inf values the VM produces. *(Phase 5 covers the empty-view-reducer (NaN-vs-0.0, and invalid-view→NaN for all reducers) and out-of-bounds-subscript portions.)* + +### wasm-backend.AC7 +- **wasm-backend.AC7.3 Edge:** `Mod` matches `rem_euclid`; `Max`/`Min` use `f64.max`/`f64.min` with compare-fallback. *(Reaffirmed for the array reducers `ArrayMax`/`ArrayMin`, whose empty-view→NaN semantics differ from the binary builtins.)* + +--- + +## Notes for the implementer (read first) + +- **CRITICAL — the design's "opcode" names are `Expr` IR, not bytecode.** `Subscript`, `StaticSubscript`, `TempArray`, `TempArrayElement`, `AssignTemp` are `compiler::Expr` nodes (`compiler/expr.rs:62-88`) that codegen lowers to the view/iter opcodes below — they NEVER appear in `ByteCode.code`. Lower the actual opcodes. +- **A2A is unrolled at compile time** (`compiler/mod.rs:1912-1990`): `c[D] = a[D]*b[D]` compiles to one independent scalar `LoadVar…AssignCurr(off+i)` per element — no array opcodes. So most arrayed models already pass via Phases 1-2. The array opcodes appear for: array-producing builtins (`AssignTemp` → `BeginIter` loop), reducer arguments that are elementwise array expressions, and reducers (`PushStaticView → Array → PopView`). 
+- **The actual array opcodes** (`bytecode.rs`), with operands and stack effects (`bytecode.rs:1220-1365`): + - View construction (stack `(0,0)` unless noted): `PushVarView { base_off: u16, dim_list_id: u16 }` (full var array; dims from `ctx.dim_lists[dim_list_id]` → `(n_dims,[DimId;4])`, sizes from `ctx.dimensions[DimId].size`); `PushTempView { temp_id: u8, dim_list_id: u16 }` (is_temp); `PushStaticView { view_id: u16 }` (**the workhorse**: `ctx.static_views[view_id]` baked at compile time); `PushVarViewDirect { base_off, dim_list_id }` (raw sizes, dynamic subscript). + - View transform (mutate top descriptor): `ViewSubscriptConst { dim_idx: u8, index: u16 }` (drop a dim, 0-based); `ViewSubscriptDynamic { dim_idx }` (stack `(1,0)`: pop 1-based index, **OOB → view invalid**); `ViewRange { dim_idx, start, end }` ([start:end)); `ViewRangeDynamic { dim_idx }` (stack `(2,0)`: pop end then start, clamp); `ViewStarRange { dim_idx, subdim_relation_id }` (sparse via `ctx.subdim_relations[id]`); `ViewWildcard { dim_idx }` (**no-op**); `ViewTranspose {}` (reverse dims/strides/dim_ids); `PopView {}`; `DupView {}`. + - Temp element: `LoadTempConst { temp_id, index }` (stack `(0,1)`: push `temp_storage[temp_offsets[temp_id]+index]`); `LoadTempDynamic { temp_id }` (stack `(1,1)`: pop index). 
+ - Iteration: `BeginIter { write_temp_id: u8, has_write_temp: bool }` (captures `view_stack.last()` as the iter view); `LoadIterElement {}` (`(0,1)`, element at `current` from the captured view); `LoadIterTempElement { temp_id }`; `LoadIterViewTop {}` (`(0,1)`, from `view_stack.last()` at `current`, broadcasting); `LoadIterViewAt { offset: u8 }` (`(0,1)`, from `view_stack[len-offset]`, broadcasting; **this is what `StaticSubscript`/`TempArray` lower to inside a loop**, codegen.rs:523-571); `StoreIterElement {}` (`(1,0)`, write to `temp_storage[temp_offsets[write_temp_id]+current]`); `NextIterOrJump { jump_back: i16 }` (`current+=1`; if `current` is still below the iter view's element count, jump back by `jump_back`); `EndIter {}`. + - Reducers: `ArraySum`/`ArrayMax`/`ArrayMin`/`ArrayMean`/`ArrayStddev`/`ArraySize` reduce the top view to one scalar (the view stays on the stack). + - Legacy dynamic subscripting: `PushSubscriptIndex { bounds }` / `LoadSubscript { off }`. +- **`RuntimeView`** (`bytecode.rs`) carries `{ base_off, is_temp, is_valid, dims, strides: SmallVec<[i32;4]>, offset: u32, sparse: SmallVec<[RuntimeSparseMapping;2]>, dim_ids: SmallVec<[DimId;4]> }`. **Dense element address** for indices `[i_0..i_{n-1}]`: `base_address + offset + Σ i_k*strides[k]`, where `base_address` = `curr[base_off..]` if `!is_temp` else `temp_storage[temp_offsets[base_off]..]`. `size() = Π dims`. Sparse: a sparse dim's real index is `parent_offsets[idx]` (precomputable at compile time). See `RuntimeView::flat_offset` (`bytecode.rs:283-323`), `offset_for_iter_index` (`bytecode.rs:433-456`). +- **`BeginIter` precompute** (`vm.rs:1876-1912`): if the iter view is `sparse.is_empty() && is_contiguous()`, per-iteration offset is `view.offset + current`; else the VM precomputes a `flat_offsets` table by walking multi-dim indices. The wasm emitter does the same at compile time: contiguous → `base+offset+i`; non-contiguous/sparse → bake a precomputed offset table (data segment) and read `offsets[i]`, **or** fully unroll for small arrays. +- **`reduce_view`** (`vm.rs:2802-2840`): `if !view.is_valid { return NaN }`; else fold over `size()` elements (via `flat_offset` + the is_temp dual addressing). 
**Asymmetry to match exactly:** an *invalid* view (OOB subscript) → NaN for **all** reducers including `ArraySum`; an *empty-but-valid* view → 0.0 for `ArraySum`, NaN for Max/Min/Mean/Stddev, `0` size for `ArraySize`. OOB-subscript→NaN is pinned by `array_tests.rs:1298-1340, 2449-2575`. +- **`temp_storage`**: a flat region of `temp_total_size` f64 (`vm.rs:584-586`); element `index` of temp `temp_id` lives at `temp_storage[temp_offsets[temp_id] + index]`. `temp_offsets`/`temp_total_size` are `ByteCodeContext` fields (compile-time). +- **Broadcasting** in `LoadIterViewTop`/`LoadIterViewAt` (`vm.rs:1946-2182`) uses `match_dimensions_two_pass` (`dimensions.rs:729`) when the source view's dims/dim_ids differ from the iter view's; a smaller source or invalid view → NaN. Mirror this exactly. +- **Memory layout addition:** the `temp_storage` region (`temp_total_size*8` bytes) + any precomputed iter-offset tables (data segments). Append after the Phase-4 regions; grow `pages`. +- `pub(crate)`/`pub` latitude per the repo owner. TDD, inline `#[cfg(test)] mod tests`; `cargo test -p simlin-engine --features file_io wasmgen`. + +--- + + + +### Task 1: Compile-time view-descriptor stack + static view opcodes + temp region + +**Verifies:** wasm-backend.AC1.2 (prerequisite). + +**Files:** +- Create: `src/simlin-engine/src/wasmgen/views.rs` (the compile-time `ViewDesc` model + address-computation helpers) — or add to `lower.rs`. +- Modify: `wasmgen/module.rs` (temp region in the layout), `wasmgen/lower.rs` (view-stack opcode arms). +- Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +1. Add the `temp_storage` region to the memory layout (base + `temp_total_size*8`); thread its base into `EmitCtx`. +2. Define a compile-time `ViewDesc` mirroring the static parts of `RuntimeView`: `{ base_off, is_temp, dims, strides, offset, sparse, dim_ids, runtime_off_local: Option<u32>, valid_local: Option<u32> }`. 
The last two are wasm locals introduced only by dynamic subscripts (Task 4); static views leave them `None`. Maintain a `Vec<ViewDesc>` in the emitter as the compile-time view stack. +3. Lower the static view opcodes: `PushStaticView{view_id}` (clone `ctx.static_views[view_id]` into a `ViewDesc`), `PushVarView`/`PushTempView`/`PushVarViewDirect` (build from `dim_list_id`/`base_off`), `ViewSubscriptConst`/`ViewRange`/`ViewStarRange`/`ViewWildcard`(no-op)/`ViewTranspose` (static transforms of the top `ViewDesc` mirroring `RuntimeView::apply_*`), `PopView`/`DupView`. Provide a `view_element_addr(desc, flat_index)` emitter that produces the byte address for a flat element index (contiguous fast path `base+offset+i`; strided/sparse via precomputed table or arithmetic). +4. Lower `LoadTempConst{temp_id,index}` (push `f64.load[temp_offsets[temp_id]*8 + index*8]`) and `LoadTempDynamic{temp_id}` (pop index → compute address → load). + +**Testing:** +- Unit-test the `ViewDesc` transforms by compiling tiny models whose bytecode contains each view op (a reducer over a subscripted/transposed/sparse view) and asserting the emitted reads hit the addresses the VM's `flat_offset` computes (compare a reducer's result to the VM). Test `LoadTempConst`/`LoadTempDynamic` reads. + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen compile-time view-descriptor stack + static view ops` + + + +### Task 2: Array reducers + +**Verifies:** wasm-backend.AC1.2, wasm-backend.AC7.3, wasm-backend.AC1.5 (empty-view reducers: `ArraySum`→0.0, Max/Min/Mean/Stddev→NaN; invalid view→NaN for all). + +**Files:** +- Modify: `wasmgen/lower.rs` (reducer arms). +- Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +Lower `ArraySum`/`ArrayMax`/`ArrayMin`/`ArrayMean`/`ArrayStddev`/`ArraySize` over the top `ViewDesc` (do not pop it). 
Emit a bounded loop (or unrolled sum for small static sizes) over the view's `size()` elements, reading each via `view_element_addr`. Match `reduce_view` (`vm.rs:2802-2840`) and the per-reducer arms (`vm.rs:2216-2309`) exactly: +- Invalid view (the `valid_local`, when present, is 0) → push NaN for **all** reducers. +- `ArraySum`: fold with init `0.0` (empty valid view → 0.0). +- `ArrayMax`/`ArrayMin`: if `size()==0` → NaN, else fold with `NEG_INFINITY`/`INFINITY` and the VM's compare form (use compare-and-select to match the VM's `if a>b`/`if a < b`). +- `ArrayMean`/`ArrayStddev`: empty valid view → NaN; otherwise follow the VM's per-reducer arm. +- `ArraySize`: push the view's `size()` (0 for an empty valid view). + +**Testing:** parity vs the VM for each reducer over a non-empty view, an empty-but-valid view (`ArraySum`→0.0; Max/Min/Mean/Stddev→NaN; `ArraySize`→0), and an invalid view (→NaN for all). + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + + + + +### Task 3: Iteration loops (BeginIter…EndIter) + broadcast + +**Verifies:** wasm-backend.AC1.2. + +**Files:** +- Modify: `wasmgen/lower.rs` (iteration arms). +- Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +Lower the iteration opcodes to a wasm bounded loop. On `BeginIter{write_temp_id,has_write_temp}`: capture the top `ViewDesc` as the iter view, compute `size()` at compile time, and open a wasm `block`/`loop` with an i32 iteration-index local (`current`) initialized to 0; record the iter context (the captured view, the write temp, the loop label depth) on an emitter-side iter stack. Within the body: +- `LoadIterElement` → read the captured iter view at `current` (contiguous: `base+offset+current`; else precomputed offsets[current]). +- `LoadIterTempElement{temp_id}` → `temp_storage[temp_offsets[temp_id]+current]`. +- `LoadIterViewTop`/`LoadIterViewAt{offset}` → read `view_stack[len-1]` / `view_stack[len-offset]` at `current`, reproducing the VM's dim-matching/broadcast (`match_dimensions_two_pass`, `dimensions.rs:729`) and the "smaller source / invalid view → NaN" rules (`vm.rs:1946-2182`). When the source view's dims/dim_ids equal the iter view's, it's the simple `offset_for_iter_index(current)` read. +- `StoreIterElement` → pop value, store to `temp_storage[temp_offsets[write_temp_id]+current]`. 
+On `NextIterOrJump{jump_back}`: `current+=1`; `br_if loop` when `current < size` (the compile-time `size()` of the captured iter view). On `EndIter`: close the loop and pop the emitter-side iter context. + +**Testing:** parity vs the VM for an array-producing builtin lowered to a `BeginIter` loop: contiguous and non-contiguous/sparse iter views, a write temp, and broadcast sources (including the smaller-source / invalid-view → NaN cases). + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + + +### Task 4: Dynamic subscripts + OOB→NaN + +**Verifies:** wasm-backend.AC1.2, wasm-backend.AC1.5 (out-of-bounds subscripts → NaN, matching the VM). + +**Files:** +- Modify: `wasmgen/lower.rs` (dynamic-subscript arms; extend `ViewDesc` with runtime offset/validity). +- Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +- `ViewSubscriptDynamic{dim_idx}`: pop the 1-based runtime index; bounds-check against `dims[dim_idx]`; on OOB set the descriptor's `valid_local` (a wasm i32 local) to 0; otherwise fold `(index-1)*strides[dim_idx]` into the descriptor's `runtime_off_local`. Subsequent reads add `runtime_off_local` to the element address and, if `valid_local==0`, yield NaN. `ViewRangeDynamic{dim_idx}`: pop end then start, clamp to `[0,dims)` (empty range → 0-size dim, stays valid) per `apply_range_checked`. +- Legacy `PushSubscriptIndex{bounds}` / `LoadSubscript{off}` (`vm.rs:1341-1366`): maintain an emitter-side accumulator of `(runtime_index, bounds)` + a validity local; `PushSubscriptIndex` pops a 1-based index, range-checks against `bounds` (OOB → invalid), and accumulates; `LoadSubscript` folds the accumulated indices into a flat offset, and pushes `curr[module_off+off+flat]` unless invalid → NaN. + +**Testing:** +- Models with a runtime/dynamic subscript `arr[i]` (i from an expression) in-range and out-of-range (→NaN); a dynamic range; assert wasm matches the VM (including the OOB→NaN cases pinned by `array_tests.rs`). + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen dynamic subscripts with OOB->NaN` + + + + +### Task 5: Raise floor; arrayed corpus parity + +**Verifies:** wasm-backend.AC1.1, wasm-backend.AC1.2. + +**Files:** +- Modify: `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`). + +**Implementation:** +Arrayed (A2A/subscript/reducer) corpus models now run through wasm. 
Re-observe the `Ran` count and raise `WASM_SUPPORTED_FLOOR`. (Models using vector ops/allocation remain `Skipped` until Phase 6; module-bearing models until Phase 7.) + +**Testing:** the raised floor gate; note which arrayed models flipped to `Ran`. + +**Verification:** `cargo test -p simlin-engine --features file_io --test simulate` + +**Commit:** `engine: raise wasm parity floor after array core` + + +--- + +## Phase 5 Done When +- Arrayed (A2A/subscript/reducer) corpus models match the VM element-for-element. +- Unit tests cover subscript OOB→NaN, broadcast, each reducer (incl. empty-valid vs invalid-view asymmetry), and the iteration loop. +- The floor gate is raised. diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_06.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_06.md new file mode 100644 index 000000000..c49fcc850 --- /dev/null +++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_06.md @@ -0,0 +1,148 @@ +# WebAssembly Simulation Backend — Phase 6: Arrays — vector operations and allocation + +**Goal:** Lower the helper-heavy array builtins — `VectorSelect`, `VectorElmMap`, `VectorSortOrder`, `Rank`, `LookupArray`, and the `AllocateAvailable`/`AllocateByPriority` market-clearing allocators — to wasm helpers that match the VM (and its sibling modules `vm_vector_elm_map.rs`/`vm_vector_sort_order.rs`/`alloc.rs`) element-for-element. + +**Architecture:** Each opcode reads its inputs from the compile-time view stack (Phase 5) and the operand stack and writes its result array to its `write_temp_id` region of `temp_storage` (except `VectorSelect`, which reduces to one scalar). Each is emitted as a self-contained wasm helper mirroring the VM. Sorting (`VectorSortOrder`/`Rank`) uses a stable comparison sort (NaN-as-Equal to preserve stability). Allocation reuses Phase 2's `exp` helper for the open-coded `erfc`/`normal_cdf` and runs the VM's bisection over the per-requester allocation curves. 
+ +**Tech Stack:** Phase 5 view/temp infrastructure; Phase 3 `lookup_*` helpers (for `LookupArray`); Phase 2 `approx_eq`/`is_truthy`/`exp`; the VM dispatch arms + sibling modules + `alloc.rs` as spec. + +**Scope:** Phase 6 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`. + +**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`). + +--- + +## Acceptance Criteria Coverage + +### wasm-backend.AC1 +- **wasm-backend.AC1.2 Success:** Arrayed/subscripted models (apply-to-all, subscripts, vector operations) match the VM element-for-element. + +### wasm-backend.AC7 +- **wasm-backend.AC7.1 Success:** … the allocation `erfc` [is] open-coded as [a] self-contained wasm helper function (range reduction + polynomial). Each open-coded helper has a unit test comparing its output to Rust `f64` over a sampled range. + +--- + +## Notes for the implementer (read first) + +- **Opcodes** (`bytecode.rs`), inputs from the view stack (top = last) and operand stack; outputs to `temp_storage[temp_offsets[write_temp_id]+i]`: + - `VectorSelect {}` (`vm.rs:2444-2502`): pop `action` (`.round() as i32`), pop `max_value`; views `expr_view=top`, `sel_view=top-1`. `size = min(sel.size, expr.size)`, independent index odometers. For each i: if `is_truthy(sel_val)` collect `expr_val`. Empty selection → `max_value`. Else by `action`: `1`=min, `2`=mean(sum/len), `3`=max, `4`=product, `_`=sum. Push the single scalar. (Invalid view → push one NaN.) + - `VectorElmMap { write_temp_id, full_source_len }` (`vm_vector_elm_map.rs:33-116`): `source_view=top-1`, `offset_view=top`. For each i in `offset_view.size()`: `base_i` = 0 if source is the full contiguous array else projected from carried axes; `flat_i = base_i + round(offset_val)`; result = `NaN` if `offset_val.is_nan()` or `flat_i<0 || flat_i>=full_source_len`, else `source[flat_i]` over full row-major storage. **No modulo.** Write `temp[i]`. 
+ - `VectorSortOrder { write_temp_id }` (`vm_vector_sort_order.rs:49-101`): `input_view=top`; pop `direction` (`.round() as i32`). Innermost dim is the sorted axis (`inner = dims[n_dims-1]`, or whole view if scalar). Per row of `inner` elements: build `(value, local_idx 0..inner)`, **stable** sort (asc if `direction==1` else desc), write `temp[row_base + rank] = local_idx as f64` (**0-based in-row source index** at the sorted position). + - `Rank { write_temp_id }` (`vm.rs:2540-2584`): `input_view=top`; pop `direction`. Over the **whole view** collect `(value, orig_idx 0..size)` (orig_idx = sequential iteration index), **stable** sort, write `temp[orig_idx] = (rank_0based + 1) as f64` (**1-based**, indexed by original position). + - `LookupArray { base_gf, table_count, mode, write_temp_id }` (`vm.rs:2586-2629`): pop `index`; `input_view=top`. For each i in `view.size()`: `elem_off = view.flat_offset(indices)`; if `elem_off >= table_count` → NaN, else dispatch `mode` on `graphical_functions[base_gf+elem_off]` at `index` (reuse Phase 3 `lookup_interp/forward/backward`); write `temp[i]` (sequential index). + - `AllocateAvailable { write_temp_id }` (`vm.rs:2631-2721`): pop `avail`; `profile_view=top`, `requests_view=top-1`. Collect `requests` (n), `pp_values`; `pp_cols = if !pp_values.is_empty() && n>0 && pp_size%n==0 { pp_size/n } else { 4 }`; build per-requester `profiles[(ptype,ppriority,pwidth,pextra)]` reading `pp_values[i*pp_cols + {0,1,2,3}]` with defaults `(0.0, 0.0, 1.0, 0.0)` when out of range; `allocate_available(&requests,&profiles,avail)` → write temp. + - `AllocateByPriority { write_temp_id }` (`vm.rs:2723-2794`): pop `supply` then `width`; `priority_view=top`, `requests_view=top-1`. Build rectangular `profiles[(1.0, priorities[i] or 0.0, width, 0.0)]`; `allocate_available(&requests,&profiles,supply)` → write temp. 
+- **Invalid input view → `fill_temp_nan`** (`vm.rs:2866-2881`): fill the whole destination temp region with NaN (VectorSelect instead pushes one NaN). The NaN here is IEEE NaN, never `crate::float::NA`. +- **`alloc.rs` (verbatim, port bit-faithfully):** + - `erfc_approx(z)` (`alloc.rs:8-21`): for `z<0` return `2.0 - erfc_approx(-z)`; else `t=1/(1+0.3275911*z)`; `(((((1.061405429*t + -1.453152027)*t) + 1.421413741)*t + -0.284496736)*t + 0.254829592) * t * (-z*z).exp()`. (Abramowitz-Stegun 26.2.17; uses Phase 2 `exp`.) + - `normal_cdf(x)` (`alloc.rs:25-30`): `if x.is_nan() {NaN} else 0.5 * erfc_approx(-x / SQRT_2)`. + - `alloc_curve(p, request, ptype, ppriority, pwidth, pextra)` (`alloc.rs:40-129`): `if request<=0 {0.0}`; `fraction` by `ptype % 10`: 0 fixed (`p<=ppriority?1:0`), 1 rectangular, 2 triangular, 3 normal (`normal_cdf((ppriority-p)/pwidth)`), 4 exponential, 5 CES, `_` fixed (exact formulas in the investigator report / `alloc.rs:48-126`). Then `alloc = request*fraction; if ptype>=10 { alloc.floor() } else alloc`. + - `allocate_available(requests, profiles, avail)` (`alloc.rs:136-199`): `n=len`; if 0 → empty. `total_demand = Σ requests where r>0`; if `avail>=total_demand` → `requests.map(|r| r.max(0))`; if `avail<=0` → zeros. Else compute search range `[p_min,p_max]` from profiles (per-type `spread`), then **bisection up to 100 iterations**: `mid=(lo+hi)/2; total=Σ alloc_curve(mid, ...)`; compare `total < avail` to move `lo` or `hi` to `mid` (take the exact branch direction and update order from `alloc.rs:136-199`), break early on the `1e-14*(1+|hi|)` relative-convergence test, then evaluate the final per-requester `alloc_curve(p_star, ...)`. + +### Task 1: VectorSelect + VectorElmMap + +**Verifies:** wasm-backend.AC1.2. + +**Files:** Modify `wasmgen/lower.rs` (+ a `wasmgen/vector.rs` helper module if preferred). Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +- `VectorSelect`: pop `action`/`max_value`; iterate `min(sel.size, expr.size)` with two index odometers; accumulate selected `expr` values where `is_truthy(sel)` (the Phase 2 helper); emit the empty→`max_value` and the action-dispatch (min/mean/max/product/sum) reductions; push one scalar. Invalid view → push NaN. 
+- `VectorElmMap`: emit the per-element `source[base_i + round(offset[i])]` computation with the `full_source_len` bound (OOB/NaN→NaN, no modulo), reproducing `vm_vector_elm_map.rs` (including the `source_is_full_array` base_i=0 fast path vs the carried-axis projection). Write the result temp; `fill_temp_nan` on invalid input. + +**Testing:** parity vs the VM for VectorSelect (each action, empty selection→max_value, NaN-in-mask) and VectorElmMap (in-range, OOB→NaN, NaN offset→NaN, sliced source base_i). + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen VectorSelect + VectorElmMap` + + + +### Task 2: VectorSortOrder + Rank (stable sort) + +**Verifies:** wasm-backend.AC1.2. + +**Files:** Modify `wasmgen/lower.rs`/`wasmgen/vector.rs`; add a sort scratch region. Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +Emit a stable sort helper over `(value, idx)` pairs in a scratch region, NaN-as-Equal. `VectorSortOrder`: per innermost-dim row, sort the row's `(value, local_idx)` pairs (asc/desc by `direction`), write `temp[row_base+rank] = local_idx` (0-based). `Rank`: over the whole view, sort `(value, orig_idx)`, write `temp[orig_idx] = rank+1` (1-based). Match `vm_vector_sort_order.rs` and `vm.rs:2540-2584` exactly, including the `direction` semantics and the indexing (sorted-position vs original-position). + +**Testing:** parity vs the VM for ascending/descending; tie stability (equal values keep input order); multi-row VectorSortOrder; whole-view Rank; a NaN element (compares Equal → stable). + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen VectorSortOrder + Rank with stable sort` + + + +### Task 3: LookupArray (per-element arrayed GF) + +**Verifies:** wasm-backend.AC1.2. + +**Files:** Modify `wasmgen/lower.rs`. Test: inline `#[cfg(test)] mod tests`. 
+ +**Implementation:** +Lower `LookupArray { base_gf, table_count, mode, write_temp_id }`: pop the shared `index`; for each element i of the input view, compute `elem_off = flat_offset(indices)`; if `elem_off >= table_count` → NaN, else look up the GF directory at `base_gf+elem_off` and `call` the Phase 3 `lookup_interp/forward/backward` (per `mode`) at `index`; write `temp[i]` (sequential index). `fill_temp_nan` on invalid view. + +**Testing:** parity vs the VM for an arrayed graphical function across its domain, including an out-of-range element_offset element (→NaN) and all three modes. + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen LookupArray (per-element arrayed GF)` + + + + + +### Task 4: Allocation — erfc/normal_cdf/alloc_curve/allocate_available + the two opcodes + +**Verifies:** wasm-backend.AC1.2, wasm-backend.AC7.1. + +**Files:** Create `src/simlin-engine/src/wasmgen/alloc.rs` (the allocation helper emitters); modify `wasmgen/lower.rs` (the two opcode arms); add allocation scratch regions. Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +Emit wasm helpers mirroring `alloc.rs` verbatim: +- `erfc_approx(z)` (using the Phase 2 `exp` helper) and `normal_cdf(x)` with the exact constants/Horner order above. +- `alloc_curve(p, request, ptype, ppriority, pwidth, pextra)` with all six `ptype % 10` branches and the `ptype >= 10` floor flag. +- `allocate_available(requests_ptr, n, profiles_ptr, avail, out_ptr)` (operating over scratch memory arrays): the `total_demand` short-circuits, the search-range computation, the 100-iteration bisection with the `1e-14*(1+|hi|)` relative convergence break, and the final per-requester `alloc_curve(p_star, ...)`. 
+Lower `AllocateAvailable`/`AllocateByPriority`: collect `requests`/`profiles` from the views into scratch arrays (with the `pp_cols`/default logic for AllocateAvailable, the rectangular-profile synthesis for AllocateByPriority), pop the scalars, call `allocate_available`, write results to the `write_temp_id` region. `fill_temp_nan` on invalid input views. + +**Testing:** +- AC7.1: unit-test the emitted `erfc_approx`/`normal_cdf` against Rust `alloc::erfc_approx`/`normal_cdf` over a sampled range (expose them `pub(crate)` if needed); document the tolerance. +- `alloc_curve` parity for each of the 6 profile types + the `>=10` floor. +- `AllocateAvailable`/`AllocateByPriority` end-to-end parity vs the VM: `avail >= total_demand` (full grant), `avail <= 0` (zeros), and the partial-allocation bisection case across profile types. + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen allocation (erfc/normal_cdf/alloc_curve/allocate_available)` + + + + +### Task 5: Raise floor; vector-op/allocation corpus parity + +**Verifies:** wasm-backend.AC1.2. + +**Files:** Modify `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`). + +**Implementation:** Corpus models using vector ops/allocation now run through wasm. Re-observe the `Ran` count and raise `WASM_SUPPORTED_FLOOR`. (Module-bearing models remain `Skipped` until Phase 7.) + +**Testing:** the raised floor gate. + +**Verification:** `cargo test -p simlin-engine --features file_io --test simulate` + +**Commit:** `engine: raise wasm parity floor after vector ops + allocation` + + +--- + +## Phase 6 Done When +- Corpus models using vector ops/allocation match the VM element-for-element. +- Unit tests cover each op including the allocation bisection and the `erfc`/`normal_cdf` accuracy vs Rust `f64`. +- The floor gate is raised. 
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_07.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_07.md new file mode 100644 index 000000000..cd120dd25 --- /dev/null +++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_07.md @@ -0,0 +1,152 @@ +# WebAssembly Simulation Backend — Phase 7: Modules + host interface (FFI, layout, override/reset) + +**Goal:** Run submodels in the blob (`EvalModule`/`LoadModuleInput`), give the blob `set_value`/`reset` override semantics matching the VM, and surface the blob plus its name→offset layout through a libsimlin FFI so a host can drive the model and read one variable's series by name. + +**Architecture:** Each unique module instance `(model, input_set)` in `CompiledSimulation.modules` becomes its own set of three wasm functions (initials/flows/stocks), each taking a runtime `module_off: i32` (a shared `CompiledModule` may run at several base offsets) plus its `n_inputs` f64 inputs as parameters. `EvalModule { id }` resolves the declaration to a child `ModuleKey`, computes `child_module_off = module_off + decl.off`, and emits a `call` to the child's function for the current phase, passing the popped inputs as args; `LoadModuleInput { input }` reads the corresponding input parameter. Overridable constants are sourced from a mutable constants region (initialized to defaults) so an exported `set_value(offset, val)` + `reset()` reproduce the VM's "override a constant, reset, re-run from t0." The `WasmLayout` (already built in Phase 1) is serialized and returned alongside the blob through `simlin_model_compile_to_wasm`, and a host reads one variable's `n_chunks`-long series by striding the results region. + +**Tech Stack:** `wasm-encoder` (multi-function modules, `call`, mutable globals/regions, exported functions); the VM `EvalModule`/`LoadModuleInput`/`set_value`/`reset` as spec; libsimlin's malloc-return convention. 
+ +**Scope:** Phase 7 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`. + +**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`). + +--- + +## Acceptance Criteria Coverage + +### wasm-backend.AC1 +- **wasm-backend.AC1.1 Success:** A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` at those tests' existing tolerances. + +### wasm-backend.AC4 +- **wasm-backend.AC4.1 Success:** The blob exports `n_slots`/`n_chunks`/`results_offset` and writes step-major snapshots; a host locates and strides the results with no external metadata. +- **wasm-backend.AC4.2 Success:** Reading one variable's series via the name→offset layout copies only that variable's `n_chunks` values (never the whole `n_chunks × n_slots` slab) and equals the VM's series for that variable. + +### wasm-backend.AC5 +- **wasm-backend.AC5.1 Success:** Overriding a constant via `set_value`, then `reset`, then `run`, yields the same series the VM produces under the same override (matching `simlin_sim_set_value`/`reset` semantics). +- **wasm-backend.AC5.2 Success:** `reset` with no override restores the compiled-default results. + +### wasm-backend.AC6 +- **wasm-backend.AC6.1 Success:** `simlin_model_compile_to_wasm` returns a valid wasm blob plus the name→offset layout via the malloc-return convention; both buffers are freeable with `simlin_free`; it works before any `SimlinSim` exists. +- **wasm-backend.AC6.2 Failure:** A model that cannot be compiled to wasm surfaces a `SimlinError` rather than panicking across the FFI boundary. + +--- + +## Notes for the implementer (read first) + +- **Opcodes** (`bytecode.rs`): `LoadModuleInput { input: ModuleInputOffset(u16) }` (`vm.rs:1376-1378`: push `module_inputs[input]`); `EvalModule { id: ModuleId(u16), n_inputs: u8 }` (`vm.rs:1379-1443`). 
**There is no `ModuleInput` opcode** (`Expr::ModuleInput` lowers to `LoadModuleInput`). `EvalModule` stack effect `(n_inputs, 0)`; `LoadModuleInput` `(0,1)`. +- **`ModuleDeclaration`** (`bytecode.rs:1505-1514`), the element type of `ByteCodeContext.modules`: `{ model_name: Ident<Canonical>, input_set: BTreeSet<Ident<Canonical>>, off: usize }`. +- **`EvalModule` VM dispatch** (`vm.rs:1379-1443`): pop `n_inputs` values into `module_inputs` **in reverse** (`for j in (0..n_inputs).rev() { module_inputs[j] = pop() }`); `child_module_off = module_off + context.modules[id].off`; resolve the child via `make_module_key(&decl.model_name, &decl.input_set)` (`vm.rs:27-32`) → the child `CompiledModule`; recurse phase-aware (Initials→child initials, Flows/Stocks→child `eval` with `part`). **The wasm backend does not need `CompiledSlicedSimulation`/`child_targets`** — resolve `EvalModule` to the child's wasm function index directly from `CompiledSimulation.modules` keyed by `make_module_key`. +- **Single slab**: the root `n_slots` includes all nested module slots; a child reads/writes at `module_off + off` (`LoadVar`/`AssignCurr`/`AssignNext`), while `LoadGlobalVar` is absolute (TIME/DT/INITIAL_TIME/FINAL_TIME). This is the addressing the emitter has used since Phase 1 (`module_off` is a function parameter). +- **Inputs as wasm params (clean approach):** each instance's three functions have signature `(module_off: i32, in_0: f64, …, in_{k-1}: f64) -> ()` where `k = n_inputs` for that `(model, input_set)`. `LoadModuleInput { input }` → `local.get(input + 1)` (param 0 is `module_off`). `EvalModule { id, n_inputs }`: pop the `n_inputs` operands into scratch locals (reverse, matching the VM), then push `child_module_off` (= `local.get(module_off) + decl.off`) followed by the input locals in order, and `call` the child's function for the current `StepPart`. (The root's functions are `(i32)->()`, 0 inputs.) This avoids any module-inputs memory scratch. 
+- **Phase-aware child function resolution:** build a map `(ModuleKey, StepPart) → wasm function index` during assembly; an `EvalModule` site in the initials/flows/stocks program calls the child's initials/flows/stocks function respectively (the `StepPart` is compile-time per program). The module instantiation graph is acyclic, so the wasm call graph is well-founded. +- **Per-instance side tables:** generalize Phase 3's GF directory and Phase 5's temp region to **per-instance** `ByteCodeContext`s — each instance has its own `graphical_functions`/`static_views`/`temp_offsets`/`temp_total_size`. The temp regions can be disjoint per instance (sum the sizes) or shared with care; disjoint is simplest. Generalize Phase 4's stock-offset collection to recurse through `EvalModule` declarations adding `decl.off` cumulatively (mirroring `collect_stock_offsets`, `vm.rs:512-543`) so the RK stage math covers nested stocks. +- **`set_value`/`reset`** (`vm.rs:976-1062`): `set_value(off, v)` is valid only when `is_constant_offset(off)` (`vm.rs:167`) — an offset with an `AssignConstCurr` in the **flows** phase (`cached_constant_info`, `collect_constant_info` `vm.rs:426-507`). The VM mutates the bytecode literal(s) at those locations (so flows re-assigns the override each step) and the override **persists across `reset`** (which only re-runs initials). `clear_values` restores defaults. The libsimlin wrappers `simlin_sim_set_value`/`simlin_sim_reset`/`simlin_sim_clear_values` (`simulation.rs:303-556`) record overrides in `SimState.overrides` and re-apply on reset. +- **`Results` has no `get_series`**; by-name retrieval strides the slab: `Vm::get_series(ident)` (`vm.rs:1140-1160`) does `off = offsets[ident]; for c in 0..n_steps { data[c*n_slots + off] }`. The host mirrors this over the blob's results region using `WasmLayout.var_offsets` — copying only `n_chunks` values. 
+- **libsimlin** (`src/libsimlin/`): `write_bytes_to_ffi_output` (`model.rs:65-86`), `simlin_malloc`/`simlin_free` (`memory.rs:30-71`), the `out_error: *mut *mut SimlinError` + `clear_out_error`/`store_error`/`store_anyhow_error` convention (`lib.rs:384-421`), `require_model` (`lib.rs:512`). The current POC `simlin_model_compile_to_wasm` (`model.rs:101-149`) returns only the blob; this phase changes it to also return the serialized layout. +- **Memory-layout addition:** a constants override region (a mutable region holding, per overridable offset, its current value, initialized to the compiled default). Append to the layout; grow `pages`. +- `pub(crate)`/`pub` latitude per the repo owner. TDD; corpus tests gated on `file_io`. + +--- + + + +### Task 1: Per-instance module functions + EvalModule/LoadModuleInput + +**Verifies:** wasm-backend.AC1.1. + +**Files:** Modify `wasmgen/module.rs` (emit one function-triple per instance; the `(ModuleKey,StepPart)→fn index` map; per-instance GF directory + temp regions; recursive stock-offset collection), `wasmgen/lower.rs` (`EvalModule`/`LoadModuleInput` arms). Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +1. Enumerate `sim.modules` (every `(model, input_set)` instance). For each, emit initials/flows/stocks functions with signature `(module_off: i32, in_0..in_{k-1}: f64) -> ()` (k = the instance's module-input count). Record `(ModuleKey, StepPart) → fn_index`. +2. `LoadModuleInput { input }` → `local.get(input + 1)`. +3. `EvalModule { id, n_inputs }`: pop the `n_inputs` operands into scratch f64 locals (reverse); resolve `decl = current_instance.context.modules[id]`, `child_key = make_module_key(&decl.model_name, &decl.input_set)`; push `child_module_off = (local.get module_off) + (decl.off as i32)`; push the input locals in order; `call (child_key, current_part)`. +4. The root `run` calls the root's initials/flows/stocks with `module_off=0` and no inputs. 
Generalize GF directory + temp regions to per-instance, and the RK stock-offset list to recurse through `EvalModule` (adding `decl.off`). + +**Testing:** module-bearing models (a model instantiating a submodel; SMOOTH/DELAY stdlib macros expand to implicit module stocks — exercise one) and the same `(model,input_set)` instantiated at two offsets: assert wasm matches the VM. Confirm `LoadModuleInput` reads the right input. + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen per-instance module functions (EvalModule/LoadModuleInput)` + + + + + +### Task 2: `set_value` / `reset` override mechanism + +**Verifies:** wasm-backend.AC5.1, wasm-backend.AC5.2. + +**Files:** Modify `wasmgen/module.rs` (constants region + `set_value`/`reset` exports), `wasmgen/lower.rs` (source overridable constants from the region). Test: inline `#[cfg(test)] mod tests`. + +**Implementation:** +1. Identify the **full set** of overridable offsets: `CompiledSimulation::is_constant_offset(off)` (`vm.rs:167`, `pub fn`) only answers one offset at a time, and the set itself lives in the private `cached_constant_info` map — so expose its keys (widen `cached_constant_info`'s visibility, or add a `pub(crate) fn constant_offsets(&self) -> impl Iterator<Item = usize>` accessor on `CompiledSimulation`) and initialize the constants region from that key set. Add a constants override region holding each overridable offset's current value, initialized (data segment or init code) to the compiled-default literal. +2. Redirect the value source for the overridable constant-assignment pattern (`LoadConstant{id}; AssignCurr{off}` where `off` is a constant offset, the un-fused form of `AssignConstCurr`): instead of `f64.const literal`, emit `f64.load const_region[off]`. This makes the override take effect every flows step, exactly like the VM mutating the literal. +3. 
Export `set_value(offset: i32, val: f64) -> i32` (return 0 ok / nonzero if `offset` is not overridable — validate against the overridable set) writing `const_region[offset]=val`; and `reset()` resetting run state (chunk/step counters, `use_prev_fallback=1`, `did_initials`-equivalent) **without** clearing the constants region (overrides persist across reset, matching the VM). Optionally `clear_values()` to restore defaults. The next `run` re-runs initials and the loop, picking up the override. + +**Testing:** +- AC5.1: `set_value(off_of_a_constant, v); reset(); run();` and compare the full series to the VM run with `vm.set_value(ident, v)` under the same override. +- AC5.2: `reset(); run()` with no override reproduces the compiled-default series. +- `set_value` on a non-constant offset returns the error code (no write). + +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` + +**Commit:** `engine: wasmgen blob set_value/reset override semantics` + + + +### Task 3: libsimlin FFI returning blob + layout; by-name series retrieval + +**Verifies:** wasm-backend.AC4.1, wasm-backend.AC4.2, wasm-backend.AC6.1, wasm-backend.AC6.2. + +**Files:** Modify `src/libsimlin/src/model.rs` (`simlin_model_compile_to_wasm` to also return the serialized layout); add a `WasmLayout` serializer (in `wasmgen` or libsimlin). Test: a Rust integration test in `src/libsimlin/` (and/or a `wasmgen` test for the by-name read). + +**Implementation:** +1. Add a `WasmLayout` serializer: a length-prefixed encoding — `n_slots`, `n_chunks`, `results_offset` (as u64 LE), then `count` (u32), then per entry `name_len` (u32) + UTF-8 name bytes + `offset` (u64). (Avoids a protobuf dependency; matches the libsimlin "Pattern A" malloc-return convention.) +2. 
Change `simlin_model_compile_to_wasm` to: + ```rust + pub unsafe extern "C" fn simlin_model_compile_to_wasm( + model: *mut SimlinModel, + out_wasm: *mut *mut u8, out_wasm_len: *mut usize, + out_layout: *mut *mut u8, out_layout_len: *mut usize, + out_error: *mut *mut SimlinError, + ) + ``` + Build the `CompiledSimulation` from the model's datamodel (sync + `compile_project_incremental`), call `compile_simulation` to get the `WasmArtifact`, then `write_bytes_to_ffi_output` the `artifact.wasm` and the serialized `artifact.layout` into the two buffer pairs. Follow the FFI prologue (`clear_out_error`, null-checks, `require_model`). On any compile/codegen error, `store_error`/`store_anyhow_error` (AC6.2 — never panic across the boundary); the function works before any `SimlinSim` exists (it takes a `SimlinModel`). +3. A host reads one variable's series by name: locate `off` from the layout, then for `c in 0..n_chunks` read `results[results_offset + (c*n_slots + off)*8]` — copying only `n_chunks` values. + +**Testing:** +- AC6.1: FFI test — compile a model to wasm + layout, assert the wasm validates, the layout deserializes to the expected geometry + name→offset map, and both buffers free with `simlin_free`. Works with only a `SimlinModel` (no `SimlinSim`). +- AC6.2: a model that fails codegen (an unsupported construct, if any remain) surfaces a `SimlinError` (the out_error is set), no panic. +- AC4.2: a `wasmgen`/libsimlin test that reads one variable's `n_chunks`-long series via the layout (striding the slab) and asserts it equals the VM's `get_series` for that variable, and that it copied only `n_chunks` values (not the whole slab). +- AC4.1 (reaffirm): geometry read from the exported globals matches the layout. 
+ +**Verification:** `cargo test -p simlin-engine --features file_io wasmgen` and `cargo test -p libsimlin` + +**Commit:** `libsimlin: simlin_model_compile_to_wasm returns blob + WasmLayout` + + + +### Task 4: Raise floor; module + systems-format + metasd corpus parity + +**Verifies:** wasm-backend.AC1.1. + +**Files:** Modify `src/simlin-engine/tests/simulate.rs` (raise `WASM_SUPPORTED_FLOOR`); add the wasm hook to `src/simlin-engine/tests/simulate_systems.rs`. + +**Implementation:** +Module-bearing models (including SMOOTH/DELAY stdlib expansions) now run through wasm. Add the `ensure_wasm_matches` hook to `simulate_systems.rs` (systems-format models become stdlib-module instances, so they exercise modules). Re-observe the `Ran` counts and raise `WASM_SUPPORTED_FLOOR` (and add a systems floor if appropriate). Heavy/`#[ignore]` models still defer their wasm twins to Phase 8. + +**Testing:** the raised floor gates (simulate + simulate_systems); note which module/systems models flipped to `Ran`. + +**Verification:** `cargo test -p simlin-engine --features file_io --test simulate` and `--test simulate_systems` + +**Commit:** `engine: raise wasm parity floor after modules + systems format` + + + +--- + +## Phase 7 Done When +- Module-bearing, systems-format, and metasd-simulation models match the VM through wasm. +- Override-then-reset-then-run matches the VM under the same override; reset with no override restores defaults. +- A by-name series read copies only `n_chunks` values and equals the VM's series; the FFI returns blob + layout (both `simlin_free`-able) and surfaces errors without panicking. +- The floor gate(s) are raised. 
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/phase_08.md b/docs/implementation-plans/2026-05-20-wasm-backend/phase_08.md new file mode 100644 index 000000000..739db2cbd --- /dev/null +++ b/docs/implementation-plans/2026-05-20-wasm-backend/phase_08.md @@ -0,0 +1,100 @@ +# WebAssembly Simulation Backend — Phase 8: Full-corpus parity + C-LEARN + +**Goal:** Close the gate — make any `WasmGenError::Unsupported` for a VM-simulated core model a hard failure (no skips remain for core simulation), add the `#[ignore]`d C-LEARN wasm twin against `Ref.vdf`, and document the backend and its coverage. + +**Architecture:** The parity harness flips from "skip-not-fail" to "fail" for core-simulation models: every XMILE/MDL/systems model the VM simulates in the default suite must also run through the wasm backend and clear the same comparator. The heavy `#[ignore]`d models (C-LEARN, WORLD3, COVID/metasd) get `#[ignore]`d wasm twins so they don't blow the 3-minute default-suite cap under the (interpreted, non-JIT) DLR-FT oracle. + +**Tech Stack:** the `tests/simulate.rs` corpus harness, `run_clearn_vs_vdf()`, `ensure_vdf_results` + `EXPECTED_VDF_RESIDUAL`; docs. + +**Scope:** Phase 8 of 8 from `docs/design-plans/2026-05-20-wasm-backend.md`. + +**Codebase verified:** 2026-05-21 (branch `wasm-backend-poc`). + +--- + +## Acceptance Criteria Coverage + +### wasm-backend.AC1 +- **wasm-backend.AC1.3 Success:** C-LEARN runs through the wasm backend and matches `Ref.vdf` / the VM under the existing VDF tolerance and the `EXPECTED_VDF_RESIDUAL` carve-out. +- **wasm-backend.AC1.4 Failure:** A model using a not-yet-supported construct returns `WasmGenError::Unsupported` — a clean error, never a panic or a silently wrong result. 
*(Phase 8 is the end-state expression of this AC: the flipped gate turns any `Unsupported` for a VM-simulated core model into a hard failure — never a silent wrong result.)* + +### wasm-backend.AC3 +- **wasm-backend.AC3.2 Success:** End state — no core-simulation model is skipped: every XMILE, MDL, and systems-format model in the corpus runs through both backends. +- **wasm-backend.AC3.3 Failure:** A regression that makes a previously-supported model unsupported (dropping below the floor, or any `Unsupported` at the end-state gate) fails the test suite. + +--- + +## Notes for the implementer (read first) + +- **The end-state gate applies to models the VM actually simulates in the default suite.** Models the VM itself does not simulate (the unsupported-feature `#[ignore]`s: DELAY FIXED `simulate.rs:1534-1552`, GET DATA `simulate.rs:1595-1609`) stay VM-only and are out of scope — the wasm hook runs *after* the VM run, so a model the VM `#[ignore]`s never reaches it. LTM (`simulate_ltm.rs`) stays VM-only (out of scope). +- **C-LEARN harness** (confirmed): `run_clearn_vs_vdf() -> (Results, Results)` at `simulate.rs:1865-1893` (VM results + parsed `Ref.vdf`); `ensure_vdf_results`/`ensure_vdf_results_excluding` at `simulate.rs:309/349` (1% `VDF_RTOL` + matched-floor); `EXPECTED_VDF_RESIDUAL` at `simulate.rs:1746`; `simulates_clearn` at `simulate.rs:1849` (`#[ignore]`, `// Run with: cargo test --release -- --ignored simulates_clearn`). The wasm twin compares the **wasm** output to `Ref.vdf` with the **same** `ensure_vdf_results_excluding(&vdf, &wasm_results, EXPECTED_VDF_RESIDUAL)` check. +- **Test-suite time budget** (`docs/dev/rust.md:13-17`): default suite under a 3-minute wall-clock cap; the DLR-FT interpreter is not a JIT, so heavy models run slowly under it. Keep the heavy models' wasm twins `#[ignore]`d (run via `cargo test --release -- --ignored <test_name>`), exactly like their VM counterparts. 
+- **Building C-LEARN's `CompiledSimulation` for the wasm twin:** reuse the C-LEARN compile path from `run_clearn_vs_vdf` (open the `.mdl`, sync, `compile_project_incremental`), then `compile_simulation` → run the blob under DLR-FT → build `Results` from the slab (`is_vensim` consistent with the VDF comparison) → `ensure_vdf_results_excluding`. +- `pub(crate)`/`pub` latitude per the repo owner. Engine tests gated on `file_io`. + +--- + + + +### Task 1: Flip the harness — Unsupported is a hard failure; close the floor + +**Verifies:** wasm-backend.AC3.2, wasm-backend.AC3.3. + +**Files:** Modify `src/simlin-engine/tests/test_helpers.rs` (or `simulate.rs`) and `src/simlin-engine/tests/simulate.rs`, `src/simlin-engine/tests/simulate_systems.rs`. + +**Implementation:** +1. Change the inline wasm hook in `simulate_path_with_excluding` (and the `.mdl` + systems paths) so a `WasmRunOutcome::Skipped(msg)` for a model the VM simulated is now a **hard failure** (`panic!`) for core-simulation models, not a silent skip. (Equivalently, `ensure_wasm_matches` returns `()` and panics on `Unsupported`.) +2. Replace the monotonic floor with the end-state assertion: the `wasm_parity_floor`/equivalent gate now requires that **every** VM-simulated core-simulation model in the default suite runs through wasm (zero `Unsupported`). Remove the skip-counting branch. Keep the gate's runtime within the cap (it only covers the small/medium default corpus; heavy models are `#[ignore]`d twins, Task 2). +3. If Task 1 surfaces any remaining `Unsupported` for a VM-simulated core model, close that lowering gap (a small addition to the relevant phase's emitter) — the design's end state is full core-simulation coverage. (A genuinely VM-unsupported feature stays out of scope and must not reach the hook.) + +**Testing:** the flipped gate is the test: it fails if any VM-simulated core model is `Unsupported` (AC3.3) and passes only at full coverage (AC3.2). 
Confirm a deliberately-introduced `Unsupported` (temporarily) fails the suite. + +**Verification:** `cargo test -p simlin-engine --features file_io --test simulate` and `--test simulate_systems` + +**Commit:** `engine: close the wasm parity gate (Unsupported is a hard failure)` + + + +### Task 2: C-LEARN (and heavy-model) wasm twins + +**Verifies:** wasm-backend.AC1.3. + +**Files:** Modify `src/simlin-engine/tests/simulate.rs`. + +**Implementation:** +Add `#[test] #[ignore] fn simulates_clearn_wasm()` (with the `// Run with: cargo test --release -- --ignored simulates_clearn_wasm` comment) that: builds C-LEARN's `CompiledSimulation` (reusing the compile path inside `run_clearn_vs_vdf`), compiles it via `compile_simulation`, runs the blob under DLR-FT, builds a `Results` from the slab, and asserts `ensure_vdf_results_excluding(&vdf_results, &wasm_results, EXPECTED_VDF_RESIDUAL)` — the same check `simulates_clearn` uses. Add similarly-`#[ignore]`d wasm twins for the other heavy models that have VM equivalents (WORLD3 `simulates_wrld3_03`, the COVID/metasd SSTATS model) if they exercise wasm-supported features, mirroring their existing VM tests' comparators. + +**Testing:** `simulates_clearn_wasm` (run on demand): C-LEARN's wasm output matches `Ref.vdf` under the existing tolerance + residual carve-out. + +**Verification:** `cargo test -p simlin-engine --release --features file_io -- --ignored simulates_clearn_wasm` +Expected: passes (matches `Ref.vdf` within the VDF tolerance and `EXPECTED_VDF_RESIDUAL`). + +**Commit:** `engine: C-LEARN wasm parity twin against Ref.vdf` + + + +### Task 3: Documentation + +**Verifies:** (none — documentation; supports AC3.2 reporting.) + +**Files:** Modify `src/simlin-engine/CLAUDE.md`; update `docs/` (and `docs/README.md` if adding a doc file, per `docs/CLAUDE.md`). 
+ +**Implementation:** +- Add a `wasmgen` entry to `src/simlin-engine/CLAUDE.md`'s module map: the backend lowers `CompiledSimulation` bytecode to a self-contained wasm module (alternative execution path to the VM, validated against the VM via the DLR-FT interpreter), its file layout (`mod.rs`/`module.rs`/`lower.rs`/`math.rs`/`views.rs`/`vector.rs`/`alloc.rs` as built), the `compile_simulation`/`WasmArtifact`/`WasmLayout` contract, and the supported-feature coverage (full core simulation: scalar + arrays + lookups + Euler/RK2/RK4 + modules; LTM out of scope). +- Document how to run the wasm parity tests (default suite runs small/medium corpus through wasm; heavy twins via `cargo test --release -- --ignored <test_name>`), and that the bytecode VM remains the correctness oracle. +- Note the `libsimlin` `simlin_model_compile_to_wasm` entry (blob + `WasmLayout`). + +**Testing:** n/a (docs). Verify links/freshness; do not add or bump a `**Last updated:**` line in `simlin-engine/CLAUDE.md` (the root `CLAUDE.md` forbids such lines — describe current state in prose and rely on `git log` / `git blame` for history). + +**Verification:** `pnpm lint` / a docs build if applicable; manual review. + +**Commit:** `doc: document the wasm simulation backend and its coverage` + + + +--- + +## Phase 8 Done When +- Every core-simulation corpus model (XMILE, MDL, systems) runs through both VM and wasm with no skips; an `Unsupported` for a VM-simulated core model fails the suite. +- C-LEARN matches `Ref.vdf` through wasm under the existing tolerance + `EXPECTED_VDF_RESIDUAL` (`#[ignore]`d twin). +- The backend and its coverage are documented in `simlin-engine/CLAUDE.md` and `docs/`. 
diff --git a/docs/implementation-plans/2026-05-20-wasm-backend/test-requirements.md b/docs/implementation-plans/2026-05-20-wasm-backend/test-requirements.md new file mode 100644 index 000000000..912694f13 --- /dev/null +++ b/docs/implementation-plans/2026-05-20-wasm-backend/test-requirements.md @@ -0,0 +1,135 @@ +# WebAssembly Simulation Backend — Test Requirements + +This document maps every acceptance criterion from the design plan +([`docs/design-plans/2026-05-20-wasm-backend.md`](../../design-plans/2026-05-20-wasm-backend.md), +the authoritative AC list) to its verification. There are 8 AC groups and 22 +individual cases (AC1.1–1.5, AC2.1–2.2, AC3.1–3.3, AC4.1–4.2, AC5.1–5.2, +AC6.1–6.2, AC7.1–7.4, AC8.1–8.2). The phase mappings come from the +`**Verifies:**` lines in [`phase_01.md`](phase_01.md) … [`phase_08.md`](phase_08.md). + +## Verification conventions + +This backend is engine-internal and is validated against the bytecode VM as the +correctness oracle, so verification is almost entirely automated. Two test +surfaces recur throughout: + +- **Unit (inline `#[cfg(test)] mod tests`)** in the relevant + `src/simlin-engine/src/wasmgen/*.rs` file. Each unit test hand-builds a tiny + `ByteCode`/`CompiledSimulation`, emits a wasm module with `wasm-encoder`, + validates it (`wasm::validate`), instantiates it under the DLR-FT + `wasm-interpreter` via the `checked` crate's `Store`, invokes the export, and + asserts on linear memory / return values against the VM's matching handler + (the executable spec). Files: `wasmgen/lower.rs`, `wasmgen/module.rs`, + `wasmgen/math.rs`, `wasmgen/lookup.rs` (if split out; otherwise in + `lower.rs`), `wasmgen/views.rs`, `wasmgen/vector.rs`, `wasmgen/alloc.rs`. +- **Integration / corpus** in `src/simlin-engine/tests/simulate.rs` (and + `src/simlin-engine/tests/simulate_systems.rs`). 
The `ensure_wasm_matches` + hook runs each supported corpus model through the wasm backend after the VM + run and feeds its results through the model's existing comparator; the + `wasm_parity_floor` gate enforces a monotonically rising count of + wasm-supported models; the `#[ignore]`d `simulates_clearn_wasm` twin checks + C-LEARN against `Ref.vdf`. + +**The correctness bar is the existing comparators, not a separate +wasm-vs-VM threshold.** A model's wasm output must clear the same +`ensure_results` (abs `2e-3` / Vensim-relative `5e-6`) or `ensure_vdf_results` +(1% `VDF_RTOL` + the `EXPECTED_VDF_RESIDUAL` carve-out) check the VM clears, +against the same expected outputs. "wasm-vs-VM parity" is achieved because both +backends clear the same comparator against the same expected outputs — there is +no tighter backend-equivalence tolerance (design "Validation bar"; reflected in +AC1.1, AC1.3, and AC7.4 below). + +--- + +## AC1: The wasm backend reproduces the VM's simulation results + +| AC | Literal text | Verification | +|---|---|---| +| **AC1.1** (Success) | A model within the supported feature set runs through the wasm backend and passes the same `simulate.rs` comparison the VM passes — its results clear `ensure_results` / `ensure_vdf_results` against the model's expected outputs at those tests' existing tolerances. (No separate, tighter wasm-vs-VM threshold.) | **Automated — integration.** The `ensure_wasm_matches` hook in `src/simlin-engine/tests/simulate.rs` (`simulate_path_with_excluding` + the `.mdl` path) runs each supported model through the backend and asserts via the existing `ensure_results_excluding` comparator (the same check the VM passes; no separate threshold). The supported set widens each phase: scalar/Euler (Phase 1), full scalar builtins (Phase 2), graphical functions (Phase 3), RK + PREVIOUS/INIT (Phase 4), arrays (Phase 5), vector ops/allocation (Phase 6), modules + systems format (Phase 7). 
The per-phase floor raise in `wasm_parity_floor` records the widening. Per-opcode correctness is also covered by the unit tests under each AC below. | +| **AC1.2** (Success) | Arrayed/subscripted models (apply-to-all, subscripts, vector operations) match the VM element-for-element. | **Automated — unit + integration.** Unit: reducer/iteration/subscript parity vs the VM in `wasmgen/views.rs` and `wasmgen/lower.rs` (Phase 5: subscript OOB→NaN, broadcast, each reducer, iteration loops) and the vector-op/allocation parity tests in `wasmgen/vector.rs` and `wasmgen/alloc.rs` (Phase 6: VectorSelect/ElmMap/SortOrder/Rank/LookupArray/Allocate). Integration: arrayed corpus models clear `ensure_results` via `ensure_wasm_matches` and raise the floor (Phase 5 Task 5, Phase 6 Task 5). A2A variables are unrolled to scalar bytecode by the compiler, so they are additionally covered by the Phase 1/2 scalar path. | +| **AC1.3** (Success) | C-LEARN runs through the wasm backend and matches `Ref.vdf` / the VM under the existing VDF tolerance and the `EXPECTED_VDF_RESIDUAL` carve-out. | **Automated — integration (`#[ignore]`d).** Phase 8 Task 2 adds `#[test] #[ignore] fn simulates_clearn_wasm()` in `src/simlin-engine/tests/simulate.rs`, reusing `run_clearn_vs_vdf()`'s compile path, running the blob under DLR-FT, and asserting `ensure_vdf_results_excluding(&vdf, &wasm_results, EXPECTED_VDF_RESIDUAL)` — the same check `simulates_clearn` uses. `#[ignore]`d for runtime (interpreter is not a JIT); run via `cargo test --release --features file_io -- --ignored simulates_clearn_wasm`. | +| **AC1.4** (Failure) | A model using a not-yet-supported construct returns `WasmGenError::Unsupported` — a clean error, never a panic or a silently wrong result. 
| **Automated — unit + integration (negative path).** Unit: Phase 1 Task 1 asserts unsupported opcodes (`Op2::Eq`/`Op2::Mod`/`Apply`/`Lookup`/an array opcode at that point) return `WasmGenError::Unsupported` rather than panicking, in `wasmgen/lower.rs`. Integration end state: Phase 8 Task 1 flips the `simulate.rs` hook so any `Unsupported` for a VM-simulated core model is a hard failure (never a silent wrong result); a deliberately-introduced `Unsupported` must fail the suite. | +| **AC1.5** (Edge) | Empty-view reducers, out-of-bounds subscripts, and division-by-zero produce the same NaN / finite-`:NA:` / Inf values the VM produces. | **Automated — unit (edge path), split across phases.** Phase 1 Task 1: raw `Op2::Div` by zero (`x/0`→±Inf, `0/0`→NaN, IEEE-identical to the VM) in `wasmgen/lower.rs`. Phase 2 Task 1: the finite `:NA:` sentinel (`crate::float::NA`) vs genuine IEEE NaN, kept distinct by the `approx_eq` helper (curated sample incl. `(NA,NA)`/`(NA,0.0)`/`(NaN,NaN)`) in `wasmgen/lower.rs`. Phase 5 Task 2 + Task 4: empty-but-valid reducers (`ArraySum`→0.0; Max/Min/Mean/Stddev→NaN) and invalid-view→NaN for all reducers; out-of-bounds subscripts→NaN (pinned against `array_tests.rs` cases) in `wasmgen/lower.rs`/`wasmgen/views.rs`. | + +## AC2: The backend consumes the salsa compiled bytecode + +| AC | Literal text | Verification | +|---|---|---| +| **AC2.1** (Success) | The wasm module is produced from `compile_project_incremental(...) -> CompiledSimulation`, not from the `Expr` IR or the monolithic `compiler::Module`. | **Automated — unit (Phase 1).** Task 1: each scalar-core opcode lowers from `ByteCode.code` (bytecode, not `Expr`), unit-tested in `wasmgen/lower.rs`. Task 2: `compile_simulation(&CompiledSimulation)` builds the module from a `CompiledSimulation` produced by `compile_project_incremental`, unit-tested in `wasmgen/module.rs` against `Vm::new(sim).run_to_end()`. 
Task 3: `compile_datamodel_to_wasm` is rerouted through the salsa pipeline, and `wasmgen/expr.rs` (the `Expr`-tree path) is deleted — verified by `cargo test -p simlin-engine --features file_io wasmgen` plus the structural check that no `crate::compiler::Module` references remain in `wasmgen/`. | +| **AC2.2** (Success) | The POC's `#[cfg(test)]` un-gating of the monolithic builder is reverted; the crate builds with `Module::new`/`build_metadata`/`calc_n_slots`/`calc_module_model_map` test-only again. | **Automated — build state (Phase 1 Task 4).** Operational verification (a visibility/gating revert, no new behavior): `cargo build -p simlin-engine` builds with the four items `#[cfg(test)]`-gated again; `cargo test -p simlin-engine --features file_io` still compiles and passes (test code reaches the now-test-only builder); `git diff main -- src/simlin-engine/src/compiler/mod.rs` shows only the re-gating. | + +## AC3: simulate.rs runs the corpus through both backends + +| AC | Literal text | Verification | +|---|---|---| +| **AC3.1** (Success) | During rollout, each corpus model runs through the VM and (when supported) the wasm backend, comparing wasm-vs-VM; unsupported models are skipped (not failed) and counted against a monotonically rising floor. | **Automated — integration (Phase 1 Tasks 5–6).** `ensure_wasm_matches` returns `WasmRunOutcome::Ran | Skipped(msg)` (`src/simlin-engine/tests/test_helpers.rs`); the inline hook in `simulate.rs` records `Skipped` rather than failing; `const WASM_SUPPORTED_FLOOR` + `#[test] fn wasm_parity_floor()` count `Ran` models and assert `ran >= WASM_SUPPORTED_FLOOR`. The floor is raised in every subsequent functionality phase (Phases 2–7 each have a "raise the floor" task). | +| **AC3.2** (Success) | End state — no core-simulation model is skipped: every XMILE, MDL, and systems-format model in the corpus runs through both backends. 
| **Automated — integration (Phase 8 Task 1).** The harness flips: the skip-counting branch is removed and the gate asserts every VM-simulated core-simulation model in the default suite runs through wasm with zero `Unsupported`, across `src/simlin-engine/tests/simulate.rs` (XMILE + `.mdl`) and `src/simlin-engine/tests/simulate_systems.rs` (systems format). | +| **AC3.3** (Failure) | A regression that makes a previously-supported model unsupported (dropping below the floor, or any `Unsupported` at the end-state gate) fails the test suite. | **Automated — integration (Phase 8 Task 1); the gate itself is the test.** During rollout, dropping below `WASM_SUPPORTED_FLOOR` fails `wasm_parity_floor` (Phase 1). At the end state, any `Unsupported` for a VM-simulated core model is a hard `panic!` in the hook + the closed gate. Confirmed by temporarily introducing an `Unsupported` and observing the suite fail. | + +## AC4: Self-describing results + efficient by-name retrieval + +| AC | Literal text | Verification | +|---|---|---| +| **AC4.1** (Success) | The blob exports `n_slots`/`n_chunks`/`results_offset` and writes step-major snapshots; a host locates and strides the results with no external metadata. | **Automated — unit (Phase 1 Task 2; reaffirmed Phase 7 Task 3).** A dedicated test in `wasmgen/module.rs` reads the three exported i32 globals from the instantiated module (`instance_export(inst, "n_slots").as_global()`, etc.), asserts they equal the `WasmLayout` values, then uses only the module-exported geometry to stride to one variable's series and confirms it matches the VM. Phase 7 Task 3 reaffirms geometry-from-globals matches the layout alongside the FFI test. | +| **AC4.2** (Success) | Reading one variable's series via the name→offset layout copies only that variable's `n_chunks` values (never the whole `n_chunks × n_slots` slab) and equals the VM's series for that variable. 
| **Automated — unit / integration (Phase 7 Task 3).** A `wasmgen`/libsimlin test reads one variable's `n_chunks`-long series via `WasmLayout.var_offsets` (striding `results[results_offset + (c*n_slots + off)*8]`), asserts it equals the VM's `get_series` for that variable, and asserts only `n_chunks` values were copied (not the whole slab). | + +## AC5: Override + reset + +| AC | Literal text | Verification | +|---|---|---| +| **AC5.1** (Success) | Overriding a constant via `set_value`, then `reset`, then `run`, yields the same series the VM produces under the same override (matching `simlin_sim_set_value`/`reset` semantics). | **Automated — unit (Phase 7 Task 2).** A test in `wasmgen/module.rs` calls `set_value(off_of_a_constant, v); reset(); run();` on the blob and compares the full series to a VM run with `vm.set_value(ident, v)` under the same override. A `set_value` on a non-constant offset is asserted to return the error code with no write. | +| **AC5.2** (Success) | `reset` with no override restores the compiled-default results. | **Automated — unit (Phase 7 Task 2).** A test in `wasmgen/module.rs` calls `reset(); run()` with no override and asserts the blob reproduces the compiled-default series. | + +## AC6: libsimlin FFI + +| AC | Literal text | Verification | +|---|---|---| +| **AC6.1** (Success) | `simlin_model_compile_to_wasm` returns a valid wasm blob plus the name→offset layout via the malloc-return convention; both buffers are freeable with `simlin_free`; it works before any `SimlinSim` exists. | **Automated — integration FFI (Phase 7 Task 3).** A Rust integration test in `src/libsimlin/` compiles a model to wasm + serialized layout, asserts the wasm validates, the layout deserializes to the expected geometry + name→offset map, both buffers free with `simlin_free`, and the call works from only a `SimlinModel` (no `SimlinSim`). 
| +| **AC6.2** (Failure) | A model that cannot be compiled to wasm surfaces a `SimlinError` rather than panicking across the FFI boundary. | **Automated — integration FFI (negative path, Phase 7 Task 3).** A `src/libsimlin/` test feeds a model that fails codegen and asserts the `out_error` (`*mut *mut SimlinError`) is set via `store_error`/`store_anyhow_error` with no panic across the boundary. | + +## AC7: Numeric-parity specifics + +| AC | Literal text | Verification | +|---|---|---| +| **AC7.1** (Success) | Math wasm provides natively (`sqrt`, `abs`, `floor`/`ceil`/`trunc`/`nearest`, `min`/`max`, arithmetic) uses wasm instructions; the transcendentals wasm lacks (`sin`/`cos`/`tan`/`asin`/`acos`/`atan`/`exp`/`ln`/`log10`/`pow`) and the allocation `erfc` are open-coded as self-contained wasm helper functions (range reduction + polynomial). Each open-coded helper has a unit test comparing its output to Rust `f64` over a sampled range; results need not be bit-identical to the VM's libm — only close enough that the existing tests pass. | **Automated — unit, split across phases.** Phase 2 Task 2: each scalar transcendental helper (`exp`/`ln`/`sin`/`cos`/`atan` kernels + `tan`/`acos`/`log10`/`pow`/`asin` composed) emitted in `wasmgen/math.rs` gets a unit test comparing wasm output to Rust `f64` over a dense sampled domain + NaN/inf edges, with a documented tolerance comfortably inside the `simulate.rs` tolerances. Phase 2 Task 4 confirms native instructions are used for `Abs`/`Sqrt`/`Int`/`Min`/`Max`. Phase 3 Task 2: the lookup kernels tested against the VM's `lookup`/`lookup_forward`/`lookup_backward`. Phase 6 Task 4: `erfc_approx`/`normal_cdf` (in `wasmgen/alloc.rs`) unit-tested against the Rust `alloc::erfc_approx`/`normal_cdf`. | +| **AC7.2** (Success) | Equality and truthiness (`Eq`/`Neq`/`And`/`Or`/`If` condition) use ULP-based `approx_eq` matching the VM. 
| **Automated — unit (Phase 2 Task 1).** An `approx_eq(a,b)->i32` wasm helper reproduces `float_cmp::approx_eq!(f64, …)` (epsilon + 4-ulp ordered-integer algorithm) bit-faithfully; a unit test in `wasmgen/lower.rs` runs it under DLR-FT over a curated + randomized sample (exact-equal, far, 1–4 ULP, EPSILON-apart, subnormals, `(NaN,NaN)`, `(NA,NA)`, `(NA,0.0)`, `(±0)`, `(±inf)`) and asserts equality with `crate::float::approx_eq`. Further tests confirm `Op2::Eq`, `Op2::And`, `Op2::Or`, `Not`, and `SetCond`+`If` match the VM for near-zero/ULP-adjacent operands where raw `==`/`!=0.0` would diverge. `Neq` lowers to `Eq`+`Not`, so it is covered transitively. | +| **AC7.3** (Edge) | `Mod` matches the VM's `rem_euclid` semantics (computed via wasm `floor`). `Max`/`Min` use the wasm `f64.max`/`f64.min` instructions; if a corpus test surfaces a NaN/±0 difference from the VM's compare-based form, fall back to explicit compare-and-select for that case. | **Automated — unit (Phase 2 Tasks 3–4; reaffirmed Phase 5 Task 2).** Phase 2 Task 3: `Op2::Mod` asserted to match `l.rem_euclid(r)` over the four sign combinations + non-integer operands, result always in `[0,|r|)`, in `wasmgen/lower.rs`. Phase 2 Task 4: `Min`/`Max` use `f64.min`/`f64.max`, with the documented compare-and-select fallback if a corpus test surfaces a NaN/±0 divergence. Phase 5 Task 2 reaffirms for the array reducers `ArrayMax`/`ArrayMin` (which use the VM's compare form on the reduce path, since their empty-view→NaN semantics differ from the binary builtins). | +| **AC7.4** (Success) | Euler, RK2, and RK4 each match the VM's saved samples (cadence and values); `PREVIOUS`/`INIT` match via the snapshot regions. | **Automated — unit + integration (Phase 1 Euler; Phase 4 RK2/RK4 + PREVIOUS/INIT).** Phase 1 Task 2: the Euler `run` loop's cadence and per-step values asserted against `Vm::new(sim).run_to_end()` in `wasmgen/module.rs` (`step_count == n_chunks`, save cadence matches). 
Phase 4 Task 1: `PREVIOUS`/`INIT` models (incl. `PREVIOUS` at t0/after-first-step and `INIT` from a flow vs from an initial) match the VM via the `prev_values`/`initial_values` snapshot regions. Phase 4 Task 2: RK2 (Heun) and RK4 scalar models match the VM's saved samples (cadence and values), incl. the snapshot timing under RK. Integration: RK + PREVIOUS/INIT corpus models clear `ensure_results` via `ensure_wasm_matches` and raise the floor (Phase 4 Task 3) — checked against expected outputs at the existing tolerances, not a separate threshold. | + +## AC8: Engineering quality (cross-cutting) + +These two criteria are not satisfied by a single test; they are properties of the +test *structure* established uniformly across every phase, and they map to the +unit-test suite as a whole. + +| AC | Literal text | Verification | +|---|---|---| +| **AC8.1** | New code reaches ≥95% test coverage via unit tests that execute emitted wasm under the DLR-FT interpreter, with each opcode/feature group individually tested. | **Automated — the unit-test suite as a whole (all phases).** Satisfied cross-cuttingly: every functionality task in Phases 1–7 is TDD'd with inline `#[cfg(test)] mod tests` in its `wasmgen/*.rs` file (`lower.rs`, `module.rs`, `math.rs`, `lookup.rs`/`lower.rs`, `views.rs`, `vector.rs`, `alloc.rs`), each test building and executing a wasm module under the DLR-FT interpreter and asserting against the VM. Each opcode/feature group (scalar core, builtins, transcendentals, lookups, RK/PREVIOUS/INIT, view ops, reducers, vector ops, allocation, modules, override/reset) is individually tested. Coverage ≥95% is a `wasmgen`-wide property of this suite, not one named test. | +| **AC8.2** | Each functionality phase ends with passing tests for the acceptance criteria it claims to cover. 
| **Automated — per-phase "Done When" gates (all phases).** Each phase file ends with a "Done When" section enumerating the ACs it claims and the passing tests/commands that demonstrate them; the per-phase floor raise and the `cargo test -p simlin-engine --features file_io wasmgen` / `--test simulate` verifications gate each phase. This is a process/structure criterion satisfied by the phase boundaries themselves. | + +--- + +## Human verification: none required, and why + +Every one of the 22 acceptance criteria is automatable, and the plan automates +all of them. This backend has no human-verification surface: + +- It is **engine-internal** — there is no UI, rendering, animation, copy, or + interactive UX to inspect. (The `@simlin/engine` TypeScript API, browser + worker, and live-graph/diagram UX are explicitly out of scope per the design; + the in-scope override/reset and by-name retrieval are engine-side mechanisms + validated programmatically.) +- Its correctness oracle is the **bytecode VM**, an in-repo executable + specification. Every numeric/behavioral claim is a diff against the VM (or, for + C-LEARN, against `Ref.vdf`) under the existing comparators — fully + programmatic. +- Even the criteria that look qualitative reduce to automated checks: + "self-describing" (AC4.1) is asserted by reading exported globals with no + external metadata; "clean error, never a panic" (AC1.4) and "surfaces a + `SimlinError` rather than panicking" (AC6.2) are negative-path tests; the + cross-cutting engineering-quality criteria (AC8.1/AC8.2) are satisfied by the + per-opcode TDD + DLR-FT unit-test structure and the per-phase "Done When" + gates. + +The only non-test deliverable is Phase 8 Task 3 (documentation), which carries no +AC and is verified by review. 
diff --git a/docs/test-plans/2026-05-20-wasm-backend.md b/docs/test-plans/2026-05-20-wasm-backend.md new file mode 100644 index 000000000..e63bdf3bf --- /dev/null +++ b/docs/test-plans/2026-05-20-wasm-backend.md @@ -0,0 +1,66 @@ +# Human Test Plan: WebAssembly Simulation Backend + +Companion to [design-plans/2026-05-20-wasm-backend.md](../design-plans/2026-05-20-wasm-backend.md) and [implementation-plans/2026-05-20-wasm-backend/](../implementation-plans/2026-05-20-wasm-backend/). + +The wasm backend is engine-internal with the bytecode VM as its automated correctness oracle, so nearly everything is machine-verified: all 22 acceptance criteria map to genuine, non-vacuous automated tests that execute the emitted wasm under the DLR-FT interpreter and compare against the VM (or `crate::float::approx_eq` / `crate::vm::lookup*` / `crate::alloc::*` / `Ref.vdf`). The steps below cover the residual surface automation can't fully stand in for: the heavy `#[ignore]`d parity twins, the FFI driven from a real (non-Rust-test) host, the AC3.3 deliberate-regression confidence check, and an optional line-coverage measurement for AC8.1. + +## Prerequisites + +- `./scripts/dev-init.sh` has been run (idempotent). +- The default suites are green (re-run if the tree changed): + - `cargo test -p simlin-engine --features file_io --lib wasmgen` (~259 tests) + - `cargo test -p simlin-engine --features file_io --test simulate` (incl. `wasm_parity_floor`) + - `cargo test -p simlin-engine --features file_io --test simulate_systems` (incl. `wasm_systems_parity_floor`) + - `cargo test -p simlin --test wasm` (FFI) + +## Phase A: Heavy parity twins (AC1.3; AC1.1/AC7.4 at scale) + +These are `#[ignore]`d for runtime (the DLR-FT interpreter is not a JIT) and never run in the default suite, so they are the only automated coverage of C-LEARN-against-`Ref.vdf` and WORLD3-at-scale through wasm. Run in release. 
+ +| Step | Action | Expected | +|------|--------|----------| +| A1 | `cargo test -p simlin-engine --release --features file_io --test simulate -- --ignored simulates_clearn_wasm` | Passes. C-LEARN compiles to wasm, runs under the interpreter, and clears the 1% VDF gate + `EXPECTED_VDF_RESIDUAL` carve-out -- the same gate the VM clears (~3358 vars matched / 84 excluded across 251 steps). | +| A2 | `cargo test -p simlin-engine --release --features file_io --test simulate -- --ignored simulates_wrld3_03_wasm` | Passes. WORLD3 wasm output matches the VM element-for-element. | + +## Phase B: FFI from a real host (AC6.1, AC6.2, AC4.1, AC4.2) + +`src/libsimlin/tests/wasm.rs` drives the FFI in-process; this exercises the same entry point from outside the Rust harness (how TS/WASM, CGo, C/C++ consumers reach it) -- the cross-boundary contract automation can't fully represent. + +| Step | Action | Expected | +|------|--------|----------| +| B1 | Build the cbindgen header + lib (per [src/libsimlin/CLAUDE.md](../../src/libsimlin/CLAUDE.md)). | `simlin_model_compile_to_wasm` is declared in `simlin.h` with four out-params (wasm blob pointer + length, layout pointer + length) + `out_error`. | +| B2 | From a small C/Go driver (or `node` over the WASM build): open a model, `simlin_project_get_model("main")`, then `simlin_model_compile_to_wasm(...)` **without ever calling `simlin_sim_new`**. | Non-NULL `out_wasm`/`out_layout` with non-zero lengths, `out_error == NULL` (AC6.1: works pre-sim). | +| B3 | Parse the layout per the documented little-endian wire format (`n_slots`/`n_chunks`/`results_offset` as u64; `count` u32; then per entry `name_len` u32 + UTF-8 + `offset` u64). Instantiate the blob, read the exported globals, call `run`, and stride one variable using only the layout. | The strided series matches `simlin_sim_get_series` for that variable; only `n_chunks` values are read per variable (AC4.1/AC4.2). | +| B4 | `simlin_free(out_wasm); simlin_free(out_layout);` | No crash/leak/double-free. 
| +| B5 | Feed an unsupported model (e.g. a true runtime-range subscript `SUM(source[lo:hi])` with variable `lo`/`hi` -> `ViewRangeDynamic`) to `simlin_model_compile_to_wasm`. | `out_error != NULL` with a descriptive message, both buffers NULL, **no panic across the boundary** (AC6.2). | +| B6 | Pass a NULL `out_layout` pointer. | `out_error` set, no crash. | + +## Phase C: AC3.3 deliberate-regression confidence check + +The gate is automated; the deliberate break is a manual confidence step. **Do not commit the edit.** + +| Step | Action | Expected | +|------|--------|----------| +| C1 | Temporarily edit `src/simlin-engine/src/wasmgen/lower.rs` so a common opcode (e.g. the `Op2::Add` arm) returns `WasmGenError::Unsupported(...)`. | -- | +| C2 | `cargo test -p simlin-engine --features file_io --test simulate` | **Fails**: `wasm_parity_floor` and the per-model `wasm_parity_hook` panic, listing the now-unsupported models (AC3.2/AC3.3). | +| C3 | `cargo test -p simlin-engine --features file_io --test simulate_systems` | **Fails**: `wasm_systems_parity_floor` panics. | +| C4 | `git checkout -- src/simlin-engine/src/wasmgen/lower.rs`; re-run C2/C3. | Back to green. | + +## Phase D (optional): AC8.1 coverage measurement + +| Step | Action | Expected | +|------|--------|----------| +| D1 | `cargo llvm-cov -p simlin-engine --features file_io --lib -- wasmgen` (or the repo's configured coverage command); read `src/wasmgen/*` line/region coverage. | `wasmgen/` aggregate >=95%. Pins the AC8.1 number the suite establishes structurally (per-opcode TDD) but does not assert in CI. 
| + +## Traceability + +Every acceptance criterion is covered by an automated test (see the test-analysis mapping); the manual steps above add real-host / heavy-model / deliberate-regression confidence on top: + +| AC | Manual step(s) | AC | Manual step(s) | +|----|----------------|----|----------------| +| AC1.3 | A1 | AC6.1 | B1-B4 | +| AC1.1/AC7.4 (scale) | A2 | AC6.2 | B5-B6 | +| AC1.4 | B5 | AC3.3 | C1-C4 | +| AC4.1/AC4.2 | B3 | AC8.1 | D1 (optional) | + +All other ACs (AC1.2, AC1.5, AC2.1, AC2.2, AC3.1, AC3.2, AC5.1, AC5.2, AC7.1, AC7.2, AC7.3, AC8.2) are fully covered by automated tests and need no manual step. diff --git a/src/engine/wasm-backend-poc.mjs b/src/engine/wasm-backend-poc.mjs new file mode 100644 index 000000000..3780797bb --- /dev/null +++ b/src/engine/wasm-backend-poc.mjs @@ -0,0 +1,280 @@ +// Throwaway proof-of-concept for the compile-to-WebAssembly backend. +// +// Demonstrates the "direct-drive" architecture end to end in Node: +// 1. load libsimlin.wasm (the engine, compiled to wasm) +// 2. open default_projects/population/model.xmile and get its model +// 3. call simlin_model_compile_to_wasm -> a *second* wasm module (the model) +// 4. JS instantiates that model module directly and drives its `run` export +// (libsimlin is not on the per-run hot path) +// 5. check every VM variable's series shows up as a column of the blob's +// results, and compare run-to-run timing of the blob vs the bytecode VM. +// +// Run: node src/engine/wasm-backend-poc.mjs +// +// This file is exploratory scaffolding, not part of the @simlin/engine API. 
+ +import { readFileSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; +import { performance } from 'node:perf_hooks'; + +const here = dirname(fileURLToPath(import.meta.url)); +const WASM = join(here, 'core', 'libsimlin.wasm'); +const MODEL = join(here, '..', '..', 'default_projects', 'population', 'model.xmile'); + +// ── load libsimlin (mirrors src/engine/src/internal/wasm.node.ts) ────────── +let memory = new WebAssembly.Memory({ initial: 256, maximum: 16384 }); +const lib = await WebAssembly.instantiate(await WebAssembly.compile(readFileSync(WASM)), { + env: { memory }, +}); +const E = lib.exports; +if (E.memory instanceof WebAssembly.Memory) memory = E.memory; +E.simlin_init?.(); + +if (typeof E.simlin_model_compile_to_wasm !== 'function') { + throw new Error('libsimlin.wasm is stale: missing simlin_model_compile_to_wasm (rebuild it)'); +} + +// ── minimal FFI glue (re-derived per call so memory growth is handled) ───── +const TD = new TextDecoder(); +const TE = new TextEncoder(); +const dv = () => new DataView(memory.buffer); +const malloc = (n) => { + const p = E.simlin_malloc(n); + if (!p && n) throw new Error('wasm allocation failed'); + return p; +}; +const free = (p) => { + if (p) E.simlin_free(p); +}; +const u32 = (p) => dv().getUint32(p, true); +const outPtr = () => { + const p = malloc(4); + dv().setUint32(p, 0, true); + return p; +}; +const writeBytes = (bytes) => { + const p = malloc(bytes.length); + new Uint8Array(memory.buffer, p, bytes.length).set(bytes); + return p; +}; +const cstr = (s) => writeBytes(TE.encode(s + '\0')); +const readBytes = (p, n) => new Uint8Array(memory.buffer.slice(p, p + n)); +const readCStr = (p) => { + const v = new Uint8Array(memory.buffer); + let e = p; + while (v[e]) e++; + return TD.decode(v.slice(p, e)); +}; +const f64Array = (p, n) => { + const d = dv(); + const out = new Float64Array(n); + for (let i = 0; i < n; i++) out[i] = d.getFloat64(p + i * 8, true); 
+ return out; +}; +function checkErr(ep, what) { + const err = u32(ep); + if (err !== 0) { + let msg = '(no message)'; + const mp = E.simlin_error_get_message(err); + if (mp) msg = readCStr(mp); + E.simlin_error_free(err); + throw new Error(`${what}: ${msg}`); + } +} + +// ── open population, get its model, extract the compiled-model wasm ──────── +const xmile = readFileSync(MODEL); +let dataPtr = writeBytes(xmile); +let ep = outPtr(); +const project = E.simlin_project_open_xmile(dataPtr, xmile.length, ep); +checkErr(ep, 'open_xmile'); +free(ep); +free(dataPtr); + +const namePtr = cstr('main'); +ep = outPtr(); +const model = E.simlin_project_get_model(project, namePtr, ep); +checkErr(ep, 'get_model'); +free(ep); +free(namePtr); + +const outBuf = outPtr(); +const outLen = outPtr(); +const outLayout = outPtr(); +const outLayoutLen = outPtr(); +ep = outPtr(); +// New 6-arg signature: returns the wasm blob AND a serialized WasmLayout +// (name -> slot offset map + geometry), each via the malloc-return convention. +E.simlin_model_compile_to_wasm(model, outBuf, outLen, outLayout, outLayoutLen, ep); +checkErr(ep, 'compile_to_wasm'); +const blobPtr = u32(outBuf); +const blobLen = u32(outLen); +const blob = readBytes(blobPtr, blobLen); +const layoutPtr = u32(outLayout); +const layoutLen = u32(outLayoutLen); +const layoutBytes = readBytes(layoutPtr, layoutLen); +free(blobPtr); +free(layoutPtr); +free(outBuf); +free(outLen); +free(outLayout); +free(outLayoutLen); +free(ep); +console.log(`compiled model -> ${blobLen} bytes of WebAssembly + ${layoutLen}-byte layout`); + +// Parse the serialized WasmLayout (little-endian): n_slots, n_chunks, +// results_offset (u64 each), count (u32), then per entry name_len (u32) + +// UTF-8 name + offset (u64). This is the same name->offset map the engine +// exposes, so a host can read a variable's series by name with no guessing. 
+function parseLayout(bytes) { + const d = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength); + let p = 0; + const u64 = () => { + const v = Number(d.getBigUint64(p, true)); + p += 8; + return v; + }; + const u32le = () => { + const v = d.getUint32(p, true); + p += 4; + return v; + }; + const nSlots = u64(); + const nChunks = u64(); + const resultsOffset = u64(); + const count = u32le(); + const varOffsets = new Map(); + for (let i = 0; i < count; i++) { + const nameLen = u32le(); + const name = TD.decode(bytes.slice(p, p + nameLen)); + p += nameLen; + varOffsets.set(name, u64()); + } + return { nSlots, nChunks, resultsOffset, varOffsets }; +} +const layout = parseLayout(layoutBytes); +console.log(`layout: ${layout.varOffsets.size} named variables`); + +// ── direct-drive: JS instantiates the model blob and calls run() ────────── +const { instance: mi } = await WebAssembly.instantiate(blob, {}); +const ME = mi.exports; +const nSlots = ME.n_slots.value; +const nChunks = ME.n_chunks.value; +const resultsOffset = ME.results_offset.value; +console.log(`blob self-describes: n_slots=${nSlots}, n_chunks=${nChunks}, results_offset=${resultsOffset}`); + +ME.run(); +const blobColumn = (col) => { + const d = new DataView(ME.memory.buffer); + const s = new Float64Array(nChunks); + for (let c = 0; c < nChunks; c++) s[c] = d.getFloat64(resultsOffset + (c * nSlots + col) * 8, true); + return s; +}; +const blobCols = Array.from({ length: nSlots }, (_, c) => blobColumn(c)); + +// ── VM golden via libsimlin ──────────────────────────────────────────────── +ep = outPtr(); +const sim = E.simlin_sim_new(model, 0, ep); +checkErr(ep, 'sim_new'); +free(ep); +ep = outPtr(); +E.simlin_sim_run_to_end(sim, ep); +checkErr(ep, 'run_to_end'); +free(ep); + +const vmSeries = (name) => { + const np = cstr(name); + const rp = malloc(nChunks * 8); + const wp = outPtr(); + const e = outPtr(); + E.simlin_sim_get_series(sim, np, rp, nChunks, wp, e); + checkErr(e, `get_series(${name})`); + 
const written = u32(wp); + const s = f64Array(rp, written); + free(np); + free(rp); + free(wp); + free(e); + return s; +}; + +// ── correctness: match every VM variable's series to a blob column ───────── +console.log('\ncorrectness (each VM variable matched to a blob column by value):'); +const vars = ['time', 'population', 'births', 'deaths', 'birth_rate', 'average_lifespan']; +let worst = 0; +for (const name of vars) { + let vm; + try { + vm = vmSeries(name); + } catch (e) { + console.log(` ${name.padEnd(18)} (skipped: ${e.message})`); + continue; + } + let best = Infinity; + let bestCol = -1; + for (let col = 0; col < nSlots; col++) { + let m = 0; + for (let c = 0; c < vm.length; c++) m = Math.max(m, Math.abs(vm[c] - blobCols[col][c])); + if (m < best) { + best = m; + bestCol = col; + } + } + worst = Math.max(worst, best); + console.log(` ${name.padEnd(18)} -> blob column ${bestCol}, max|Δ| = ${best.toExponential(2)}`); +} +console.log(`worst mismatch across variables: ${worst.toExponential(2)} -> ${worst < 1e-9 ? 'MATCH' : 'FAIL'}`); + +// ── by-name reads via the layout (no brute-force column matching) ────────── +// The layout's name -> offset map lets a host read a variable's series directly, +// striding the results region by `n_slots`. Verify it agrees with the VM. +console.log('\nby-name reads via the returned layout:'); +let worstByName = 0; +for (const name of vars) { + let vm; + try { + vm = vmSeries(name); + } catch { + continue; + } + const off = layout.varOffsets.get(name); + if (off === undefined) { + console.log(` ${name.padEnd(18)} (not in layout)`); + continue; + } + const series = blobColumn(off); + let m = 0; + for (let c = 0; c < vm.length; c++) m = Math.max(m, Math.abs(vm[c] - series[c])); + worstByName = Math.max(worstByName, m); + console.log(` ${name.padEnd(18)} -> layout offset ${off}, max|Δ| = ${m.toExponential(2)}`); +} +console.log(`worst by-name mismatch: ${worstByName.toExponential(2)} -> ${worstByName < 1e-9 ? 
'MATCH' : 'FAIL'}`); + +const pop = vmSeries('population'); +console.log(`\npopulation: ${pop[0].toFixed(2)} (t=start) ... ${pop[pop.length - 1].toFixed(2)} (t=stop), ${pop.length} steps`); + +// ── timing: blob run() vs VM reset+run_to_end (both re-simulate from t0) ─── +console.log('\ntiming (each call re-runs the whole simulation):'); +const NB = 5000; +let t = performance.now(); +for (let i = 0; i < NB; i++) ME.run(); +const blobMs = (performance.now() - t) / NB; + +const NV = 500; +t = performance.now(); +for (let i = 0; i < NV; i++) { + const e1 = outPtr(); + E.simlin_sim_reset(sim, e1); + checkErr(e1, 'reset'); + free(e1); + const e2 = outPtr(); + E.simlin_sim_run_to_end(sim, e2); + checkErr(e2, 'run_to_end'); + free(e2); +} +const vmMs = (performance.now() - t) / NV; + +console.log(` blob run(): ${blobMs.toFixed(5)} ms/run (${NB} runs)`); +console.log(` VM reset+run_to_end: ${vmMs.toFixed(5)} ms/run (${NV} runs)`); +console.log(` blob is ${(vmMs / blobMs).toFixed(1)}x faster per re-simulation`); diff --git a/src/libsimlin/CLAUDE.md b/src/libsimlin/CLAUDE.md index c88158a47..b29edbd4b 100644 --- a/src/libsimlin/CLAUDE.md +++ b/src/libsimlin/CLAUDE.md @@ -40,6 +40,7 @@ All public FFI functions are prefixed with `simlin_` and declared `extern "C"`. - **`src/model.rs`** - Inspect model structure: - `simlin_model_{ref,unref}()`, `simlin_model_get_var_count()`, `simlin_model_get_var_names()` - `simlin_model_get_dependencies()`, `simlin_model_get_links()`, `simlin_model_get_equations()` + - `simlin_model_compile_to_wasm()` - Compile the model to a self-contained wasm module (engine `wasmgen` backend, an alternative to the VM for fast repeated re-simulation). Returns two malloc'd buffers, each freed with `simlin_free`: the wasm blob and a serialized `WasmLayout` (length-prefixed, little-endian: geometry `n_slots`/`n_chunks`/`results_offset` then a canonical-name -> slot-offset map a host strides the results region with). 
Works from a `SimlinModel`'s datamodel alone -- no `SimlinSim` required -- and stores a `SimlinError` (never panics) on any compile/codegen failure ### Serialization @@ -81,6 +82,7 @@ Integration tests live in `tests/` (standard Rust layout), organized by FFI modu - **`tests/patch.rs`** - JSON patch application, error collection, unit warnings, XMILE patches - **`tests/incremental.rs`** - Incremental compilation path (patch-then-sim, snapshot isolation) - **`tests/analysis.rs`** - Causal analysis: incoming links, loop detection, loop scores +- **`tests/wasm.rs`** - `simlin_model_compile_to_wasm`: validates and executes the returned blob under the DLR-FT interpreter (a libsimlin dev-dependency), parses the returned layout per its documented wire format, and checks the strided series against the VM via `simlin_sim_get_series`; also asserts a graceful `SimlinError` (no panic) for an unsupported model - **`tests/rendering.rs`** - SVG and PNG diagram rendering - **`tests/diagram.rs`** - Diagram layout sync - **`tests/errors.rs`** - Error formatting, error kind mapping, diagnostics diff --git a/src/libsimlin/Cargo.toml b/src/libsimlin/Cargo.toml index 1585091dd..51c664d54 100644 --- a/src/libsimlin/Cargo.toml +++ b/src/libsimlin/Cargo.toml @@ -31,6 +31,14 @@ anyhow = "1.0" mimalloc = { version = "0.1", optional = true } [dev-dependencies] +# Pure-Rust no_std wasm interpreter (the same DLR-FT rev simlin-engine pins), +# used by the `tests/wasm.rs` integration test to validate and execute the blob +# `simlin_model_compile_to_wasm` returns and check it against the returned +# layout. Dev-only: dev-dependencies are never built into the cdylib/staticlib +# or the wasm32 bundle (which uses --no-default-features), so this cannot leak +# into the shipped library. 
+wasm-interpreter = { git = "https://github.com/DLR-FT/wasm-interpreter.git", rev = "64cedbba603edfd64cbb6b5a19f5fa34530bb03a" } +checked = { git = "https://github.com/DLR-FT/wasm-interpreter.git", rev = "64cedbba603edfd64cbb6b5a19f5fa34530bb03a", features = ["linker", "interop"] } [package.metadata.wasm-pack.profile.release] wasm-opt = false diff --git a/src/libsimlin/simlin.h b/src/libsimlin/simlin.h index 6d8f24276..0623ec37e 100644 --- a/src/libsimlin/simlin.h +++ b/src/libsimlin/simlin.h @@ -349,6 +349,41 @@ void simlin_free(uint8_t *ptr); // - `s` must be a valid pointer returned by simlin API functions that return strings void simlin_free_string(char *s); +// Compile the model to a self-contained WebAssembly module plus its layout. +// +// The emitted module exports its own linear `memory` and a `run` function +// that executes the whole simulation in one call, writing step-major result +// snapshots into a results region of its memory. This is an alternative to +// the bytecode VM intended for fast, repeated re-simulation (e.g. interactive +// parameter scrubbing): the host instantiates the module once and calls `run` +// on every change. +// +// Two buffers are returned via the malloc-return convention, each freed +// separately with `simlin_free`: +// - `out_wasm`/`out_wasm_len`: the wasm blob. +// - `out_layout`/`out_layout_len`: a self-describing, length-prefixed layout +// buffer (all integers little-endian): `n_slots` (u64), `n_chunks` (u64), +// `results_offset` (u64), `count` (u32), then per entry `name_len` (u32) + +// UTF-8 name + `offset` (u64). A host strides one variable's `n_chunks`-long +// series from the results region using `results_offset`, `n_slots`, and the +// variable's `offset` from this map. +// +// Works from the model's datamodel alone -- no `SimlinSim` is required. Any +// compile or codegen failure stores a `SimlinError` (never panics across the +// boundary) and leaves both output buffers NULL. 
+// +// # Safety +// - `model` must be a valid pointer to a SimlinModel +// - `out_wasm`, `out_wasm_len`, `out_layout`, and `out_layout_len` must be +// valid, non-null pointers +// - `out_error` may be null +void simlin_model_compile_to_wasm(SimlinModel *model, + uint8_t **out_wasm, + uintptr_t *out_wasm_len, + uint8_t **out_layout, + uintptr_t *out_layout_len, + SimlinError **out_error); + // Increments the reference count of a model // // # Safety diff --git a/src/libsimlin/src/model.rs b/src/libsimlin/src/model.rs index 8c5472b2a..d830a7b3b 100644 --- a/src/libsimlin/src/model.rs +++ b/src/libsimlin/src/model.rs @@ -85,6 +85,118 @@ unsafe fn write_bytes_to_ffi_output( true } +/// Compile the model to a self-contained WebAssembly module plus its layout. +/// +/// The emitted module exports its own linear `memory` and a `run` function +/// that executes the whole simulation in one call, writing step-major result +/// snapshots into a results region of its memory. This is an alternative to +/// the bytecode VM intended for fast, repeated re-simulation (e.g. interactive +/// parameter scrubbing): the host instantiates the module once and calls `run` +/// on every change. +/// +/// Two buffers are returned via the malloc-return convention, each freed +/// separately with `simlin_free`: +/// - `out_wasm`/`out_wasm_len`: the wasm blob. +/// - `out_layout`/`out_layout_len`: a self-describing, length-prefixed layout +/// buffer (all integers little-endian): `n_slots` (u64), `n_chunks` (u64), +/// `results_offset` (u64), `count` (u32), then per entry `name_len` (u32) + +/// UTF-8 name + `offset` (u64). A host strides one variable's `n_chunks`-long +/// series from the results region using `results_offset`, `n_slots`, and the +/// variable's `offset` from this map. +/// +/// Works from the model's datamodel alone -- no `SimlinSim` is required. 
Any +/// compile or codegen failure stores a `SimlinError` (never panics across the +/// boundary) and leaves both output buffers NULL. +/// +/// # Safety +/// - `model` must be a valid pointer to a SimlinModel +/// - `out_wasm`, `out_wasm_len`, `out_layout`, and `out_layout_len` must be +/// valid, non-null pointers +/// - `out_error` may be null +#[no_mangle] +pub unsafe extern "C" fn simlin_model_compile_to_wasm( + model: *mut SimlinModel, + out_wasm: *mut *mut u8, + out_wasm_len: *mut usize, + out_layout: *mut *mut u8, + out_layout_len: *mut usize, + out_error: *mut *mut SimlinError, +) { + clear_out_error(out_error); + if out_wasm.is_null() + || out_wasm_len.is_null() + || out_layout.is_null() + || out_layout_len.is_null() + { + store_error( + out_error, + SimlinError::new(SimlinErrorCode::Generic) + .with_message("output pointers must not be NULL"), + ); + return; + } + *out_wasm = ptr::null_mut(); + *out_wasm_len = 0; + *out_layout = ptr::null_mut(); + *out_layout_len = 0; + + let model_ref = match require_model(model) { + Ok(m) => m, + Err(err) => { + store_anyhow_error(out_error, err); + return; + } + }; + + // The compiled-model wasm is regenerated from the project's datamodel; it + // does not depend on the VM `SimState`, so this works even before a + // `SimlinSim` has been created for the model. + let project_ref = &*model_ref.project; + // A poisoned lock (another thread panicked while holding it) is reported as + // a `SimlinError` rather than unwrapped, so no panic crosses the FFI boundary. + let datamodel = match project_ref.datamodel.lock() { + Ok(guard) => guard, + Err(_) => { + store_error( + out_error, + SimlinError::new(SimlinErrorCode::Generic) + .with_message("project datamodel lock is poisoned"), + ); + return; + } + }; + + let artifact = match engine::wasmgen::compile_datamodel_to_artifact( + &datamodel, + model_ref.model_name.as_str(), + ) { + Ok(artifact) => artifact, + Err(err) => { + store_error( + out_error, + SimlinError::new(SimlinErrorCode::Generic) + .with_message(format!("wasm code generation failed: {err}")), + ); + return; + } + }; + + let layout_bytes = artifact.layout.serialize(); + + // Write the wasm blob first. On its allocation failure `write_bytes_to_ffi_output` + // stores the error and returns false; bail before touching the layout buffer.
+ if !write_bytes_to_ffi_output( + &artifact.wasm, + out_wasm, + out_wasm_len, + out_error, + "model wasm", + ) { + return; + } + // If the layout allocation fails, free the wasm buffer already handed out so + // the caller is never left with one buffer set and the other NULL-but-leaked. + if !write_bytes_to_ffi_output( + &layout_bytes, + out_layout, + out_layout_len, + out_error, + "model wasm layout", + ) { + crate::memory::simlin_free(*out_wasm); + *out_wasm = ptr::null_mut(); + *out_wasm_len = 0; + } +} + /// Find a model by name in a locked datamodel. pub(crate) fn find_model_in_datamodel<'a>( datamodel: &'a MutexGuard<'_, datamodel::Project>, diff --git a/src/libsimlin/tests/wasm.rs b/src/libsimlin/tests/wasm.rs new file mode 100644 index 000000000..21c1c3381 --- /dev/null +++ b/src/libsimlin/tests/wasm.rs @@ -0,0 +1,347 @@ +// Copyright 2026 The Simlin Authors. All rights reserved. +// Use of this source code is governed by the Apache License, +// Version 2.0, that can be found in the LICENSE file. + +//! FFI integration tests for `simlin_model_compile_to_wasm`. +//! +//! These exercise the host-facing contract: the function returns a valid wasm +//! blob plus a self-describing, length-prefixed layout buffer (both freeable +//! with `simlin_free`), works from a `SimlinModel` alone (no `SimlinSim`), and +//! surfaces a `SimlinError` -- never a panic -- for a model the wasm backend +//! cannot compile. The blob is validated and executed under the same DLR-FT +//! interpreter the engine's own wasmgen tests use, and the series a host would +//! stride from the results region (using only the returned layout) is checked +//! against the bytecode VM via `simlin_sim_get_series`. + +mod common; + +use std::ptr; + +use checked::Store; +use common::open_project_from_datamodel; +use simlin::*; +use simlin_engine::test_common::TestProject; +use wasm::validate; + +/// A small scalar stock-and-flow model: a constant inflow fills a stock.
Used as +/// the supported-model fixture (it runs through the wasm backend cleanly). +fn simple_model() -> simlin_engine::datamodel::Project { + TestProject::new("ffi_wasm") + .with_sim_time(0.0, 10.0, 1.0) + .aux("inflow_rate", "2", None) + .stock("level", "0", &["inflow"], &[], None) + .flow("inflow", "inflow_rate", None) + .build_datamodel() +} + +/// The host-side layout parse, mirroring the documented little-endian wire +/// format (`n_slots`/`n_chunks`/`results_offset` u64, `count` u32, then per entry +/// `name_len` u32 + UTF-8 name + `offset` u64). Returns the geometry and the +/// name->offset map. +struct ParsedLayout { + n_slots: usize, + n_chunks: usize, + results_offset: usize, + var_offsets: Vec<(String, usize)>, +} + +fn parse_layout(bytes: &[u8]) -> ParsedLayout { + let mut pos = 0usize; + let read_u64 = |pos: &mut usize| -> u64 { + let v = u64::from_le_bytes(bytes[*pos..*pos + 8].try_into().unwrap()); + *pos += 8; + v + }; + let read_u32 = |pos: &mut usize| -> u32 { + let v = u32::from_le_bytes(bytes[*pos..*pos + 4].try_into().unwrap()); + *pos += 4; + v + }; + let n_slots = read_u64(&mut pos) as usize; + let n_chunks = read_u64(&mut pos) as usize; + let results_offset = read_u64(&mut pos) as usize; + let count = read_u32(&mut pos) as usize; + let mut var_offsets = Vec::with_capacity(count); + for _ in 0..count { + let name_len = read_u32(&mut pos) as usize; + let name = String::from_utf8(bytes[pos..pos + name_len].to_vec()).unwrap(); + pos += name_len; + let offset = read_u64(&mut pos) as usize; + var_offsets.push((name, offset)); + } + assert_eq!(pos, bytes.len(), "layout buffer had trailing bytes"); + ParsedLayout { + n_slots, + n_chunks, + results_offset, + var_offsets, + } +} + +/// AC6.1: `simlin_model_compile_to_wasm` returns a valid wasm blob plus the +/// name->offset layout via the malloc-return convention; both buffers free with +/// `simlin_free`; it works from a `SimlinModel` with no `SimlinSim`. 
+#[test] +fn compile_to_wasm_returns_blob_and_layout() { + let datamodel = simple_model(); + unsafe { + let project = open_project_from_datamodel(&datamodel); + let model_name = std::ffi::CString::new("main").unwrap(); + let mut err: *mut SimlinError = ptr::null_mut(); + // No SimlinSim is ever created -- the model handle alone must suffice. + let model = simlin_project_get_model(project, model_name.as_ptr(), &mut err); + assert!(err.is_null(), "get_model should not error"); + assert!(!model.is_null(), "model handle must be non-null"); + + let mut out_wasm: *mut u8 = ptr::null_mut(); + let mut out_wasm_len: usize = 0; + let mut out_layout: *mut u8 = ptr::null_mut(); + let mut out_layout_len: usize = 0; + let mut err: *mut SimlinError = ptr::null_mut(); + simlin_model_compile_to_wasm( + model, + &mut out_wasm, + &mut out_wasm_len, + &mut out_layout, + &mut out_layout_len, + &mut err, + ); + assert!( + err.is_null(), + "compile_to_wasm should not error on a supported model" + ); + assert!( + !out_wasm.is_null() && out_wasm_len > 0, + "wasm blob must be non-empty" + ); + assert!( + !out_layout.is_null() && out_layout_len > 0, + "layout buffer must be non-empty" + ); + + // The wasm blob validates under the interpreter. + let wasm = std::slice::from_raw_parts(out_wasm, out_wasm_len).to_vec(); + validate(&wasm).expect("returned blob must validate"); + + // The layout deserializes to the expected geometry + name->offset map. + let layout_bytes = std::slice::from_raw_parts(out_layout, out_layout_len).to_vec(); + let layout = parse_layout(&layout_bytes); + assert!( + layout.n_slots >= 4, + "scalar model has at least the 4 reserved slots" + ); + // dt=1 over [0,10] -> 11 saved samples. + assert_eq!(layout.n_chunks, 11, "n_chunks should match the sim specs"); + // The results region sits two chunks past the start of memory (curr+next). 
+ assert_eq!( + layout.results_offset, + 2 * layout.n_slots * 8, + "results_offset = 2 chunks (curr + next) past byte 0" + ); + for name in ["level", "inflow", "inflow_rate"] { + assert!( + layout.var_offsets.iter().any(|(n, _)| n == name), + "{name} must appear in the layout name->offset map" + ); + } + // Offsets are within a chunk. + for (name, off) in &layout.var_offsets { + assert!( + *off < layout.n_slots, + "{name} offset {off} must be < n_slots" + ); + } + + // Run the blob and stride `level`'s series using only the layout, then + // check it against the VM's series. + let level_off = layout + .var_offsets + .iter() + .find(|(n, _)| n == "level") + .map(|(_, o)| *o) + .unwrap(); + let blob_level = run_and_stride(&wasm, &layout, level_off); + // level integrates by 2/step: 0, 2, 4, ..., 20. + assert!((blob_level[0]).abs() < 1e-9, "level starts at 0"); + assert!( + (blob_level[blob_level.len() - 1] - 20.0).abs() < 1e-9, + "level reaches 20 by the last step, got {}", + blob_level[blob_level.len() - 1] + ); + let vm_level = vm_series(project, &model_name, "level", layout.n_chunks); + assert_eq!(blob_level.len(), vm_level.len()); + for (c, (&b, &v)) in blob_level.iter().zip(vm_level.iter()).enumerate() { + assert!((b - v).abs() < 1e-9, "level chunk {c}: blob {b} != vm {v}"); + } + + // Both buffers free with simlin_free without leaking or double-free. + simlin_free(out_wasm); + simlin_free(out_layout); + + simlin_model_unref(model); + simlin_project_unref(project); + } +} + +/// AC6.2: a model the wasm backend cannot compile surfaces a `SimlinError` +/// (out_error is set, both buffers stay NULL), never a panic across the FFI +/// boundary. `SUM(source[lo:hi])` with variable bounds lowers to a runtime view +/// range the fully-unrolled emitter cannot express. 
+#[test] +fn compile_to_wasm_unsupported_model_surfaces_error() { + let datamodel = TestProject::new("ffi_wasm_unsupported") + .with_sim_time(0.0, 5.0, 1.0) + .indexed_dimension("A", 5) + .array_aux("source[A]", "A") + .scalar_aux("lo", "2") + .scalar_aux("hi", "4") + .scalar_aux("total", "SUM(source[lo:hi])") + .build_datamodel(); + unsafe { + let project = open_project_from_datamodel(&datamodel); + let model_name = std::ffi::CString::new("main").unwrap(); + let mut err: *mut SimlinError = ptr::null_mut(); + let model = simlin_project_get_model(project, model_name.as_ptr(), &mut err); + assert!(err.is_null()); + assert!(!model.is_null()); + + let mut out_wasm: *mut u8 = ptr::null_mut(); + let mut out_wasm_len: usize = 0; + let mut out_layout: *mut u8 = ptr::null_mut(); + let mut out_layout_len: usize = 0; + let mut err: *mut SimlinError = ptr::null_mut(); + simlin_model_compile_to_wasm( + model, + &mut out_wasm, + &mut out_wasm_len, + &mut out_layout, + &mut out_layout_len, + &mut err, + ); + + assert!(!err.is_null(), "an unsupported model must set out_error"); + // The message names the unsupported construct (no panic, a clean error). + let msg_ptr = simlin_error_get_message(err); + assert!(!msg_ptr.is_null(), "the error must carry a message"); + let msg = std::ffi::CStr::from_ptr(msg_ptr).to_str().unwrap(); + assert!( + msg.contains("ViewRangeDynamic") || msg.contains("code generation failed"), + "error message should describe the codegen failure, got: {msg}" + ); + // Both output buffers stay NULL on failure. + assert!( + out_wasm.is_null() && out_wasm_len == 0, + "wasm buffer stays NULL on error" + ); + assert!( + out_layout.is_null() && out_layout_len == 0, + "layout buffer stays NULL on error" + ); + + simlin_error_free(err); + simlin_model_unref(model); + simlin_project_unref(project); + } +} + +/// NULL output pointers are rejected with an error rather than a crash. 
+#[test] +fn compile_to_wasm_null_outputs_error() { + let datamodel = simple_model(); + unsafe { + let project = open_project_from_datamodel(&datamodel); + let model_name = std::ffi::CString::new("main").unwrap(); + let mut err: *mut SimlinError = ptr::null_mut(); + let model = simlin_project_get_model(project, model_name.as_ptr(), &mut err); + assert!(!model.is_null()); + + let mut out_wasm: *mut u8 = ptr::null_mut(); + let mut out_wasm_len: usize = 0; + let mut out_layout_len: usize = 0; + let mut err: *mut SimlinError = ptr::null_mut(); + // A NULL out_layout pointer must be rejected. + simlin_model_compile_to_wasm( + model, + &mut out_wasm, + &mut out_wasm_len, + ptr::null_mut(), + &mut out_layout_len, + &mut err, + ); + assert!(!err.is_null(), "a NULL output pointer must set out_error"); + simlin_error_free(err); + + simlin_model_unref(model); + simlin_project_unref(project); + } +} + +/// Instantiate `wasm` under the interpreter, invoke `run`, and stride out the +/// `n_chunks`-long series for the variable at `off` (using only the layout). +fn run_and_stride(wasm: &[u8], layout: &ParsedLayout, off: usize) -> Vec<f64> { + let info = validate(wasm).expect("validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let run = store + .instance_export(inst, "run") + .unwrap() + .as_func() + .unwrap(); + store.invoke_simple_typed::<(), ()>(run, ()).expect("run"); + let mem = store + .instance_export(inst, "memory") + .unwrap() + .as_mem() + .unwrap(); + let base = layout.results_offset; + let n_slots = layout.n_slots; + store.mem_access_mut_slice(mem, |bytes| { + (0..layout.n_chunks) + .map(|c| { + let a = base + (c * n_slots + off) * 8; + f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap()) + }) + .collect() + }) +} + +/// The VM's series for `name` via `simlin_sim_new` + `simlin_sim_get_series`.
+unsafe fn vm_series( + project: *mut SimlinProject, + model_name: &std::ffi::CStr, + name: &str, + n_chunks: usize, +) -> Vec<f64> { + let mut err: *mut SimlinError = ptr::null_mut(); + let model = simlin_project_get_model(project, model_name.as_ptr(), &mut err); + assert!(err.is_null()); + let sim = simlin_sim_new(model, false, &mut err); + assert!( + err.is_null(), + "sim_new should succeed for a supported model" + ); + simlin_sim_run_to_end(sim, &mut err); + assert!(err.is_null(), "run_to_end should succeed"); + + let name_c = std::ffi::CString::new(name).unwrap(); + let mut results = vec![0.0f64; n_chunks]; + let mut written: usize = 0; + let mut err: *mut SimlinError = ptr::null_mut(); + simlin_sim_get_series( + sim, + name_c.as_ptr(), + results.as_mut_ptr(), + n_chunks, + &mut written, + &mut err, + ); + assert!(err.is_null(), "get_series should succeed"); + results.truncate(written); + + simlin_sim_unref(sim); + simlin_model_unref(model); + results +} diff --git a/src/simlin-engine/CLAUDE.md b/src/simlin-engine/CLAUDE.md index a51188a93..ce60d3e82 100644 --- a/src/simlin-engine/CLAUDE.md +++ b/src/simlin-engine/CLAUDE.md @@ -2,8 +2,6 @@ Core simulation engine for system dynamics models. Compiles, type-checks, unit-checks, and simulates SD models. See the root `CLAUDE.md` for full development guidelines; this file maps where functionality lives. -**Last updated: 2026-05-21 (unit-inference robustness, GH #614 + the `units.rs:263` TODO: the unit subsystem no longer fails all-or-nothing.
`units_infer::infer` returns `InferenceResult { resolved, conflicts }` -- a dimensional conflict no longer discards the units already resolved (a contradiction is confined to its connected component, since substitution only flows along shared metavariables), and `find_constraint_mismatches` collects every residual contradiction rather than the first; `db_units::check_model_units` keeps the resolved units (so the rest of the model is still checked) and surfaces conflicts as ONE umbrella warning, not one-per-conflict. `gen_constraints` is now total (returns `Units`, not a vestigial always-`Ok` `UnitResult`; removed the `.unwrap()` panic landmine and the `None`-arm gap that dropped declared-units propagation for equation-less variables). `units::Context::new`/`new_with_builtins` return `(Context, Vec)` instead of discarding the whole built context on the first duplicate/conflicting declaration (an empty context lost project-wide alias normalization). RANK results are dimensionless in both `units_infer` and `units_check` (an ordinal index, not the ranked array's units). `ModelStage0`/`ModelStage1` gain `is_macro`: a Vensim macro's body-variable units may name the formal parameters (`~ xfrom` inside RAMP FROM TO -- a polymorphic unit, not a base unit), so inference skips declared-units constraints for macro bodies; without this, keeping the resolved map re-floods C-LEARN (the `xfrom`/`xto` leak), with it C-LEARN holds its documented 14-diagnostic residual. New tests in `units_infer.rs` (partial-results-survive-conflict, declared-units-without-equation propagation, macro-polymorphic-units, RANK dimensionless), `units.rs` (partial-context), `unit_checking_test.rs` (RANK-in-checking).) 
Earlier 2026-05-21 (#606: a standalone lookup-only variable -- a graphical-function holder with no functional input -- is now a non-value-bearing **static table** (`Variable::Var::is_table_only` / `db::source_var_is_table_only`), NOT a runtime variable: excluded from the runlist and the saved output (produces no series), its data reached only via `LOOKUP(table, x)` call sites. A table reference is kept off the data-flow dependency graph by a dedicated `builtins::BuiltinContents::LookupTable` walk variant; `referenced_tables` on `VariableDeps`/`ImplicitVarDeps` re-supplies the fragment compiler's metadata + tables map. A bare reference (no argument) is a compile error (`ErrorCode::LookupReferencedWithoutArgument`, emitted in `db_var_fragment::lower_var_fragment`). The MDL importer emits the canonical empty-equation form; the `"0+0"` `LOOKUP_SENTINEL` is still ACCEPTED on read (now produced only for empty-RHS vars). The shared lookup-only predicate moved to `src/variable.rs`. Retired the `gf(Time)` lowering (`lookup_only_index_expr`, `LookupOnlyLayout`). C-LEARN `EXPECTED_VDF_RESIDUAL` shrank 13->4. SUPERSEDES the #590 `gf(Time)` primitive (1) below.) 
Earlier (2026-05-20): C-LEARN residual closure (#590/#591), as five general Vensim import/simulation primitives, not model-specific patches: (1) a standalone lookup-only variable -- a graphical-function holder with no functional input -- lowers uniformly to `gf(Time)` across scalar/A2A/arrayed shapes (`src/compiler/mod.rs`: `var_is_lookup_only`/`is_lookup_only`/`lookup_only_index_expr` + `LookupOnlyLayout { PerElement, Shared }`); (2) a genuine passthrough macro `:MACRO: INIT(x) = INITIAL(x)` collapses to the builtin opcode at the call site (`module_functions.rs`: `classify_passthrough` + a `passthrough: Option` field on `ModuleFunctionDescriptor`, classified at `MacroRegistry::build`), instead of expanding a buggy per-element synthetic module; (3) the import-time XMILE formatter no longer linearizes a shadowed `RAMP FROM TO` macro -- `xmile_compat.rs::format_call_ctx` carries a macro-shadowing audit and the `ramp from to` restructuring arm was removed so the call survives as `RAMP_FROM_TO(...)` and resolves through the macro path; (4) the simulation **initials runlist is now deterministic** (`db_dep_graph.rs` sorts the init set before `topo_sort_str`, GH #595) and the dt stock-submodel-output chain-break applies to ALL readers (parity with the legacy `model.rs::module_output_deps` gate); (5) the VDF reader re-binds a **standalone** graphical-function descriptor to its forward-link output OT (`record_results.rs::standalone_descriptor_rebinds`). New `#[cfg(test)] lookup_only_tests.rs`; the C-LEARN `EXPECTED_VDF_RESIDUAL` carve-out (`tests/simulate.rs`) is now an exact, taxonomy-attributed remainder pinned by `clearn_residual_exactness`. simlin-cli now resolves `GET DIRECT *` external data via a `FilesystemDataProvider` -- see its CLAUDE.md.) 
Earlier: Element-level cycle resolution + genuine-Vensim VECTOR ELM MAP/SORT ORDER + `:NA:` sentinel, the work that made C-LEARN compile via the incremental path, run to FINAL TIME, and match genuine Vensim (`Ref.vdf`) within the 1% cross-simulator tolerance on the matched floor. The whole-variable `model_dependency_graph` cycle gate now refines a recurrence SCC to an element-acyclic verdict over a cross-member-comparable symbolic `SymVarRef` element graph (`db_dep_graph.rs`: `resolve_recurrence_sccs`/`refine_scc_to_element_verdict`/`symbolic_phase_element_order`, GH #575); a resolved SCC's per-element symbolic segments are interleaved into one combined fragment along the SCC's `element_order` and injected at `assemble_module` (`db.rs`: `combine_scc_fragment`/`var_phase_symbolic_fragment_prod`). Per-variable lowering moved to a new sibling module `db_var_fragment.rs` (`lower_var_fragment`). `crate::float::NA` is the finite Vensim `:NA:` sentinel (`-2^109`, NOT IEEE NaN); both `:NA:` paths route to it. New top-level VM-adjacent modules `vm_vector_sort_order.rs` (arrayed VECTOR SORT ORDER, per-iterated-slice 0-based ranks, #585) and `vm_vector_elm_map.rs` (base+full-source, OOB→NaN, no modulo). New `Opcode::LookupArray` (per-element arrayed-GF apply → array view, #580); `src/compiler/symbolic.rs` gains cross-fragment GF de-duplication (`GfDedup`, #582) and `TempStrategy { Recycle, Sum }`. Per-element graphical-function tables now lay out by element-name → declared dimension index (`variable.rs::reorder_arrayed_element_tables`, `db.rs::extract_tables_from_source_var`), not `Equation::Arrayed` Vec position. New tests: `db_dep_graph_tests.rs`, `db_combined_fragment_tests.rs`, `per_element_gf_tests.rs`; `tests/simulate.rs` `simulates_clearn` is un-stubbed (`#[ignore]` for runtime only) with a hardened `ensure_vdf_results` comparator + `EXPECTED_VDF_RESIDUAL` carve-out.) Earlier: Vensim macro support, Phases 1-7 complete. 
`:MACRO:`/`` definitions import as macro-marked `datamodel::Model`s (`Model.macro_spec: Option`, persisted through protobuf/JSON/schema); single-output macros inline through `BuiltinVisitor`, multi-output (`:`-list) ones materialize at import. New top-level modules: `module_functions.rs` (the unified `ModuleFunctionDescriptor`/`MacroRegistry` resolver+validator for stdlib functions *and* macros, the shared `is_renamed_*` collision predicates) and `db_macro_registry.rs` (the `project_macro_registry` salsa query + sync-time `macro_registry_build_error`); `SourceProject` gains `macro_registry_build_error`, `SourceModel` gains `macro_spec`; `ErrorCode::DuplicateMacroName`; new `tests/metasd_macros.rs` (gated on `file_io`). LTM arrays hardening, Phases 1-8 complete. Phase 7: #502 per-element graphical-function static link polarity -- when an arrayed source feeds an arrayed graphical-function target, `lookup_table_polarity` folds the per-element `tables` list on `Variable::Var` into one link polarity, falling back to `Unknown` for the multi-dim case; #492 the GF strict-monotonicity check uses a y-range-relative epsilon (`max(EPSILON, range_rel * (y_max - y_min))`) so numeric-import noise no longer flips a monotone lookup table to `Unknown`. Phase 6: #483 analytic STDDEV ceteris-paribus partial -- `generate_nonlinear_partial` builds the unrolled population-variance `sqrt` formula for STDDEV (divisor `N`, matching `vm.rs::Opcode::ArrayStddev`) instead of the delta-ratio stand-in; RANK keeps the delta-ratio (an order statistic, unreachable via real models since it returns an array) with a documented justification, pinned by `test_generate_rank_keeps_delta_ratio`. 
Phase 5: #515 budgeted cross-element-through-aggregate loop recovery -- `recover_cross_agg_loops` drops the old `MAX_AGG_PETALS = 8` hard drop for a deterministic petal priority + a threaded `agg_loop_budget` loop-count budget (`MAX_CROSS_AGG_LOOPS = 256`, `#[cfg(test)]`-overridable via `AggLoopBudgetGuard`; `MAX_AGG_PETALS` survives as a soft per-agg petal cap), surfaces truncation on `LtmVariablesResult.agg_recovery_truncated` + a `Warning`, and enumerates each disjoint petal subset's distinct *cyclic orderings* (`cyclic_orderings(m)` -- (m-1)!/2 for m≥3, mirror reversals skipped, via Heap's algorithm) instead of one ordering per subset. Phase 4 (2026-05-12): #514 sliced-reducer hoisting -- `AggNode.read_slice`, read-slice-driven element graph / link scores, dynamic-index carve-out reclassified as `DynamicIndex`; arrayed synthetic aggs route through agg-half link-score emitters with subscripted agg names and a subscripted Δsource denominator in the diagonal case (strict-prefix broadcast over-subscribes -- GH #528); mapped-dimension sliced reducers stay conservative; a scalar feeder of a hoisted reducer emits a bare element-graph node. Phases 1-3: the `model_ltm_reference_sites` classification IR (`db_ltm_ir.rs`), the consolidated `reducer_kind`/`ReducerKind` table in `ltm_agg.rs`, element-level A2A `Loop::stocks` + per-slot `loop_partitions` (#487), iterated-dimension subscripts ⇒ `Bare` (#511), disjoint-dim arrayed→arrayed per-source-element link scores + the unscoreable-edge `Warning` (#510).)** - **Maintenance note**: Keep this file up to date when adding, removing, or reorganizing modules. ## Compilation pipeline @@ -28,6 +26,16 @@ Equation text flows through these stages in order: - **`src/vm_vector_sort_order.rs`** - Genuine-Vensim VECTOR SORT ORDER. 
Ranks WITHIN each currently-iterated source slice (the innermost/last-declared dim is the sorted axis; outer dims select independent rows), 0-based: result position `j` of a row holds the 0-based source index *within that row* of its `j`-th element in sorted order (`direction == 1` ascending, else descending; stable ties). A 1-D view is the degenerate single-row case (in-row ranks == whole-view ranks). The prior whole-flattened-view absolute-index behavior (GH #585) made a multi-row source feed out-of-range flat indices into a downstream single-column ELM MAP; ground truth is real Vensim DSS `/test/test-models/tests/vector_order/output.tab` (ranks include `0`, impossible for a 1-based permutation). RANK is a distinct, correctly 1-based opcode. - **`src/vm_vector_elm_map.rs`** - Genuine-Vensim VECTOR ELM MAP: result element `i` = `source[base_i + round(offset[i])]` over the source variable's FULL row-major contiguous storage, where `base_i` is the flat position arg-1's element reference establishes and the offset steps the source's innermost dim (stride 1). An offset+base outside `[0, full_source_len)`, or a NaN offset, yields genuine IEEE NaN (the out-of-range result Vensim documents as `:NA:`; this is the absorbing NaN, NOT the finite `crate::float::NA` sentinel). NO modulo / NO wraparound (the bug the prior sliced-view-no-base implementation had). 8. **`src/alloc.rs`** - Allocation helpers for VM priority allocation: `allocate_available()` (bisection-based priority allocation), `alloc_curve()` (per-requester allocation curves for 6 profile types), `normal_cdf()`/`erfc_approx()`. +9. **`src/wasmgen/`** - WebAssembly code-generation backend: an alternative execution path to the bytecode VM (item 7) that lowers the salsa-compiled `CompiledSimulation` to one self-contained wasm module (no host imports), mirroring the VM opcode-for-opcode. Intended for fast repeated re-simulation (e.g. 
interactive parameter scrubbing): a host instantiates the blob once and calls its exported `run` on every change. **The bytecode VM remains the correctness oracle** -- every emitted module is executed under the pure-Rust DLR-FT `wasm-interpreter` in tests and compared against `Vm::run_to_end`. Entry point `compile_simulation(&CompiledSimulation) -> WasmArtifact { wasm: Vec<u8>, layout: WasmLayout }`; the blob exports `memory`, `run`, the geometry globals `n_slots`/`n_chunks`/`results_offset` (step-major results), and `set_value`/`reset`/`clear_values` (constant-override semantics matching the VM, sourced from a mutable const-override region indexed by absolute slot). `WasmLayout` (canonical-name -> slot offset) lets a host read one variable's series by striding the results region. Coverage is the full core-simulation surface: every scalar opcode + builtin (transcendentals open-coded as wasm helpers, so the blob needs no math imports), arrays (subscripts, iteration, reducers, dynamic subscripts with OOB->NaN), graphical-function lookups (scalar + per-element `LookupArray`), the vector ops (`VectorSelect`/`VectorElmMap`/`VectorSortOrder`/`Rank`) and market-clearing allocation (`AllocateAvailable`/`AllocateByPriority`), Euler/RK2/RK4 integration, `PREVIOUS`/`INIT`, and nested modules (one set of initials/flows/stocks functions per `(model, input_set)` instance, addressed by a runtime `module_off`). Out of scope: LTM (VM-only); a true-runtime-range subscript (`ViewRangeDynamic`, GH #612) returns `WasmGenError::Unsupported`; array unrolling is bounded by `MAX_UNROLL_UNITS` (65,536 elements/function), above which a model cleanly returns `Unsupported` and the caller falls back to the VM. Files: + - **`mod.rs`** - the `WasmGenError` error type + module re-exports. 
+ - **`module.rs`** - `compile_simulation`/`compile_datamodel_to_*`: whole-module assembly -- memory layout, the per-instance initials/flows/stocks functions + the `run` driver (Euler/RK2/RK4 loops), the GF/temp/snapshot/const-override regions, the `set_value`/`reset` exports, and `WasmLayout` (de)serialization. + - **`lower.rs`** - the per-opcode emitter (`emit_bytecode` over the un-fused + peephole opcode set), the `HelperFns` registry, and the `EmitState` unroll budget. Its `#[cfg(test)]` tests live in the sibling **`lower_tests.rs`** (split out for the per-file line cap). + - **`views.rs`** - the compile-time `ViewDesc` view-descriptor stack + element-address arithmetic mirroring `RuntimeView::flat_offset`/`offset_for_iter_index`. + - **`math.rs`** - open-coded transcendental wasm helpers (`exp`/`ln`/`sin`/`cos`/`tan`/`atan`/`asin`/`acos`/`log10`/`pow`), each validated against Rust `f64`. + - **`lookup.rs`** - the three GF lookup helpers (`lookup_interp`/`lookup_forward`/`lookup_backward`) reproducing the VM lookup functions. + - **`vector.rs`** - the vector-op emitters (`VectorSelect`/`VectorElmMap`/`VectorSortOrder`/`Rank`/`LookupArray`) + a runtime-loop NaN-as-Equal stable sort. + - **`alloc.rs`** - the allocation emitters (`erfc_approx`/`normal_cdf`/`alloc_curve` + the runtime-loop `allocate_available` bisection) ported bit-faithfully from `crate::alloc`. + The libsimlin FFI `simlin_model_compile_to_wasm` returns the blob + serialized `WasmLayout` (see `src/libsimlin/CLAUDE.md`). Parity tests: the default `tests/simulate.rs` + `tests/simulate_systems.rs` corpora run every VM-simulated model through the wasm backend via an inline hook (an `Unsupported` for a core-simulation model is a hard failure -- AC3.2); the heavy twins (`simulates_clearn_wasm` vs `Ref.vdf`, `simulates_wrld3_03_wasm` vs the VM) are `#[ignore]`d, run via `cargo test --release -- --ignored `. 
## Data model and project structure diff --git a/src/simlin-engine/Cargo.toml b/src/simlin-engine/Cargo.toml index f5109fb0e..c02081eed 100644 --- a/src/simlin-engine/Cargo.toml +++ b/src/simlin-engine/Cargo.toml @@ -43,6 +43,12 @@ xmutil = { version = "1", path = "../xmutil", optional = true } bumpalo = "3" salsa = "0.26" +# WebAssembly code-generation backend (compiles models to wasm as an +# alternative to the bytecode VM). no_std + alloc only, single transitive +# dependency (leb128fmt), and builds cleanly to wasm32-unknown-unknown so it +# is available inside the libsimlin wasm bundle. +wasm-encoder = { version = "0.244", default-features = false } + rand = { version = "0.9", default-features = false, features = ["std_rng"] } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] @@ -60,6 +66,14 @@ ed25519-dalek = "2" ssh-key = "0.6" tempfile = "3" +# Pure-Rust no_std wasm interpreter used as a correctness oracle: every wasm +# module the wasmgen backend produces is executed here and checked against the +# bytecode VM. Git-only (not yet published to crates.io); pinned to a commit. +# The host `Store` API lives in the `checked` workspace member, not the `wasm` +# lib crate. 
+wasm-interpreter = { git = "https://github.com/DLR-FT/wasm-interpreter.git", rev = "64cedbba603edfd64cbb6b5a19f5fa34530bb03a" } +checked = { git = "https://github.com/DLR-FT/wasm-interpreter.git", rev = "64cedbba603edfd64cbb6b5a19f5fa34530bb03a", features = ["linker", "interop"] } + [[test]] name = "simulate" required-features = ["file_io"] diff --git a/src/simlin-engine/src/lib.rs b/src/simlin-engine/src/lib.rs index 2d08e1187..2a3053574 100644 --- a/src/simlin-engine/src/lib.rs +++ b/src/simlin-engine/src/lib.rs @@ -116,6 +116,12 @@ mod vm; mod vm_profile; mod vm_vector_elm_map; mod vm_vector_sort_order; +// WebAssembly code-generation backend: lowers the salsa-compiled +// `CompiledSimulation` bytecode (the same value `Vm::new` consumes) to a +// self-contained wasm module, as an alternative execution path to the bytecode +// VM. Validated in tests by executing the emitted module under a pure-Rust wasm +// interpreter and comparing against the VM. +pub mod wasmgen; pub mod xmile; pub use self::common::{Error, ErrorCode, ErrorKind, Result, canonicalize}; diff --git a/src/simlin-engine/src/test_common.rs b/src/simlin-engine/src/test_common.rs index ddb7fd693..ac3ba3958 100644 --- a/src/simlin-engine/src/test_common.rs +++ b/src/simlin-engine/src/test_common.rs @@ -183,6 +183,28 @@ impl TestProject { self } + /// Add an auxiliary variable backed by a graphical function. The `equation` + /// is the lookup input expression; `gf` is the table the value is looked up + /// in. With a real input expression this lowers to `LOOKUP(self, input)`. 
+ pub fn aux_with_gf( + mut self, + name: &str, + equation: &str, + gf: datamodel::GraphicalFunction, + ) -> Self { + self.variables.push(Variable::Aux(datamodel::Aux { + ident: name.to_string(), + equation: Equation::Scalar(equation.to_string()), + documentation: String::new(), + units: None, + gf: Some(gf), + ai_state: None, + uid: None, + compat: datamodel::Compat::default(), + })); + self + } + /// Add a flow variable pub fn flow(mut self, name: &str, equation: &str, units: Option<&str>) -> Self { self.variables.push(Variable::Flow(datamodel::Flow { diff --git a/src/simlin-engine/src/vm.rs b/src/simlin-engine/src/vm.rs index 5b3b40524..0df6afade 100644 --- a/src/simlin-engine/src/vm.rs +++ b/src/simlin-engine/src/vm.rs @@ -167,6 +167,19 @@ impl CompiledSimulation { pub fn is_constant_offset(&self, off: usize) -> bool { self.cached_constant_info.contains_key(&off) } + + /// The full set of overridable constant offsets (absolute data-buffer + /// offsets), i.e. every offset for which [`is_constant_offset`] is true. + /// These are the offsets with an `AssignConstCurr` in some module's flows + /// phase (see `collect_constant_info`); `set_value`/`set_value_by_offset` + /// accept exactly these. The wasm backend reads this to size and initialize + /// its constants-override region so a blob's `set_value` accepts the same + /// set the VM does. 
+ /// + /// [`is_constant_offset`]: Self::is_constant_offset + pub(crate) fn constant_offsets(&self) -> impl Iterator<Item = usize> + '_ { + self.cached_constant_info.keys().copied() + } } /// One unique compiled module (a distinct `(model_name, input_set)`), holding @@ -204,7 +217,7 @@ struct CompiledSlicedSimulation { } #[cfg_attr(feature = "debug-derive", derive(Debug))] -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, PartialEq, Eq, Hash)] pub(crate) enum StepPart { Initials, Flows, @@ -3052,8 +3065,11 @@ pub(crate) fn pulse(time: f64, dt: f64, volume: f64, first_pulse: f64, interval: 0.0 } +// `pub(crate)` so the wasm backend's lookup-helper tests can compare the +// emitted helpers directly against the VM functions they reproduce +// (`wasmgen::lookup`), the byte-faithful oracle for `vm.rs:3055-3186`. #[inline(never)] -fn lookup(table: &[(f64, f64)], index: f64) -> f64 { +pub(crate) fn lookup(table: &[(f64, f64)], index: f64) -> f64 { if table.is_empty() { return f64::NAN; } @@ -3105,7 +3121,7 @@ fn lookup(table: &[(f64, f64)], index: f64) -> f64 { /// If x is beyond the last point, returns the y-value of the last point. /// This is a "sample and hold" interpolation where we look forward. #[inline(never)] -fn lookup_forward(table: &[(f64, f64)], index: f64) -> f64 { +pub(crate) fn lookup_forward(table: &[(f64, f64)], index: f64) -> f64 { if table.is_empty() { return f64::NAN; } @@ -3147,7 +3163,7 @@ fn lookup_forward(table: &[(f64, f64)], index: f64) -> f64 { /// /// For duplicate x-values, returns the y of the LAST point with that x. 
#[inline(never)] -fn lookup_backward(table: &[(f64, f64)], index: f64) -> f64 { +pub(crate) fn lookup_backward(table: &[(f64, f64)], index: f64) -> f64 { if table.is_empty() { return f64::NAN; } diff --git a/src/simlin-engine/src/wasmgen/alloc.rs b/src/simlin-engine/src/wasmgen/alloc.rs new file mode 100644 index 000000000..6ea5dd673 --- /dev/null +++ b/src/simlin-engine/src/wasmgen/alloc.rs @@ -0,0 +1,1839 @@ +// Copyright 2026 The Simlin Authors. All rights reserved. +// Use of this source code is governed by the Apache License, +// Version 2.0, that can be found in the LICENSE file. + +// pattern: Functional Core +// Pure transformation: each emitter builds the body of one self-contained wasm +// helper function mirroring the matching `crate::alloc` function. No I/O; the +// only side effect is in `#[cfg(test)]` (which lives in `lower_tests.rs` +// alongside the rest of the lowering harness). + +//! Lowering of the bytecode VM's market-clearing allocators +//! (`AllocateAvailable`/`AllocateByPriority`) to WebAssembly (Phase 6). +//! +//! These opcodes route through four self-contained wasm helper functions that +//! port `crate::alloc` *bit-faithfully* -- exact constants, exact Horner +//! evaluation order, exact branch structure, and the exact bisection loop + +//! relative-convergence break -- so the emitted module takes the same numerical +//! path the VM does: +//! +//! - [`emit_erfc_approx`] -- `crate::alloc::erfc_approx` (Abramowitz-Stegun +//! 26.2.17), `call`ing the Phase-2 `exp` helper for the `(-z*z).exp()` factor. +//! - [`emit_normal_cdf`] -- `crate::alloc::normal_cdf` +//! (`0.5 * erfc_approx(-x / SQRT_2)`). +//! - [`emit_alloc_curve`] -- `crate::alloc::alloc_curve` (all six `ptype % 10` +//! curve branches + the `ptype >= 10` floor flag). +//! - [`emit_allocate_available`] -- `crate::alloc::allocate_available` (the +//! `total_demand` short-circuits, the per-type search-range computation, the +//! 
100-iteration bisection, and the final per-requester `alloc_curve`). +//! +//! ## Runtime loop vs unrolled +//! +//! [`emit_allocate_available`] is a **runtime-loop** helper: `n` (the requester +//! count) is a runtime value, so it iterates over scratch-memory arrays +//! (`requests`/`profiles`/`out`) with wasm `loop`/`br_if`, never unrolled. The +//! other three helpers are straight-line numeric kernels. The lowering arm +//! (`super::lower`) gathers the request + profile values from the compile-time +//! view stack into the scratch region (an unrolled per-element copy charged +//! against the unroll budget) before `call`ing this helper. +//! +//! ## Why bit-faithful (rather than "close enough") +//! +//! The allocation curves and the bisection are sensitive: `alloc_curve` selects +//! among six analytic survival functions by an integer `ptype % 10`, and the +//! bisection's `total < avail` comparison decides which half to keep at each of +//! 100 steps. Reproducing the Rust reference's exact arithmetic (including the +//! `(-z) * z` / `(-z).exp()` unary-negation order and the `q.is_infinite()` +//! CES guard) keeps the converged price -- and therefore every per-requester +//! allocation -- identical to the VM up to the leaf `exp`/`pow` helpers' own +//! documented tolerance. + +use wasm_encoder::{BlockType, Function, Instruction as Ins, ValType}; + +use super::WasmGenError; +use super::lower::{ + EmitCtx, SLOT_SIZE, emit_fill_temp_nan, emit_view_element_load, f64_const, memarg, + temp_element_byte_addr, +}; +use super::math::emit_horner; +use super::views::ViewDesc; + +// ── erfc_approx (alloc.rs:8-21) ────────────────────────────────────────────── + +// Abramowitz & Stegun 26.2.17 constants (alloc.rs:12-17). Low-order-first for +// the shared `emit_horner`, whose `acc = acc*t + c` fold reproduces the Rust +// expression `(((((a5*t + a4)*t) + a3)*t + a2)*t + a1)` op-for-op. 
+const A1: f64 = 0.254829592; +const A2: f64 = -0.284496736; +const A3: f64 = 1.421413741; +const A4: f64 = -1.453152027; +const A5: f64 = 1.061405429; +const AS_P: f64 = 0.3275911; + +// `erfc_approx` local layout. Param 0 is `z`; `T` is the reduced argument +// `t = 1/(1 + p*z)`, materialized in a local so `emit_horner` can read it once +// per polynomial term. +const ERFC_Z: u32 = 0; +const ERFC_T: u32 = 1; + +/// Emit `erfc_approx(z: f64) -> f64`, porting `crate::alloc::erfc_approx` +/// (Abramowitz-Stegun 26.2.17) bit-faithfully. +/// +/// For `z < 0` returns `2.0 - erfc_approx(-z)` (the symmetry the Rust reference +/// uses); else `t = 1/(1 + p*z)` and the result is the degree-5 polynomial +/// `(((((a5*t + a4)*t) + a3)*t + a2)*t + a1) * t * (-z*z).exp()`. The polynomial +/// is evaluated by the shared [`emit_horner`] (identical fold order); `(-z) * z` +/// reproduces Rust's unary-negation precedence (`-z * z == (-z) * z`); the +/// `.exp()` is the Phase-2 `exp` helper (`exp_idx`). The `z < 0` symmetry branch +/// is open-coded as `2 - kernel(-z)` (the kernel is the shared non-negative path), +/// so no self-`call` -- and therefore no forward index to itself -- is needed. +pub(crate) fn emit_erfc_approx(exp_idx: u32) -> Function { + // One f64 scratch local (ERFC_T) after the `z` param. + let mut f = Function::new([(1, ValType::F64)]); + emit_erfc_body(&mut f, exp_idx); + f.instruction(&Ins::End); + f +} + +/// The body of `erfc_approx` (no terminating `End`). The `z < 0` symmetry branch +/// is open-coded as `2 - erfc_approx_of(-z)` rather than a self-`call`, so the +/// helper needs no forward index to itself: `erfc_approx_of` shares the +/// non-negative-argument kernel. +fn emit_erfc_body(f: &mut Function, exp_idx: u32) { + // if z < 0 { 2.0 - kernel(-z) } else { kernel(z) }. 
+ f.instruction(&Ins::LocalGet(ERFC_Z)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + // 2.0 - kernel(-z): negate z in place, run the kernel, subtract from 2. + f.instruction(&f64_const(2.0)); + f.instruction(&Ins::LocalGet(ERFC_Z)); + f.instruction(&Ins::F64Neg); + f.instruction(&Ins::LocalSet(ERFC_Z)); + emit_erfc_kernel(f, exp_idx); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::Else); + emit_erfc_kernel(f, exp_idx); + f.instruction(&Ins::End); +} + +/// The non-negative-argument kernel of `erfc_approx`, leaving the f64 result on +/// the stack: `t = 1/(1 + p*z)`, then `poly(t) * t * (-z*z).exp()`. Reads `z` +/// from [`ERFC_Z`] (already non-negative at every call site). +fn emit_erfc_kernel(f: &mut Function, exp_idx: u32) { + // t = 1.0 / (1.0 + p * z) + f.instruction(&f64_const(1.0)); + f.instruction(&f64_const(AS_P)); + f.instruction(&Ins::LocalGet(ERFC_Z)); + f.instruction(&Ins::F64Mul); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::LocalSet(ERFC_T)); + + // poly(t) = (((((a5*t + a4)*t) + a3)*t + a2)*t + a1) -- the shared Horner + // fold matches this op order exactly. + emit_horner(f, ERFC_T, &[A1, A2, A3, A4, A5]); + // * t + f.instruction(&Ins::LocalGet(ERFC_T)); + f.instruction(&Ins::F64Mul); + // * (-z * z).exp(): (-z) then * z (Rust unary-neg precedence), then exp(). + f.instruction(&Ins::LocalGet(ERFC_Z)); + f.instruction(&Ins::F64Neg); + f.instruction(&Ins::LocalGet(ERFC_Z)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::Call(exp_idx)); + f.instruction(&Ins::F64Mul); +} + +// ── normal_cdf (alloc.rs:25-30) ────────────────────────────────────────────── + +const NCDF_X: u32 = 0; + +/// Emit `normal_cdf(x: f64) -> f64`, porting `crate::alloc::normal_cdf`: +/// `if x.is_nan() { NaN } else { 0.5 * erfc_approx(-x / SQRT_2) }`. `erfc_idx` +/// is [`emit_erfc_approx`]'s assigned function index. 
+pub(crate) fn emit_normal_cdf(erfc_idx: u32) -> Function { + let mut f = Function::new([]); + + // NaN guard: x != x -> return NaN. + f.instruction(&Ins::LocalGet(NCDF_X)); + f.instruction(&Ins::LocalGet(NCDF_X)); + f.instruction(&Ins::F64Ne); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(f64::NAN)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // 0.5 * erfc_approx(-x / SQRT_2) + f.instruction(&f64_const(0.5)); + f.instruction(&Ins::LocalGet(NCDF_X)); + f.instruction(&Ins::F64Neg); + f.instruction(&f64_const(std::f64::consts::SQRT_2)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::Call(erfc_idx)); + f.instruction(&Ins::F64Mul); + + f.instruction(&Ins::End); + f +} + +// ── alloc_curve (alloc.rs:40-129) ──────────────────────────────────────────── + +// `alloc_curve` param layout (mirrors the Rust signature order). +const CURVE_P: u32 = 0; +const CURVE_REQUEST: u32 = 1; +const CURVE_PTYPE: u32 = 2; +const CURVE_PPRIORITY: u32 = 3; +const CURVE_PWIDTH: u32 = 4; +const CURVE_PEXTRA: u32 = 5; +// Scratch locals (after the six params). +const CURVE_PT_MOD: u32 = 6; // i32 `ptype % 10` +const CURVE_FRACTION: u32 = 7; // f64 the survival fraction +const CURVE_T: u32 = 8; // f64 the rectangular/triangular interpolation `t` +const CURVE_Z: u32 = 9; // f64 the exponential branch `z` +const CURVE_Q: u32 = 10; // f64 the CES branch `q` + +/// Emit `alloc_curve(p, request, ptype, ppriority, pwidth, pextra) -> f64`, +/// porting `crate::alloc::alloc_curve` bit-faithfully. +/// +/// `request <= 0` returns 0 immediately. Otherwise the survival `fraction` is +/// selected by `ptype % 10` across all six branches (0 fixed, 1 rectangular, +/// 2 triangular, 3 normal via [`normal_cdf`](emit_normal_cdf), 4 exponential +/// via the `exp` helper, 5 CES via the `pow` helper, `_` fixed), then +/// `alloc = request * fraction` is floored when `ptype >= 10`. 
`ptype` is +/// carried as an f64 (the VM stores profile fields as f64 and casts `pt as i32`); +/// `ptype % 10` and the `ptype >= 10` test reproduce that i32 cast via +/// `i32.trunc_sat_f64_s`. `normal_cdf_idx`/`exp_idx`/`pow_idx` are the helpers' +/// assigned function indices. +pub(crate) fn emit_alloc_curve(normal_cdf_idx: u32, exp_idx: u32, pow_idx: u32) -> Function { + // Scratch: one i32 (CURVE_PT_MOD) + four f64 (FRACTION/T/Z/Q). + let mut f = Function::new([(1, ValType::I32), (4, ValType::F64)]); + + // if request <= 0.0 { return 0.0 } + f.instruction(&Ins::LocalGet(CURVE_REQUEST)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Le); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // pt_mod = (ptype as i32) % 10 (truncated remainder, sign of the dividend -- + // wasm `i32.rem_s` matches Rust `%`). + f.instruction(&Ins::LocalGet(CURVE_PTYPE)); + f.instruction(&Ins::I32TruncSatF64S); + f.instruction(&Ins::I32Const(10)); + f.instruction(&Ins::I32RemS); + f.instruction(&Ins::LocalSet(CURVE_PT_MOD)); + + // fraction = match pt_mod { 0|_ => fixed, 1 => rect, 2 => tri, 3 => normal, + // 4 => exp, 5 => ces }. Emitted as an if/else + // chain on pt_mod; each arm leaves the fraction on the stack, stored into + // CURVE_FRACTION below. + emit_curve_fraction(&mut f, normal_cdf_idx, exp_idx, pow_idx); + f.instruction(&Ins::LocalSet(CURVE_FRACTION)); + + // alloc = request * fraction, parked in CURVE_T (free here) so the floor + // branch can read it inside both `if` arms (a wasm `If(Result(F64))` does + // NOT carry the pre-`if` stack value into the block). + f.instruction(&Ins::LocalGet(CURVE_REQUEST)); + f.instruction(&Ins::LocalGet(CURVE_FRACTION)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::LocalSet(CURVE_T)); + + // if ptype >= 10 { alloc.floor() } else { alloc }. 
`ptype >= 10` tests the + // original f64 ptype (Rust `ptype >= 10`, an i32 compare; ptype is + // integer-valued here). + f.instruction(&Ins::LocalGet(CURVE_PTYPE)); + f.instruction(&Ins::I32TruncSatF64S); + f.instruction(&Ins::I32Const(10)); + f.instruction(&Ins::I32GeS); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&Ins::LocalGet(CURVE_T)); + f.instruction(&Ins::F64Floor); + f.instruction(&Ins::Else); + f.instruction(&Ins::LocalGet(CURVE_T)); + f.instruction(&Ins::End); + + f.instruction(&Ins::End); + f +} + +/// Push the survival `fraction` for the `pt_mod` already in [`CURVE_PT_MOD`], +/// dispatching the six `ptype % 10` branches as a nested if/else chain (each arm +/// a `Result(F64)` leaving exactly one f64). The `_` default and branch `0` are +/// the identical "fixed" survival, so the chain falls through to it. +fn emit_curve_fraction(f: &mut Function, normal_cdf_idx: u32, exp_idx: u32, pow_idx: u32) { + // if pt_mod == 1 { rect } else if pt_mod == 2 { tri } else if pt_mod == 3 + // { normal } else if pt_mod == 4 { exp } else if pt_mod == 5 { ces } + // else { fixed }. + emit_pt_eq(f, 1); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + emit_curve_rectangular(f); + f.instruction(&Ins::Else); + + emit_pt_eq(f, 2); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + emit_curve_triangular(f); + f.instruction(&Ins::Else); + + emit_pt_eq(f, 3); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + emit_curve_normal(f, normal_cdf_idx); + f.instruction(&Ins::Else); + + emit_pt_eq(f, 4); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + emit_curve_exponential(f, exp_idx); + f.instruction(&Ins::Else); + + emit_pt_eq(f, 5); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + emit_curve_ces(f, pow_idx); + f.instruction(&Ins::Else); + + // Default (pt_mod == 0 or anything else): the fixed survival. 
+ emit_curve_fixed(f); + + f.instruction(&Ins::End); // 5 + f.instruction(&Ins::End); // 4 + f.instruction(&Ins::End); // 3 + f.instruction(&Ins::End); // 2 + f.instruction(&Ins::End); // 1 +} + +/// Push the i32 condition `pt_mod == n`. +fn emit_pt_eq(f: &mut Function, n: i32) { + f.instruction(&Ins::LocalGet(CURVE_PT_MOD)); + f.instruction(&Ins::I32Const(n)); + f.instruction(&Ins::I32Eq); +} + +/// Branch 0 / `_`: fixed quantity -- `if p <= ppriority { 1.0 } else { 0.0 }`. +fn emit_curve_fixed(f: &mut Function) { + f.instruction(&f64_const(1.0)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::LocalGet(CURVE_P)); + f.instruction(&Ins::LocalGet(CURVE_PPRIORITY)); + f.instruction(&Ins::F64Le); // p <= ppriority + f.instruction(&Ins::Select); // 1.0 if p<=ppriority else 0.0 +} + +/// Branch 1: rectangular survival. `lo = ppriority - pwidth; hi = ppriority + +/// pwidth; if p <= lo { 1 } else if p >= hi { 0 } else { (hi - p)/(hi - lo) }`. +/// `lo`/`hi` are recomputed inline at each use (matching the Rust let-bindings' +/// values; the FP result is identical) to avoid extra scratch locals. +fn emit_curve_rectangular(f: &mut Function) { + // if p <= lo { 1.0 } + f.instruction(&Ins::LocalGet(CURVE_P)); + emit_lo(f); + f.instruction(&Ins::F64Le); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::Else); + // else if p >= hi { 0.0 } else { (hi - p) / (hi - lo) } + f.instruction(&Ins::LocalGet(CURVE_P)); + emit_hi(f); + f.instruction(&Ins::F64Ge); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::Else); + emit_hi_minus_p_over_hi_minus_lo(f); + f.instruction(&Ins::End); + f.instruction(&Ins::End); +} + +/// Branch 2: triangular survival. `lo`/`hi` as in rectangular; `if p <= lo { 1 } +/// else if p >= hi { 0 } else if p <= ppriority { t = (hi-p)/(hi-lo); 1 - +/// 2(1-t)^2 } else { t = (hi-p)/(hi-lo); 2 t^2 }`. 
+fn emit_curve_triangular(f: &mut Function) { + // if p <= lo { 1.0 } + f.instruction(&Ins::LocalGet(CURVE_P)); + emit_lo(f); + f.instruction(&Ins::F64Le); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::Else); + // else if p >= hi { 0.0 } + f.instruction(&Ins::LocalGet(CURVE_P)); + emit_hi(f); + f.instruction(&Ins::F64Ge); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::Else); + // t = (hi - p) / (hi - lo) + emit_hi_minus_p_over_hi_minus_lo(f); + f.instruction(&Ins::LocalSet(CURVE_T)); + // else if p <= ppriority { 1 - 2*(1-t)*(1-t) } else { 2*t*t } + f.instruction(&Ins::LocalGet(CURVE_P)); + f.instruction(&Ins::LocalGet(CURVE_PPRIORITY)); + f.instruction(&Ins::F64Le); // p <= ppriority + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + // 1.0 - 2.0 * (1.0 - t) * (1.0 - t) + f.instruction(&f64_const(1.0)); + f.instruction(&f64_const(2.0)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::LocalGet(CURVE_T)); + f.instruction(&Ins::F64Sub); // (1 - t) + f.instruction(&Ins::F64Mul); // 2 * (1 - t) + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::LocalGet(CURVE_T)); + f.instruction(&Ins::F64Sub); // (1 - t) + f.instruction(&Ins::F64Mul); // 2 * (1 - t) * (1 - t) + f.instruction(&Ins::F64Sub); // 1 - 2*(1-t)*(1-t) + f.instruction(&Ins::Else); + // 2.0 * t * t + f.instruction(&f64_const(2.0)); + f.instruction(&Ins::LocalGet(CURVE_T)); + f.instruction(&Ins::F64Mul); // 2 * t + f.instruction(&Ins::LocalGet(CURVE_T)); + f.instruction(&Ins::F64Mul); // 2 * t * t + f.instruction(&Ins::End); + f.instruction(&Ins::End); + f.instruction(&Ins::End); +} + +/// Branch 3: normal survival. `if pwidth <= 0 { if p <= ppriority { 1 } else +/// { 0 } } else { normal_cdf((ppriority - p) / pwidth) }`. 
+fn emit_curve_normal(f: &mut Function, normal_cdf_idx: u32) {
+    // `normal_cdf_idx` is the wasm function index used as the `call` target
+    // in the non-degenerate arm.
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le); // pwidth <= 0
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    // Degenerate width: reuse the fixed-profile emitter. The arm is typed f64,
+    // so `emit_curve_fixed` is relied on to leave that single value.
+    emit_curve_fixed(f);
+    f.instruction(&Ins::Else);
+    // normal_cdf((ppriority - p) / pwidth)
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::Call(normal_cdf_idx));
+    f.instruction(&Ins::End);
+}
+
+/// Branch 4: symmetric exponential survival. `if pwidth <= 0 { fixed } else
+/// { z = (p - ppriority) / pwidth; if z > 0 { 0.5 * (-z).exp() } else { 1 - 0.5
+/// * z.exp() } }`.
+///
+/// `exp_idx` is the wasm function index that is `call`ed to evaluate `exp`.
+/// Leaves one f64 on the operand stack.
+fn emit_curve_exponential(f: &mut Function, exp_idx: u32) {
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le); // pwidth <= 0
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    // Degenerate width: same fixed-profile fallback as the normal branch.
+    emit_curve_fixed(f);
+    f.instruction(&Ins::Else);
+    // z = (p - ppriority) / pwidth, kept in local CURVE_Z (read by the test
+    // and by whichever arm is taken).
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&Ins::F64Div);
+    f.instruction(&Ins::LocalSet(CURVE_Z));
+    // if z > 0 { 0.5 * (-z).exp() } else { 1.0 - 0.5 * z.exp() }
+    f.instruction(&Ins::LocalGet(CURVE_Z));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Gt);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    // 0.5 * (-z).exp()
+    f.instruction(&f64_const(0.5));
+    f.instruction(&Ins::LocalGet(CURVE_Z));
+    f.instruction(&Ins::F64Neg);
+    f.instruction(&Ins::Call(exp_idx));
+    f.instruction(&Ins::F64Mul);
+    f.instruction(&Ins::Else);
+    // 1.0 - 0.5 * z.exp()
+    f.instruction(&f64_const(1.0));
+    f.instruction(&f64_const(0.5));
+    f.instruction(&Ins::LocalGet(CURVE_Z));
+    f.instruction(&Ins::Call(exp_idx));
+    f.instruction(&Ins::F64Mul); // 0.5 * z.exp()
+    f.instruction(&Ins::F64Sub); // 1 - 0.5 * z.exp()
+    f.instruction(&Ins::End); // closes `if z > 0`
+    f.instruction(&Ins::End); // closes `if pwidth <= 0`
+}
+
+/// Branch 5: constant elasticity of substitution (CES). `if p <= 0 { 1 } else
+/// if ppriority <= 0 { 0 } else { ratio = ppriority / p; q = ratio.powf(pextra);
+/// if q.is_infinite() { 1 } else { q / (1 + q) } }`.
+///
+/// `pow_idx` is the wasm function index that is `call`ed with `(ratio, pextra)`
+/// to evaluate the power. Leaves one f64 on the operand stack.
+fn emit_curve_ces(f: &mut Function, pow_idx: u32) {
+    // if p <= 0 { 1.0 }
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::Else);
+    // else if ppriority <= 0 { 0.0 }
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::F64Le);
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(0.0));
+    f.instruction(&Ins::Else);
+    // q = (ppriority / p).powf(pextra), kept in local CURVE_Q
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::F64Div); // ratio
+    f.instruction(&Ins::LocalGet(CURVE_PEXTRA));
+    f.instruction(&Ins::Call(pow_idx));
+    f.instruction(&Ins::LocalSet(CURVE_Q));
+    // if q.is_infinite() { 1.0 } else { q / (1.0 + q) }
+    // `|q| == inf` stands in for `is_infinite()`. A NaN `q` compares unequal,
+    // so it takes the division arm.
+    f.instruction(&Ins::LocalGet(CURVE_Q));
+    f.instruction(&Ins::F64Abs);
+    f.instruction(&f64_const(f64::INFINITY));
+    f.instruction(&Ins::F64Eq); // |q| == inf (q.is_infinite())
+    f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::Else);
+    // q / (1.0 + q)
+    f.instruction(&Ins::LocalGet(CURVE_Q));
+    f.instruction(&f64_const(1.0));
+    f.instruction(&Ins::LocalGet(CURVE_Q));
+    f.instruction(&Ins::F64Add); // 1 + q
+    f.instruction(&Ins::F64Div); // q / (1 + q)
+    f.instruction(&Ins::End); // closes `if q.is_infinite()`
+    f.instruction(&Ins::End); // closes `if ppriority <= 0`
+    f.instruction(&Ins::End); // closes `if p <= 0`
+}
+
+/// Push `ppriority - pwidth` (the rectangular/triangular `lo`).
+fn emit_lo(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&Ins::F64Sub);
+}
+
+/// Push `ppriority + pwidth` (the rectangular/triangular `hi`).
+fn emit_hi(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(CURVE_PPRIORITY));
+    f.instruction(&Ins::LocalGet(CURVE_PWIDTH));
+    f.instruction(&Ins::F64Add);
+}
+
+/// Push `(hi - p) / (hi - lo)` where `lo = ppriority - pwidth`, `hi = ppriority
+/// + pwidth`. Algebraically `hi - lo == 2*pwidth`, but the Rust reference
+/// computes `(hi - lo)` from the let-bound `hi`/`lo`, so reproduce that exact
+/// subtraction rather than emitting `2*pwidth`.
+fn emit_hi_minus_p_over_hi_minus_lo(f: &mut Function) {
+    // hi - p
+    emit_hi(f);
+    f.instruction(&Ins::LocalGet(CURVE_P));
+    f.instruction(&Ins::F64Sub);
+    // hi - lo (`hi` is re-emitted here: nothing is cached between the two uses)
+    emit_hi(f);
+    emit_lo(f);
+    f.instruction(&Ins::F64Sub);
+    f.instruction(&Ins::F64Div);
+}
+
+// ── allocate_available (alloc.rs:136-199) ────────────────────────────────────
+
+// `allocate_available(requests_ptr: i32, n: i32, profiles_ptr: i32, avail: f64,
+// out_ptr: i32) -> ()` local layout. `requests_ptr`/`profiles_ptr`/`out_ptr` are
+// byte addresses into the scratch region; `profiles` is 4 f64/requester laid out
+// `(ptype, ppriority, pwidth, pextra)`.
+// Parameters occupy local indices 0..=4, in signature order.
+const ALLOC_REQ_PTR: u32 = 0; // i32 requests_ptr
+const ALLOC_N: u32 = 1; // i32 n
+const ALLOC_PROF_PTR: u32 = 2; // i32 profiles_ptr
+const ALLOC_AVAIL: u32 = 3; // f64 avail
+const ALLOC_OUT_PTR: u32 = 4; // i32 out_ptr
+// Scratch locals (after the five params).
+// These indices must stay in step with the local declaration order in
+// `emit_allocate_available`'s `Function::new` call: i32 (5), seven f64
+// (6..=12), i32 (13), four f64 (14..=17), i32 (18), f64 (19).
+const ALLOC_I: u32 = 5; // i32 loop index
+const ALLOC_TOTAL_DEMAND: u32 = 6; // f64 Σ requests where r > 0
+const ALLOC_R: u32 = 7; // f64 a request value
+const ALLOC_P_MIN: u32 = 8; // f64 search-range lower bound
+const ALLOC_P_MAX: u32 = 9; // f64 search-range upper bound
+const ALLOC_SPREAD: u32 = 10; // f64 per-profile spread
+const ALLOC_PPRIORITY: u32 = 11; // f64 a profile's ppriority
+const ALLOC_PWIDTH: u32 = 12; // f64 a profile's pwidth
+const ALLOC_PT_MOD: u32 = 13; // i32 a profile's ptype % 10
+const ALLOC_LO: u32 = 14; // f64 bisection low
+const ALLOC_HI: u32 = 15; // f64 bisection high
+const ALLOC_MID: u32 = 16; // f64 bisection midpoint
+const ALLOC_TOTAL: u32 = 17; // f64 Σ alloc_curve(mid, ...)
+const ALLOC_ITER: u32 = 18; // i32 bisection iteration counter
+const ALLOC_PSTAR: u32 = 19; // f64 the converged price
+
+// Bytes per profile tuple (4 f64 = 4 * 8) and per request/out slot (1 f64).
+const PROFILE_BYTES: i32 = 32;
+const SLOT_BYTES: i32 = 8;
+
+/// Emit `allocate_available(requests_ptr, n, profiles_ptr, avail, out_ptr)`,
+/// porting `crate::alloc::allocate_available` bit-faithfully over scratch-memory
+/// arrays.
+///
+/// The three short-circuits (`n == 0` -> nothing written; `avail >=
+/// total_demand` -> each requester gets `r.max(0)`; `avail <= 0` -> zeros)
+/// mirror the Rust early returns. Otherwise the per-type search range
+/// `[p_min, p_max]` is computed from the profiles' `spread`, then a 100-iteration
+/// bisection finds the market-clearing price (the `total < avail` -> `hi = mid`
+/// step and the `|hi - lo| < 1e-14 * (1 + |hi|)` relative-convergence break),
+/// and `out[i] = alloc_curve(p_star, requests[i], ...)` is written for every
+/// requester. A runtime loop (never unrolled): `n` is a runtime value.
+/// `alloc_curve_idx` is [`emit_alloc_curve`]'s assigned function index.
+pub(crate) fn emit_allocate_available(alloc_curve_idx: u32) -> Function { + // Scratch: i32 (I), f64 (TOTAL_DEMAND, R, P_MIN, P_MAX, SPREAD, PPRIORITY, + // PWIDTH), i32 (PT_MOD), f64 (LO, HI, MID, TOTAL), i32 (ITER), f64 (PSTAR). + // Declaration order fixes the indices ALLOC_I..ALLOC_PSTAR. + let mut f = Function::new([ + (1, ValType::I32), + (7, ValType::F64), + (1, ValType::I32), + (4, ValType::F64), + (1, ValType::I32), + (1, ValType::F64), + ]); + + // if n == 0 { return } (the Rust `if n == 0 { return vec![] }`). + f.instruction(&Ins::LocalGet(ALLOC_N)); + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // total_demand = Σ requests[i] where requests[i] > 0.0. + emit_total_demand(&mut f); + + // if avail >= total_demand { out[i] = requests[i].max(0.0); return } + f.instruction(&Ins::LocalGet(ALLOC_AVAIL)); + f.instruction(&Ins::LocalGet(ALLOC_TOTAL_DEMAND)); + f.instruction(&Ins::F64Ge); + f.instruction(&Ins::If(BlockType::Empty)); + emit_full_grant(&mut f); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // if avail <= 0.0 { out[i] = 0.0; return } + f.instruction(&Ins::LocalGet(ALLOC_AVAIL)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Le); + f.instruction(&Ins::If(BlockType::Empty)); + emit_zero_out(&mut f); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // Compute the search range [p_min, p_max] from the profiles. + emit_search_range(&mut f); + + // 100-iteration bisection for the market-clearing price. + emit_bisection(&mut f, alloc_curve_idx); + + // p_star = (lo + hi) / 2.0; out[i] = alloc_curve(p_star, requests[i], ...). 
+ f.instruction(&Ins::LocalGet(ALLOC_LO)); + f.instruction(&Ins::LocalGet(ALLOC_HI)); + f.instruction(&Ins::F64Add); + f.instruction(&f64_const(2.0)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::LocalSet(ALLOC_PSTAR)); + emit_final_allocations(&mut f, alloc_curve_idx); + + f.instruction(&Ins::End); + f +} + +/// `total_demand = Σ requests[i] where requests[i] > 0.0` into +/// [`ALLOC_TOTAL_DEMAND`]. A runtime `for i in 0..n` loop. +fn emit_total_demand(f: &mut Function) { + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::LocalSet(ALLOC_TOTAL_DEMAND)); + emit_for_n(f, |f| { + // r = requests[i] + emit_load_request(f); + f.instruction(&Ins::LocalSet(ALLOC_R)); + // if r > 0.0 { total_demand += r } + f.instruction(&Ins::LocalGet(ALLOC_R)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Gt); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&Ins::LocalGet(ALLOC_TOTAL_DEMAND)); + f.instruction(&Ins::LocalGet(ALLOC_R)); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::LocalSet(ALLOC_TOTAL_DEMAND)); + f.instruction(&Ins::End); + }); +} + +/// The `avail >= total_demand` arm: `out[i] = requests[i].max(0.0)` for every +/// requester. `f64::max` is NaN-ignoring; reproduce it with the compare-select +/// form (`r > 0 ? r : 0` is `r.max(0.0)` for a non-NaN `r`, and a NaN request +/// would be ignored by `f64::max` -- but the Rust path stores `r.max(0.0)` which +/// is `0.0` for a NaN `r`, matched here since `NaN > 0.0` is false). +fn emit_full_grant(f: &mut Function) { + emit_for_n(f, |f| { + // out[i] = max(requests[i], 0.0) + emit_out_addr(f); + // value = r > 0.0 ? r : 0.0 (== f64::max(r, 0.0) for non-NaN; for NaN r + // this yields 0.0, matching Rust `NaN.max(0.0) == 0.0`). 
+ emit_load_request(f); + f.instruction(&Ins::LocalSet(ALLOC_R)); + f.instruction(&Ins::LocalGet(ALLOC_R)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::LocalGet(ALLOC_R)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Gt); // r > 0.0 + f.instruction(&Ins::Select); // r if r>0 else 0.0 + f.instruction(&Ins::F64Store(f64_memarg())); + }); +} + +/// The `avail <= 0.0` arm: `out[i] = 0.0` for every requester. +fn emit_zero_out(f: &mut Function) { + emit_for_n(f, |f| { + emit_out_addr(f); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Store(f64_memarg())); + }); +} + +/// Compute `[p_min, p_max]` from the profiles (alloc.rs:154-169): `p_min = +/// INFINITY`, `p_max = NEG_INFINITY`; for each profile `spread = match ptype % 10 +/// { 0 => 1, 1|2 => pwidth, 3 => pwidth*6, 4 => pwidth*10, 5 => ppriority*10, +/// _ => 1 }`, then `p_min = min(p_min, ppriority - spread)`, `p_max = +/// max(p_max, ppriority + spread)`. `f64::min`/`f64::max` are NaN-ignoring; +/// realistic profiles never carry NaN, and the reference uses them, so the +/// NaN-ignoring compare-select form is reproduced for fidelity. +fn emit_search_range(f: &mut Function) { + f.instruction(&f64_const(f64::INFINITY)); + f.instruction(&Ins::LocalSet(ALLOC_P_MIN)); + f.instruction(&f64_const(f64::NEG_INFINITY)); + f.instruction(&Ins::LocalSet(ALLOC_P_MAX)); + + emit_for_n(f, |f| { + // ppriority = profiles[i].1; pwidth = profiles[i].2; pt_mod = + // (profiles[i].0 as i32) % 10. + emit_load_profile_field(f, 1); + f.instruction(&Ins::LocalSet(ALLOC_PPRIORITY)); + emit_load_profile_field(f, 2); + f.instruction(&Ins::LocalSet(ALLOC_PWIDTH)); + emit_load_profile_field(f, 0); + f.instruction(&Ins::I32TruncSatF64S); + f.instruction(&Ins::I32Const(10)); + f.instruction(&Ins::I32RemS); + f.instruction(&Ins::LocalSet(ALLOC_PT_MOD)); + + // spread = match pt_mod { 1|2 => pwidth, 3 => pwidth*6, 4 => pwidth*10, + // 5 => ppriority*10, 0|_ => 1.0 }. 
+ emit_spread(f); + f.instruction(&Ins::LocalSet(ALLOC_SPREAD)); + + // p_min = f64::min(p_min, ppriority - spread) + f.instruction(&Ins::LocalGet(ALLOC_P_MIN)); + f.instruction(&Ins::LocalGet(ALLOC_PPRIORITY)); + f.instruction(&Ins::LocalGet(ALLOC_SPREAD)); + f.instruction(&Ins::F64Sub); + emit_f64_min(f); + f.instruction(&Ins::LocalSet(ALLOC_P_MIN)); + + // p_max = f64::max(p_max, ppriority + spread) + f.instruction(&Ins::LocalGet(ALLOC_P_MAX)); + f.instruction(&Ins::LocalGet(ALLOC_PPRIORITY)); + f.instruction(&Ins::LocalGet(ALLOC_SPREAD)); + f.instruction(&Ins::F64Add); + emit_f64_max(f); + f.instruction(&Ins::LocalSet(ALLOC_P_MAX)); + }); +} + +/// Push the per-profile `spread` for the `pt_mod` in [`ALLOC_PT_MOD`] (uses +/// [`ALLOC_PWIDTH`]/[`ALLOC_PPRIORITY`]): 1 (0/_), pwidth (1/2), pwidth*6 (3), +/// pwidth*10 (4), ppriority*10 (5). Emitted as a nested if/else chain. +fn emit_spread(f: &mut Function) { + // pt_mod == 1 || pt_mod == 2 -> pwidth + f.instruction(&Ins::LocalGet(ALLOC_PT_MOD)); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Eq); + f.instruction(&Ins::LocalGet(ALLOC_PT_MOD)); + f.instruction(&Ins::I32Const(2)); + f.instruction(&Ins::I32Eq); + f.instruction(&Ins::I32Or); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&Ins::LocalGet(ALLOC_PWIDTH)); + f.instruction(&Ins::Else); + + // pt_mod == 3 -> pwidth * 6.0 + f.instruction(&Ins::LocalGet(ALLOC_PT_MOD)); + f.instruction(&Ins::I32Const(3)); + f.instruction(&Ins::I32Eq); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&Ins::LocalGet(ALLOC_PWIDTH)); + f.instruction(&f64_const(6.0)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::Else); + + // pt_mod == 4 -> pwidth * 10.0 + f.instruction(&Ins::LocalGet(ALLOC_PT_MOD)); + f.instruction(&Ins::I32Const(4)); + f.instruction(&Ins::I32Eq); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&Ins::LocalGet(ALLOC_PWIDTH)); + f.instruction(&f64_const(10.0)); + 
f.instruction(&Ins::F64Mul); + f.instruction(&Ins::Else); + + // pt_mod == 5 -> ppriority * 10.0 + f.instruction(&Ins::LocalGet(ALLOC_PT_MOD)); + f.instruction(&Ins::I32Const(5)); + f.instruction(&Ins::I32Eq); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&Ins::LocalGet(ALLOC_PPRIORITY)); + f.instruction(&f64_const(10.0)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::Else); + + // default (pt_mod == 0 or anything else) -> 1.0 + f.instruction(&f64_const(1.0)); + + f.instruction(&Ins::End); // 5 + f.instruction(&Ins::End); // 4 + f.instruction(&Ins::End); // 3 + f.instruction(&Ins::End); // 1|2 +} + +/// The 100-iteration bisection (alloc.rs:171-190): `lo = p_min; hi = p_max; for +/// _ in 0..100 { mid = (lo+hi)/2; total = Σ alloc_curve(mid, ...); if total < +/// avail { hi = mid } else { lo = mid }; if |hi-lo| < 1e-14*(1+|hi|) { break } }`. +fn emit_bisection(f: &mut Function, alloc_curve_idx: u32) { + // lo = p_min; hi = p_max; iter = 0 + f.instruction(&Ins::LocalGet(ALLOC_P_MIN)); + f.instruction(&Ins::LocalSet(ALLOC_LO)); + f.instruction(&Ins::LocalGet(ALLOC_P_MAX)); + f.instruction(&Ins::LocalSet(ALLOC_HI)); + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::LocalSet(ALLOC_ITER)); + + f.instruction(&Ins::Block(BlockType::Empty)); // $bisect_exit + f.instruction(&Ins::Loop(BlockType::Empty)); // $bisect + + // while-head: if !(iter < 100) break $bisect_exit (br depth 1). + f.instruction(&Ins::LocalGet(ALLOC_ITER)); + f.instruction(&Ins::I32Const(100)); + f.instruction(&Ins::I32LtS); + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::BrIf(1)); + + // mid = (lo + hi) / 2.0 + f.instruction(&Ins::LocalGet(ALLOC_LO)); + f.instruction(&Ins::LocalGet(ALLOC_HI)); + f.instruction(&Ins::F64Add); + f.instruction(&f64_const(2.0)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::LocalSet(ALLOC_MID)); + + // total = Σ_{i} b) ? 
a : b -> r + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::LocalGet(b)); + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::LocalGet(b)); + if want_min { + f.instruction(&Ins::F64Lt); + } else { + f.instruction(&Ins::F64Gt); + } + f.instruction(&Ins::Select); + f.instruction(&Ins::LocalSet(r)); + + // r = (b is NaN) ? a : r + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::LocalGet(r)); + f.instruction(&Ins::LocalGet(b)); + f.instruction(&Ins::LocalGet(b)); + f.instruction(&Ins::F64Ne); // b != b + f.instruction(&Ins::Select); + f.instruction(&Ins::LocalSet(r)); + + // result = (a is NaN) ? b : r + f.instruction(&Ins::LocalGet(b)); + f.instruction(&Ins::LocalGet(r)); + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::F64Ne); // a != a + f.instruction(&Ins::Select); +} + +/// An 8-byte (f64) memory access at offset 0, naturally aligned (the scratch +/// region is 8-byte aligned). +fn f64_memarg() -> wasm_encoder::MemArg { + f64_memarg_off(0) +} + +/// An 8-byte (f64) memory access at a static byte `offset`. +fn f64_memarg_off(offset: u64) -> wasm_encoder::MemArg { + wasm_encoder::MemArg { + offset, + align: 3, // log2(8): an 8-byte f64 access + memory_index: 0, + } +} + +// ── opcode lowering arms (vm.rs:2631-2794) ─────────────────────────────────── + +/// Lower `AllocateAvailable { write_temp_id }`, mirroring `vm.rs:2631-2721`. The +/// views are `profile_view = top`, `requests_view = top-1`; `avail` is the f64 +/// on top of the wasm operand stack (the VM pops it). Gathers the `n = +/// requests_view.size()` request values + the per-requester profile tuples into +/// the allocation scratch region, `call`s the [`emit_allocate_available`] helper, +/// then copies the `n` results into temp `write_temp_id`. An invalid input view +/// fills the whole destination temp region with NaN. 
+/// +/// `pp_cols` reproduces the VM's `if !pp_values.is_empty() && n>0 && +/// pp_size%n==0 { pp_size/n } else { 4 }`, and each profile field +/// `(ptype, ppriority, pwidth, pextra)` is read from `pp_values[i*pp_cols + j]` +/// with the VM's defaults `(0.0, 0.0, 1.0, 0.0)` when the index is out of range +/// -- all resolved at compile time (the view sizes and indices are static). +pub(crate) fn emit_allocate_available_op( + requests_view: &ViewDesc, + profile_view: &ViewDesc, + write_temp_id: u8, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + // A dynamically-subscripted view is fine: the per-element gather routes + // through `emit_view_element_load`, which folds the view's runtime offset + // addend and per-element validity guard, and the op-level gate below takes + // the VM's whole-op `!is_valid -> fill_temp_nan` short-circuit. + + // Pop `avail` (top) into the scratch f64 before the gate, so both gate arms + // are operand-balanced. + let avail = ctx.scratch_local; + f.instruction(&Ins::LocalSet(avail)); + + let n = requests_view.size(); + let pp_size = profile_view.size(); + // pp_cols: pp_size/n when the flattened profile array divides evenly into n + // requesters, else 4 (vm.rs:2680-2685). + let pp_cols = if pp_size > 0 && n > 0 && pp_size.is_multiple_of(n) { + pp_size / n + } else { + 4 + }; + + emit_with_validity_gate( + &[requests_view, profile_view], + write_temp_id, + ctx, + f, + |ctx, f| { + // Gather requests[i] -> scratch req region. + let (req_base, prof_base, out_base) = alloc_scratch_layout(ctx, n); + for i in 0..n { + f.instruction(&Ins::I32Const(0)); + emit_view_element_load(requests_view, i, ctx, f)?; + f.instruction(&Ins::F64Store(memarg( + req_base + (i as u64) * u64::from(SLOT_SIZE), + ))); + } + + // Build per-requester profile tuples (ptype, ppriority, pwidth, pextra) + // from pp_values[i*pp_cols + j], defaulting (0,0,1,0) out of range. 
+ const DEFAULTS: [f64; 4] = [0.0, 0.0, 1.0, 0.0]; + for i in 0..n { + for (j, &default) in DEFAULTS.iter().enumerate() { + let prof_addr = + prof_base + (i as u64) * (PROFILE_BYTES as u64) + (j as u64) * 8; + f.instruction(&Ins::I32Const(0)); + let flat = i * pp_cols + j; + if flat < pp_size { + emit_view_element_load(profile_view, flat, ctx, f)?; + } else { + f.instruction(&f64_const(default)); + } + f.instruction(&Ins::F64Store(memarg(prof_addr))); + } + } + + // allocate_available(req_base, n, prof_base, avail, out_base) + f.instruction(&Ins::I32Const(req_base as i32)); + f.instruction(&Ins::I32Const(n as i32)); + f.instruction(&Ins::I32Const(prof_base as i32)); + f.instruction(&Ins::LocalGet(avail)); + f.instruction(&Ins::I32Const(out_base as i32)); + f.instruction(&Ins::Call(ctx.helpers.allocate_available)); + + // Copy out[i] -> temp[write_temp_id][i]. + emit_copy_out_to_temp(out_base, n, write_temp_id, ctx, f) + }, + ) +} + +/// Lower `AllocateByPriority { write_temp_id }`, mirroring `vm.rs:2723-2794`. The +/// views are `priority_view = top`, `requests_view = top-1`; the operand stack +/// holds `supply` on top and `width` beneath (the VM pops `supply` then +/// `width`). Gathers requests, synthesizes rectangular profiles `(1.0, +/// priorities[i] or 0.0, width, 0.0)`, `call`s [`emit_allocate_available`] with +/// `supply` as the available amount, then copies results into the temp. +pub(crate) fn emit_allocate_by_priority_op( + requests_view: &ViewDesc, + priority_view: &ViewDesc, + write_temp_id: u8, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + // A dynamically-subscripted view is handled by `emit_view_element_load` + // (runtime offset + per-element validity) and the op-level gate below; see + // `emit_allocate_available_op`. + + // Pop `supply` (top) then `width` into scratch f64s, before the gate. 
+ let supply = ctx.scratch_local; + let width = ctx.vector_f64_locals[0]; + f.instruction(&Ins::LocalSet(supply)); + f.instruction(&Ins::LocalSet(width)); + + let n = requests_view.size(); + let pri_size = priority_view.size(); + + emit_with_validity_gate( + &[requests_view, priority_view], + write_temp_id, + ctx, + f, + |ctx, f| { + let (req_base, prof_base, out_base) = alloc_scratch_layout(ctx, n); + // Gather requests[i]. + for i in 0..n { + f.instruction(&Ins::I32Const(0)); + emit_view_element_load(requests_view, i, ctx, f)?; + f.instruction(&Ins::F64Store(memarg( + req_base + (i as u64) * u64::from(SLOT_SIZE), + ))); + } + + // Rectangular profiles: (ptype=1, ppriority=priorities[i] or 0, pwidth= + // width, pextra=0). Fields 0/3 are the constants 1.0/0.0; field 1 is the + // priority view element (default 0.0 out of range); field 2 is the + // runtime `width` local. + for i in 0..n { + let base = prof_base + (i as u64) * (PROFILE_BYTES as u64); + // ptype = 1.0 + f.instruction(&Ins::I32Const(0)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Store(memarg(base))); + // ppriority = priorities[i] or 0.0 + f.instruction(&Ins::I32Const(0)); + if i < pri_size { + emit_view_element_load(priority_view, i, ctx, f)?; + } else { + f.instruction(&f64_const(0.0)); + } + f.instruction(&Ins::F64Store(memarg(base + 8))); + // pwidth = width (runtime) + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::LocalGet(width)); + f.instruction(&Ins::F64Store(memarg(base + 16))); + // pextra = 0.0 + f.instruction(&Ins::I32Const(0)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Store(memarg(base + 24))); + } + + // allocate_available(req_base, n, prof_base, supply, out_base) + f.instruction(&Ins::I32Const(req_base as i32)); + f.instruction(&Ins::I32Const(n as i32)); + f.instruction(&Ins::I32Const(prof_base as i32)); + f.instruction(&Ins::LocalGet(supply)); + f.instruction(&Ins::I32Const(out_base as i32)); + 
f.instruction(&Ins::Call(ctx.helpers.allocate_available)); + + emit_copy_out_to_temp(out_base, n, write_temp_id, ctx, f) + }, + ) +} + +/// The three consecutive scratch sub-region byte bases for an allocation of `n` +/// requesters: `requests` (n f64) at `alloc_scratch_base`, `profiles` (4n f64) +/// after it, `out` (n f64) after that. All three are live across the +/// `allocate_available` call; `module.rs` sizes the region for the largest `n`. +fn alloc_scratch_layout(ctx: &EmitCtx, n: usize) -> (u64, u64, u64) { + let base = u64::from(ctx.alloc_scratch_base); + let req_base = base; + let prof_base = req_base + (n as u64) * u64::from(SLOT_SIZE); + let out_base = prof_base + (n as u64) * (PROFILE_BYTES as u64); + (req_base, prof_base, out_base) +} + +/// Copy the `n` allocations the helper wrote at `out_base` into temp +/// `write_temp_id` (`temp[temp_off + i] = out[i]`). Unrolled over `n`. +fn emit_copy_out_to_temp( + out_base: u64, + n: usize, + write_temp_id: u8, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + for i in 0..n { + let temp_addr = temp_element_byte_addr(ctx, write_temp_id, i as u32)?; + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::F64Load(memarg( + out_base + (i as u64) * u64::from(SLOT_SIZE), + ))); + f.instruction(&Ins::F64Store(memarg(temp_addr))); + } + Ok(()) +} + +/// Emit `body` gated on the VM's "`!is_valid` -> fill_temp_nan" short-circuit +/// for the allocation arms. When no input view carries a runtime validity flag +/// (the common static/temp/full-var case), `body` is emitted directly with no +/// runtime check; otherwise `if all_valid { body } else { fill_temp_nan }`. +/// Mirrors `super::vector::emit_with_validity_gate`. 
+fn emit_with_validity_gate( + views: &[&ViewDesc], + write_temp_id: u8, + ctx: &EmitCtx, + f: &mut Function, + body: impl FnOnce(&EmitCtx, &mut Function) -> Result<(), WasmGenError>, +) -> Result<(), WasmGenError> { + let valids: Vec = views.iter().filter_map(|v| v.valid_local).collect(); + if valids.is_empty() { + return body(ctx, f); + } + f.instruction(&Ins::LocalGet(valids[0])); + for &v in &valids[1..] { + f.instruction(&Ins::LocalGet(v)); + f.instruction(&Ins::I32And); + } + f.instruction(&Ins::If(BlockType::Empty)); + body(ctx, f)?; + f.instruction(&Ins::Else); + emit_fill_temp_nan(ctx, write_temp_id, f)?; + f.instruction(&Ins::End); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::super::lower::build_helpers; + use super::f64_memarg_off; + use checked::Store; + use wasm::validate; + use wasm_encoder::{ + CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction as Ins, + MemorySection, MemoryType, Module, TypeSection, ValType, + }; + + // The allocation helpers are bit-faithful ports of `crate::alloc`. Their + // leaf transcendental helpers (`exp`/`pow`) are NOT bit-identical to the + // VM's libm -- they are the open-coded approximations of Phase 2, pinned in + // `super::super::math` to abs 0.0 / rel ~1e-12 vs `f64`. So the alloc helpers + // can only match the Rust `crate::alloc` reference (which uses libm) to that + // leaf tolerance, propagated through the curves and the bisection. + // + // Documented tolerances (all far inside the corpus bar of abs 2e-3 / + // rel 5e-6): + // - erfc_approx / normal_cdf: abs 1e-12 OR rel 1e-12. erfc's only + // transcendental is one `exp` call (rel ~1e-12); the polynomial is exact + // arithmetic, so the wasm result tracks `crate::alloc::erfc_approx` to the + // exp helper's tolerance. + // - alloc_curve: abs 1e-9 OR rel 1e-9 across all six branches. Most use at + // most one exp/normal_cdf (rel ~1e-12); CES adds a `pow = exp(y*ln x)` + // (pinned at rel ~2.3e-12). 
The uniform 1e-9 bar leaves ample slack for + // the leaf approximations + DLR-FT-vs-native rounding drift. + // - allocate_available: abs 1e-9 OR rel 1e-9 -- the converged price rides on + // the curve tolerance, and the per-requester allocation is one more curve + // evaluation at that price. + const ERFC_ABS: f64 = 1e-12; + const ERFC_REL: f64 = 1e-12; + const CURVE_ABS: f64 = 1e-9; + const CURVE_REL: f64 = 1e-9; + const ALLOC_ABS: f64 = 1e-9; + const ALLOC_REL: f64 = 1e-9; + + /// Assert `got` matches `want` within absolute *or* relative tolerance, + /// propagating NaN/inf. Mirrors `super::super::math`'s `assert_close`. + fn assert_close(name: &str, got: f64, want: f64, abs_tol: f64, rel_tol: f64) { + if want.is_nan() { + assert!(got.is_nan(), "{name}: expected NaN, got {got}"); + return; + } + assert!(!got.is_nan(), "{name}: got NaN, expected {want}"); + if want.is_infinite() { + assert_eq!(got, want, "{name}: expected {want}, got {got}"); + return; + } + let abs = (got - want).abs(); + let rel = if want != 0.0 { abs / want.abs() } else { abs }; + assert!( + abs <= abs_tol || rel <= rel_tol, + "{name}: got {got}, want {want} (abs {abs:.3e}, rel {rel:.3e})" + ); + } + + /// A linear sample of `n+1` points across `[lo, hi]` inclusive. + fn linspace(lo: f64, hi: f64, n: usize) -> Vec { + (0..=n) + .map(|i| lo + (hi - lo) * (i as f64) / (n as f64)) + .collect() + } + + /// Which value-producing alloc helper a test module exports as `f`. + /// + /// The DLR-FT interop only types tuples up to arity 3, so the unary helpers + /// (`Erfc`/`NormalCdf`) export `f(x: f64) -> f64` directly, while the + /// six-argument `AllocCurve` exports `f(args_ptr: i32) -> f64` and reads its + /// six f64 arguments from `mem[args_ptr + k*8]`. 
+ #[derive(Clone, Copy)] + enum Which { + Erfc, + NormalCdf, + AllocCurve, + } + + fn helper_index(which: Which) -> u32 { + let h = build_helpers().fns; + match which { + Which::Erfc => h.erfc_approx, + Which::NormalCdf => h.normal_cdf, + Which::AllocCurve => h.alloc_curve, + } + } + + /// Build a module with every helper body plus a thin exported `f` forwarding + /// to the helper under test, and a memory (the GF lookup helpers, also + /// bundled, `f64.load` from memory 0). For a unary helper `f(x: f64) -> f64` + /// calls directly; for `AllocCurve` (six args) `f(args_ptr: i32) -> f64` + /// loads the six args from `mem[args_ptr + k*8]` and calls the helper. + /// Mirrors `super::super::math`'s `build_helper_module` layout (helpers at + /// `0..N`, wrapper at `N`). + fn build_value_module(which: Which) -> Vec { + let helpers = build_helpers(); + let n_helpers = helpers.functions.len() as u32; + let target = helper_index(which); + let is_curve = matches!(which, Which::AllocCurve); + + let mut module = Module::new(); + + let mut types = TypeSection::new(); + if is_curve { + types.ty().function([ValType::I32], [ValType::F64]); + } else { + types.ty().function([ValType::F64], [ValType::F64]); + } + for hf in &helpers.functions { + types.ty().function(hf.params.clone(), hf.results.clone()); + } + module.section(&types); + + let mut functions = FunctionSection::new(); + for (i, _) in helpers.functions.iter().enumerate() { + functions.function(1 + i as u32); + } + functions.function(0); + module.section(&functions); + + let mut memories = MemorySection::new(); + memories.memory(MemoryType { + minimum: 1, + maximum: None, + memory64: false, + shared: false, + page_size_log2: None, + }); + module.section(&memories); + + let mut exports = ExportSection::new(); + exports.export("f", ExportKind::Func, n_helpers); + exports.export("mem", ExportKind::Memory, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + for hf in &helpers.functions { + 
code.function(&hf.body); + } + let mut wrapper = Function::new([]); + if is_curve { + // Load the six f64 args from mem[args_ptr + k*8] (args_ptr is param 0). + for k in 0..6u64 { + wrapper.instruction(&Ins::LocalGet(0)); + wrapper.instruction(&Ins::F64Load(f64_memarg_off(k * 8))); + } + } else { + wrapper.instruction(&Ins::LocalGet(0)); + } + wrapper.instruction(&Ins::Call(target)); + wrapper.instruction(&Ins::End); + code.function(&wrapper); + module.section(&code); + + module.finish() + } + + fn run_unary(which: Which, x: f64) -> f64 { + let bytes = build_value_module(which); + let info = validate(&bytes).expect("helper module must validate"); + let mut store = Store::new(()); + let module = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let f = store + .instance_export(module, "f") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(f64,), f64>(f, (x,)) + .expect("invoke") + } + + /// Byte address the `AllocCurve` wrapper reads its six f64 args from. + const CURVE_ARGS_BASE: u32 = 512; + + /// Run `alloc_curve(p, request, ptype, ppriority, pwidth, pextra)` under the + /// interpreter. The six args are seeded into memory at [`CURVE_ARGS_BASE`] + /// (`ptype` as an integer-valued f64) and the wrapper reads them back. 
+    fn run_alloc_curve(p: f64, request: f64, ptype: i32, pp: f64, pw: f64, pe: f64) -> f64 {
+        let bytes = build_value_module(Which::AllocCurve);
+        let info = validate(&bytes).expect("alloc_curve module must validate");
+        let mut store = Store::new(());
+        let module = store
+            .module_instantiate(&info, Vec::new(), None)
+            .expect("instantiate")
+            .module_addr;
+        // `ptype` crosses the boundary as an integer-valued f64, so all six
+        // arguments share one 8-byte slot width.
+        let args = [p, request, ptype as f64, pp, pw, pe];
+        let mem = store
+            .instance_export(module, "mem")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        // Seed the six little-endian f64 args contiguously at CURVE_ARGS_BASE.
+        store.mem_access_mut_slice(mem, |b| {
+            for (k, &v) in args.iter().enumerate() {
+                let a = CURVE_ARGS_BASE as usize + k * 8;
+                b[a..a + 8].copy_from_slice(&v.to_le_bytes());
+            }
+        });
+        let f = store
+            .instance_export(module, "f")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        // The exported wrapper takes only the i32 args pointer and returns the
+        // f64 allocation.
+        store
+            .invoke_simple_typed::<(i32,), f64>(f, (CURVE_ARGS_BASE as i32,))
+            .expect("invoke")
+    }
+
+    // ── erfc_approx parity vs crate::alloc::erfc_approx (AC7.1) ──────────────
+
+    #[test]
+    fn erfc_approx_matches_rust_over_sampled_range() {
+        // Sweep both signs (z<0 takes the `2 - erfc_approx(-z)` symmetry branch)
+        // across the range where erfc is numerically interesting; the A-S 26.2.17
+        // approximation is what the Rust reference uses too, so the wasm result
+        // tracks it to the `exp` helper's tolerance.
+        for z in linspace(-6.0, 6.0, 400) {
+            let got = run_unary(Which::Erfc, z);
+            let want = crate::alloc::erfc_approx(z);
+            assert_close(&format!("erfc_approx({z})"), got, want, ERFC_ABS, ERFC_REL);
+        }
+        // Anchor at z=0 (t=1): the wasm result tracks the Rust reference, which
+        // is ~0.9999999990 there -- the A-S 26.2.17 approximation, not the
+        // mathematical erfc(0)=1.
+        assert_close(
+            "erfc_approx(0)",
+            run_unary(Which::Erfc, 0.0),
+            crate::alloc::erfc_approx(0.0),
+            ERFC_ABS,
+            ERFC_REL,
+        );
+    }
+
+    // ── normal_cdf parity vs crate::alloc::normal_cdf (AC7.1) ────────────────
+
+    #[test]
+    fn normal_cdf_matches_rust_over_sampled_range() {
+        for x in linspace(-6.0, 6.0, 400) {
+            let got = run_unary(Which::NormalCdf, x);
+            let want = crate::alloc::normal_cdf(x);
+            assert_close(&format!("normal_cdf({x})"), got, want, ERFC_ABS, ERFC_REL);
+        }
+        // NaN propagates (the explicit `x.is_nan()` guard).
+        assert!(run_unary(Which::NormalCdf, f64::NAN).is_nan());
+        // normal_cdf(0) tracks the Rust reference. (The A-S 26.2.17 erfc
+        // polynomial is ~0.4999999995 at x=0, NOT exactly 0.5 -- the ~1.5e-7
+        // approximation error is a property of the reference itself, so parity
+        // is judged against `crate::alloc::normal_cdf`, not ideal math.)
+        assert_close(
+            "normal_cdf(0)",
+            run_unary(Which::NormalCdf, 0.0),
+            crate::alloc::normal_cdf(0.0),
+            ERFC_ABS,
+            ERFC_REL,
+        );
+    }
+
+    // ── alloc_curve parity for each of the 6 profile types + the >=10 floor ──
+
+    /// Assert the emitted `alloc_curve` matches `crate::alloc::alloc_curve` over
+    /// a grid of prices for one profile `(ptype, ppriority, pwidth, pextra)` and
+    /// a fixed positive request.
+    fn assert_curve_matches(ptype: i32, pp: f64, pw: f64, pe: f64, request: f64) {
+        // The price grid straddles zero, so non-positive prices are sampled too.
+        for p in linspace(-3.0, 8.0, 120) {
+            let got = run_alloc_curve(p, request, ptype, pp, pw, pe);
+            let want = crate::alloc::alloc_curve(p, request, ptype, pp, pw, pe);
+            assert_close(
+                &format!("alloc_curve(p={p}, ptype={ptype}, pp={pp}, pw={pw}, pe={pe})"),
+                got,
+                want,
+                CURVE_ABS,
+                CURVE_REL,
+            );
+        }
+    }
+
+    #[test]
+    fn alloc_curve_fixed_matches_rust() {
+        // ptype 0: fixed quantity (p <= ppriority ? request : 0).
+        assert_curve_matches(0, 2.0, 1.0, 0.0, 5.0);
+    }
+
+    #[test]
+    fn alloc_curve_rectangular_matches_rust() {
+        // ptype 1: rectangular survival.
+        assert_curve_matches(1, 3.0, 1.5, 0.0, 4.0);
+    }
+
+    #[test]
+    fn alloc_curve_triangular_matches_rust() {
+        // ptype 2: triangular survival (both p<=ppriority and p>ppriority arms).
+        assert_curve_matches(2, 2.5, 2.0, 0.0, 7.0);
+    }
+
+    #[test]
+    fn alloc_curve_normal_matches_rust() {
+        // ptype 3: normal survival via normal_cdf. Also exercise the pwidth<=0
+        // degenerate-to-fixed arm.
+        assert_curve_matches(3, 2.0, 1.0, 0.0, 6.0);
+        assert_curve_matches(3, 2.0, 0.0, 0.0, 6.0); // pwidth <= 0 -> fixed
+    }
+
+    #[test]
+    fn alloc_curve_exponential_matches_rust() {
+        // ptype 4: symmetric exponential (both z>0 and z<=0 arms). Also the
+        // pwidth<=0 degenerate-to-fixed arm.
+        assert_curve_matches(4, 2.0, 1.0, 0.0, 8.0);
+        assert_curve_matches(4, 2.0, -1.0, 0.0, 8.0); // pwidth <= 0 -> fixed
+    }
+
+    #[test]
+    fn alloc_curve_ces_matches_rust() {
+        // ptype 5: CES (uses pow). pextra is the elasticity. The grid spans
+        // p<=0 (->1), ppriority>0 normal case, and large-elasticity values that
+        // push q toward +inf (->1).
+        assert_curve_matches(5, 3.0, 1.0, 1.0, 5.0);
+        assert_curve_matches(5, 3.0, 1.0, 4.0, 5.0);
+        // ppriority <= 0 -> 0 for any positive price.
+        assert_curve_matches(5, 0.0, 1.0, 2.0, 5.0);
+    }
+
+    #[test]
+    fn alloc_curve_floor_flag_matches_rust() {
+        // ptype >= 10 floors the allocation. ptype 10 is fixed(0)+floor,
+        // 11 is rectangular(1)+floor, etc. Pick a request that yields a
+        // fractional allocation so the floor is observable.
+        // NOTE(review): 12 (triangular+floor) is absent from this list --
+        // confirm whether the omission is intentional.
+        for ptype in [10, 11, 13, 14, 15] {
+            assert_curve_matches(ptype, 2.5, 1.5, 1.0, 3.3);
+        }
+    }
+
+    #[test]
+    fn alloc_curve_nonpositive_request_is_zero() {
+        // request <= 0 -> 0 for every profile, regardless of price/type.
+        for &request in &[0.0, -1.0, -100.0] {
+            for ptype in 0..6 {
+                let got = run_alloc_curve(1.0, request, ptype, 2.0, 1.0, 1.0);
+                let want = crate::alloc::alloc_curve(1.0, request, ptype, 2.0, 1.0, 1.0);
+                // Exact equality (no tolerance): both sides must be exactly 0.
+                assert_eq!(got, want, "request {request}, ptype {ptype}");
+                assert_eq!(got, 0.0);
+            }
+        }
+    }
+
+    // ── allocate_available parity vs crate::alloc::allocate_available ────────
+
+    // Scratch byte layout for the `allocate_available` helper test: the i32
+    // requester count at N_ADDR, requests at REQ_BASE, profiles at PROF_BASE
+    // (4 f64/requester), out at OUT_BASE. All 8-byte aligned (N_ADDR 4-byte),
+    // comfortably inside the single 64 KiB memory page.
+    // The gaps leave room for 96 requesters before a region would run into the
+    // next one: (1024 - 256) / 8 requests and (4096 - 1024) / 32 profiles.
+    const N_ADDR: u32 = 64;
+    const REQ_BASE: u32 = 256;
+    const PROF_BASE: u32 = 1024;
+    const OUT_BASE: u32 = 4096;
+
+    /// Build a module with every helper body plus an exported `alloc(avail: f64)`
+    /// wrapper that calls `allocate_available(REQ_BASE, n, PROF_BASE, avail,
+    /// OUT_BASE)` with the array pointers hard-coded to the test's scratch bases
+    /// and `n` read from `mem[N_ADDR]` (an i32). A single f64 param keeps the
+    /// wrapper inside the DLR-FT interop's typed-tuple arity limit; the array
+    /// pointers and `n` are seeded into memory by the test.
+ fn build_allocate_module() -> Vec { + let helpers = build_helpers(); + let n_helpers = helpers.functions.len() as u32; + let target = helpers.fns.allocate_available; + + let mut module = Module::new(); + + let mut types = TypeSection::new(); + // alloc(avail: f64) -> () + types.ty().function([ValType::F64], []); + for hf in &helpers.functions { + types.ty().function(hf.params.clone(), hf.results.clone()); + } + module.section(&types); + + let mut functions = FunctionSection::new(); + for (i, _) in helpers.functions.iter().enumerate() { + functions.function(1 + i as u32); + } + functions.function(0); + module.section(&functions); + + let mut memories = MemorySection::new(); + memories.memory(MemoryType { + minimum: 1, + maximum: None, + memory64: false, + shared: false, + page_size_log2: None, + }); + module.section(&memories); + + let mut exports = ExportSection::new(); + exports.export("alloc", ExportKind::Func, n_helpers); + exports.export("mem", ExportKind::Memory, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + for hf in &helpers.functions { + code.function(&hf.body); + } + let mut wrapper = Function::new([]); + // allocate_available(REQ_BASE, mem[N_ADDR] as i32, PROF_BASE, avail, OUT_BASE) + wrapper.instruction(&Ins::I32Const(REQ_BASE as i32)); + wrapper.instruction(&Ins::I32Const(0)); + wrapper.instruction(&Ins::I32Load(wasm_encoder::MemArg { + offset: u64::from(N_ADDR), + align: 2, + memory_index: 0, + })); + wrapper.instruction(&Ins::I32Const(PROF_BASE as i32)); + wrapper.instruction(&Ins::LocalGet(0)); // avail (f64 param) + wrapper.instruction(&Ins::I32Const(OUT_BASE as i32)); + wrapper.instruction(&Ins::Call(target)); + wrapper.instruction(&Ins::End); + code.function(&wrapper); + module.section(&code); + + module.finish() + } + + /// Run the emitted `allocate_available` over `requests`/`profiles` and read + /// back the `n` result slots; compare against `crate::alloc::allocate_available`. 
+ fn assert_allocate_matches(requests: &[f64], profiles: &[(f64, f64, f64, f64)], avail: f64) { + assert_eq!(requests.len(), profiles.len()); + let n = requests.len(); + let bytes = build_allocate_module(); + let info = validate(&bytes).expect("allocate module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + + // Seed n, requests, and profiles into scratch memory. + let mem = store + .instance_export(inst, "mem") + .unwrap() + .as_mem() + .unwrap(); + store.mem_access_mut_slice(mem, |b| { + let na = N_ADDR as usize; + b[na..na + 4].copy_from_slice(&(n as i32).to_le_bytes()); + for (i, &r) in requests.iter().enumerate() { + let a = REQ_BASE as usize + i * 8; + b[a..a + 8].copy_from_slice(&r.to_le_bytes()); + } + for (i, &(pt, pp, pw, pe)) in profiles.iter().enumerate() { + let base = PROF_BASE as usize + i * 32; + b[base..base + 8].copy_from_slice(&pt.to_le_bytes()); + b[base + 8..base + 16].copy_from_slice(&pp.to_le_bytes()); + b[base + 16..base + 24].copy_from_slice(&pw.to_le_bytes()); + b[base + 24..base + 32].copy_from_slice(&pe.to_le_bytes()); + } + }); + + let alloc = store + .instance_export(inst, "alloc") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(f64,), ()>(alloc, (avail,)) + .expect("invoke"); + + let got: Vec = store.mem_access_mut_slice(mem, |b| { + (0..n) + .map(|i| { + let a = OUT_BASE as usize + i * 8; + f64::from_le_bytes(b[a..a + 8].try_into().unwrap()) + }) + .collect() + }); + let want = crate::alloc::allocate_available(requests, profiles, avail); + assert_eq!(want.len(), n); + for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() { + assert_close( + &format!("allocate_available[{i}]"), + g, + w, + ALLOC_ABS, + ALLOC_REL, + ); + } + } + + #[test] + fn allocate_available_full_grant_when_supply_exceeds_demand() { + // avail >= total_demand: each requester gets r.max(0). 
A negative request + // clamps to 0 (the `r.max(0.0)` arm). + let requests = [3.0, 2.0, -1.0, 4.0]; + let profiles = [ + (1.0, 1.0, 1.0, 0.0), + (1.0, 2.0, 1.0, 0.0), + (1.0, 3.0, 1.0, 0.0), + (1.0, 1.5, 1.0, 0.0), + ]; + // total_demand = 3+2+4 = 9 (the negative request is excluded). + assert_allocate_matches(&requests, &profiles, 100.0); + } + + #[test] + fn allocate_available_zeros_when_supply_nonpositive() { + // avail <= 0: all zeros. + let requests = [3.0, 2.0, 4.0]; + let profiles = [ + (1.0, 1.0, 1.0, 0.0), + (1.0, 2.0, 1.0, 0.0), + (1.0, 3.0, 1.0, 0.0), + ]; + assert_allocate_matches(&requests, &profiles, 0.0); + assert_allocate_matches(&requests, &profiles, -5.0); + } + + #[test] + fn allocate_available_partial_bisection_rectangular() { + // The interesting case: 0 < avail < total_demand, so the bisection runs. + // Rectangular profiles (ptype 1) with distinct priorities, mirroring the + // `allocate.mdl` shape. + let requests = [3.0, 2.0, 4.0]; + let profiles = [ + (1.0, 1.0, 1.0, 0.0), + (1.0, 2.0, 1.0, 0.0), + (1.0, 3.0, 1.0, 0.0), + ]; + // total_demand = 9; supply 5 forces a partial allocation. + for avail in [1.0, 3.0, 5.0, 7.0, 8.5] { + assert_allocate_matches(&requests, &profiles, avail); + } + } + + #[test] + fn allocate_available_partial_bisection_across_profile_types() { + // Partial allocation with a mix of profile types, exercising the + // search-range `spread` per type and the per-requester curve at the + // converged price. + let requests = [4.0, 3.0, 5.0, 2.0, 6.0]; + let profiles = [ + (0.0, 2.0, 1.0, 0.0), // fixed + (2.0, 3.0, 1.5, 0.0), // triangular + (3.0, 2.5, 1.0, 0.0), // normal + (4.0, 2.0, 1.2, 0.0), // exponential + (5.0, 3.0, 1.0, 2.0), // CES + ]; + // total_demand = 20; sweep several partial supplies. 
+ for avail in [2.0, 6.0, 10.0, 15.0, 19.0] { + assert_allocate_matches(&requests, &profiles, avail); + } + } + + #[test] + fn allocate_available_empty_requesters_is_noop() { + // n == 0: nothing is written (the helper returns immediately). Exercised + // by passing zero requesters; the read-back loop covers zero slots, so + // this simply must not trap. + assert_allocate_matches(&[], &[], 10.0); + } +} diff --git a/src/simlin-engine/src/wasmgen/lookup.rs b/src/simlin-engine/src/wasmgen/lookup.rs new file mode 100644 index 000000000..7701b2975 --- /dev/null +++ b/src/simlin-engine/src/wasmgen/lookup.rs @@ -0,0 +1,657 @@ +// Copyright 2026 The Simlin Authors. All rights reserved. +// Use of this source code is governed by the Apache License, +// Version 2.0, that can be found in the LICENSE file. + +// pattern: Functional Core +// Pure transformation: each public function emits a self-contained wasm helper +// `Function` (instruction sequence) for one graphical-function lookup mode. No +// I/O; the only side effect is in `#[cfg(test)]`, which executes the emitted +// helpers under the DLR-FT interpreter and compares against the VM's lookup +// functions. + +//! Graphical-function lookup helper functions for the wasm simulation backend. +//! +//! The bytecode VM resolves a `Lookup` opcode against a `&[(f64, f64)]` table +//! through one of three functions (`vm.rs:3055-3186`): `lookup` (linear +//! interpolation), `lookup_forward` (step up), and `lookup_backward` (step +//! down). This module emits one wasm helper per mode -- `lookup_interp`, +//! `lookup_forward`, `lookup_backward` -- each over a flat +//! `(data_off: i32, count: i32, index: f64) -> f64` interface, where the table +//! lives in linear memory as `count` consecutive f64 LE `(x, y)` knot pairs +//! starting at byte offset `data_off` (so knot `k` is +//! `x = f64.load[data_off + 16*k]`, `y = f64.load[data_off + 16*k + 8]`). +//! 
`module.rs` lays these regions out (see `build_gf_regions`); the `Lookup` +//! opcode (`lower.rs`) reads `(data_off, count)` from the GF directory and +//! `call`s the mode's helper. +//! +//! ## The three functions are NOT one function +//! +//! They differ in three ways, mirrored here exactly so the backend takes the +//! same branch the VM does: +//! - **edge clamps**: `lookup_interp` clamps *strictly* (`index < x[0]` / +//! `index > x[n-1]`); `forward`/`backward` clamp *inclusively* (`<=` / `>=`). +//! - **search**: `interp`/`forward` use a *lower-bound* search +//! (`x[mid] < index`); `backward` uses an *upper-bound* search +//! (`x[mid] <= index`). +//! - **result**: `interp` either returns `y[low]` exactly (when +//! `approx_eq(x[low], index)`, via the Phase 2 helper) or linearly +//! interpolates between knots `low-1` and `low`; `forward` returns `y[low]`; +//! `backward` returns `y[low-1]` (the last knot with `x <= index`; for +//! duplicate x-values, the LAST such knot, since the upper-bound search lands +//! past every equal x). +//! +//! Each helper guards `count == 0` and a NaN `index` by returning NaN, matching +//! the VM's `table.is_empty()` / `index.is_nan()` early returns. + +use wasm_encoder::{BlockType, Function, Instruction as Ins, MemArg, ValType}; + +/// Bytes per knot: an f64 `x` followed by an f64 `y`. +const KNOT_BYTES: i32 = 16; + +// Helper local layout. Params 0..2 are `data_off`/`count`/`index`; the i32 +// search cursors follow. +const DATA_OFF: u32 = 0; // i32 byte offset of knot 0 +const COUNT: u32 = 1; // i32 point count +const INDEX: u32 = 2; // f64 lookup index +const LOW: u32 = 3; // i32 binary-search low +const HIGH: u32 = 4; // i32 binary-search high +const MID: u32 = 5; // i32 binary-search midpoint + +/// An 8-byte (f64) memory access with a static byte `offset` on top of the +/// dynamic address already on the stack. The data region is 8-byte aligned (see +/// `module.rs`), so the natural-alignment hint is valid. 
+fn knot_memarg(offset: u64) -> MemArg { + MemArg { + offset, + align: 3, // log2(8): an 8-byte f64 access + memory_index: 0, + } +} + +/// Push the byte address of knot `k` (the i32 in `k_local`): +/// `data_off + 16*k`. A subsequent `f64.load` with `knot_memarg(0)` reads `x`, +/// `knot_memarg(8)` reads `y`. +fn push_knot_addr(f: &mut Function, k_local: u32) { + f.instruction(&Ins::LocalGet(DATA_OFF)); + f.instruction(&Ins::LocalGet(k_local)); + f.instruction(&Ins::I32Const(KNOT_BYTES)); + f.instruction(&Ins::I32Mul); + f.instruction(&Ins::I32Add); +} + +/// Push `x[k]` for the knot index in `k_local`. +fn push_x(f: &mut Function, k_local: u32) { + push_knot_addr(f, k_local); + f.instruction(&Ins::F64Load(knot_memarg(0))); +} + +/// Push `y[k]` for the knot index in `k_local`. +fn push_y(f: &mut Function, k_local: u32) { + push_knot_addr(f, k_local); + f.instruction(&Ins::F64Load(knot_memarg(8))); +} + +/// Push `x[count-1]` (the last knot's x). Computed without a dedicated local by +/// pushing the address `data_off + 16*(count-1)` inline. +fn push_last_x(f: &mut Function) { + f.instruction(&Ins::LocalGet(DATA_OFF)); + f.instruction(&Ins::LocalGet(COUNT)); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Sub); + f.instruction(&Ins::I32Const(KNOT_BYTES)); + f.instruction(&Ins::I32Mul); + f.instruction(&Ins::I32Add); + f.instruction(&Ins::F64Load(knot_memarg(0))); +} + +/// Push `y[count-1]` (the last knot's y). +fn push_last_y(f: &mut Function) { + f.instruction(&Ins::LocalGet(DATA_OFF)); + f.instruction(&Ins::LocalGet(COUNT)); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Sub); + f.instruction(&Ins::I32Const(KNOT_BYTES)); + f.instruction(&Ins::I32Mul); + f.instruction(&Ins::I32Add); + f.instruction(&Ins::F64Load(knot_memarg(8))); +} + +/// Emit the two early guards every lookup function shares: `count == 0 -> NaN` +/// and `index != index (NaN) -> NaN`. Mirrors the VM's `table.is_empty()` and +/// `index.is_nan()` early returns. 
+fn emit_empty_and_nan_guards(f: &mut Function) { + // if count == 0 { return NaN } + f.instruction(&Ins::LocalGet(COUNT)); + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&Ins::F64Const(f64::NAN.into())); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // if index != index { return NaN } (the NaN test) + f.instruction(&Ins::LocalGet(INDEX)); + f.instruction(&Ins::LocalGet(INDEX)); + f.instruction(&Ins::F64Ne); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&Ins::F64Const(f64::NAN.into())); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); +} + +/// Emit the binary search over `[LOW, HIGH)` into `LOW`. `mid_cmp_is_lt` selects +/// the predicate: `true` -> lower bound (`x[mid] < index`), `false` -> upper +/// bound (`x[mid] <= index`). On exit `LOW` is the first index whose `x` fails +/// the predicate (the lower/upper bound), exactly matching the VM's +/// `while low < high { mid; if pred { low = mid+1 } else { high = mid } }`. +/// +/// `LOW`/`HIGH` must already be initialized (to `0`/`count`). 
+fn emit_binary_search(f: &mut Function, mid_cmp_is_lt: bool) {
+    // Structured as `block $exit { loop $top { ... } }`: inside the loop,
+    // branch depth 0 targets the loop head (continue) and depth 1 targets the
+    // end of the outer block (break).
+    f.instruction(&Ins::Block(BlockType::Empty)); // $exit
+    f.instruction(&Ins::Loop(BlockType::Empty)); // $top
+
+    // while-head: if !(low < high) break (br depth 1 -> $exit)
+    f.instruction(&Ins::LocalGet(LOW));
+    f.instruction(&Ins::LocalGet(HIGH));
+    f.instruction(&Ins::I32LtS);
+    f.instruction(&Ins::I32Eqz);
+    f.instruction(&Ins::BrIf(1));
+
+    // mid = low + (high - low) / 2 (all non-negative; signed div is exact)
+    f.instruction(&Ins::LocalGet(LOW));
+    f.instruction(&Ins::LocalGet(HIGH));
+    f.instruction(&Ins::LocalGet(LOW));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::I32Const(2));
+    f.instruction(&Ins::I32DivS);
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::LocalSet(MID));
+
+    // pred = x[mid] {<, <=} index
+    push_x(f, MID);
+    f.instruction(&Ins::LocalGet(INDEX));
+    // The comparison opcode is the only difference between the lower-bound
+    // and upper-bound variants.
+    if mid_cmp_is_lt {
+        f.instruction(&Ins::F64Lt);
+    } else {
+        f.instruction(&Ins::F64Le);
+    }
+    f.instruction(&Ins::If(BlockType::Empty));
+    // low = mid + 1
+    f.instruction(&Ins::LocalGet(MID));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Add);
+    f.instruction(&Ins::LocalSet(LOW));
+    f.instruction(&Ins::Else);
+    // high = mid
+    f.instruction(&Ins::LocalGet(MID));
+    f.instruction(&Ins::LocalSet(HIGH));
+    f.instruction(&Ins::End);
+
+    // This `br 0` sits after the `if` has ended, directly inside the loop, so
+    // depth 0 is the loop itself.
+    f.instruction(&Ins::Br(0)); // continue -> $top
+    f.instruction(&Ins::End); // end loop
+    f.instruction(&Ins::End); // end block
+}
+
+/// Initialize `LOW = 0; HIGH = count`.
+fn emit_init_search_bounds(f: &mut Function) {
+    f.instruction(&Ins::I32Const(0));
+    f.instruction(&Ins::LocalSet(LOW));
+    f.instruction(&Ins::LocalGet(COUNT));
+    f.instruction(&Ins::LocalSet(HIGH));
+}
+
+/// Build the body of `lookup_interp(data_off: i32, count: i32, index: f64)
+/// -> f64`, reproducing the VM's `lookup` (`vm.rs:3055-3102`) exactly:
+/// empty/NaN -> NaN; **strict** edge clamps (`index < x[0]` -> `y[0]`,
+/// `index > x[n-1]` -> `y[n-1]`); lower-bound binary search; then at `i = low`,
+/// `approx_eq(x[i], index)` -> `y[i]`, else linear interpolation between knots
+/// `i-1` and `i`.
+///
+/// `approx_eq_idx` is the module function index of the Phase 2 `approx_eq`
+/// helper (`lower::HelperFns::approx_eq`); the at-knot exact-hit test `call`s it
+/// so the backend matches the VM's `crate::float::approx_eq` branch.
+pub(crate) fn emit_lookup_interp(approx_eq_idx: u32) -> Function {
+    // Three extra i32 locals after the three params; they take local indices
+    // 3, 4 and 5, matching the LOW/HIGH/MID constants.
+    let mut f = Function::new([(3, ValType::I32)]); // LOW/HIGH/MID
+
+    emit_empty_and_nan_guards(&mut f);
+
+    // if index < x[0] { return y[0] } (strict)
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_x_const0(&mut f);
+    f.instruction(&Ins::F64Lt);
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_y_const0(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // if index > x[count-1] { return y[count-1] } (strict)
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_last_x(&mut f);
+    f.instruction(&Ins::F64Gt);
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_last_y(&mut f);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    emit_init_search_bounds(&mut f);
+    emit_binary_search(&mut f, true); // lower bound
+
+    // i = low. if approx_eq(x[i], index) { return y[i] }
+    // Operands are pushed as (x[i], index); the call's result feeds the `if`.
+    push_x(&mut f, LOW);
+    f.instruction(&Ins::LocalGet(INDEX));
+    f.instruction(&Ins::Call(approx_eq_idx));
+    f.instruction(&Ins::If(BlockType::Empty));
+    push_y(&mut f, LOW);
+    f.instruction(&Ins::Return);
+    f.instruction(&Ins::End);
+
+    // else linear interp:
+    //   slope = (y[i] - y[i-1]) / (x[i] - x[i-1])
+    //   result = (index - x[i-1]) * slope + y[i-1]
+    // Reuse MID as the i32 holding `i-1` so x[i-1]/y[i-1] reuse push_x/push_y.
+    // NOTE(review): this relies on `low >= 1` here. With the strict clamps
+    // passed, `low == 0` implies `index == x[0]`, which the approx_eq branch
+    // above is presumed to catch -- confirm against the `approx_eq` helper.
+    f.instruction(&Ins::LocalGet(LOW));
+    f.instruction(&Ins::I32Const(1));
+    f.instruction(&Ins::I32Sub);
+    f.instruction(&Ins::LocalSet(MID)); // MID = i-1
+
+    // (index - x[i-1])
+    f.instruction(&Ins::LocalGet(INDEX));
+    push_x(&mut f, MID);
+    f.instruction(&Ins::F64Sub);
+    // * slope
+    push_y(&mut f, LOW);
+    push_y(&mut f, MID);
+    f.instruction(&Ins::F64Sub); // y[i] - y[i-1]
+    push_x(&mut f, LOW);
+    push_x(&mut f, MID);
+    // The lower-bound search leaves x[i-1] < index <= x[i], so for a table
+    // with non-decreasing, non-NaN x this denominator is non-zero.
+    f.instruction(&Ins::F64Sub); // x[i] - x[i-1]
+    f.instruction(&Ins::F64Div); // slope
+    f.instruction(&Ins::F64Mul); // (index - x[i-1]) * slope
+    // + y[i-1]
+    push_y(&mut f, MID);
+    f.instruction(&Ins::F64Add);
+
+    // The f64 left on the stack is the function's result.
+    f.instruction(&Ins::End);
+    f
+}
+
+/// Build the body of `lookup_forward(data_off, count, index) -> f64`,
+/// reproducing the VM's `lookup_forward` (`vm.rs:3104-3142`): empty/NaN -> NaN;
+/// **inclusive** edge clamps (`index <= x[0]` -> `y[0]`, `index >= x[n-1]` ->
+/// `y[n-1]`); the same lower-bound binary search; return `y[low]`. No
+/// `approx_eq`, no interpolation.
+pub(crate) fn emit_lookup_forward() -> Function { + let mut f = Function::new([(3, ValType::I32)]); // LOW/HIGH/MID + + emit_empty_and_nan_guards(&mut f); + + // if index <= x[0] { return y[0] } (inclusive) + f.instruction(&Ins::LocalGet(INDEX)); + push_x_const0(&mut f); + f.instruction(&Ins::F64Le); + f.instruction(&Ins::If(BlockType::Empty)); + push_y_const0(&mut f); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // if index >= x[count-1] { return y[count-1] } (inclusive) + f.instruction(&Ins::LocalGet(INDEX)); + push_last_x(&mut f); + f.instruction(&Ins::F64Ge); + f.instruction(&Ins::If(BlockType::Empty)); + push_last_y(&mut f); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + emit_init_search_bounds(&mut f); + emit_binary_search(&mut f, true); // lower bound + + // return y[low] + push_y(&mut f, LOW); + + f.instruction(&Ins::End); + f +} + +/// Build the body of `lookup_backward(data_off, count, index) -> f64`, +/// reproducing the VM's `lookup_backward` (`vm.rs:3144-3186`): empty/NaN -> +/// NaN; **inclusive** edge clamps; an **upper-bound** binary search +/// (`x[mid] <= index`); return `y[low-1]` (the last knot with `x <= index`; for +/// duplicate x-values, the LAST one). No `approx_eq`, no interpolation. 
+pub(crate) fn emit_lookup_backward() -> Function {
+    // Three i32 scratch locals follow the params: LOW, HIGH, MID.
+    let mut func = Function::new([(3, ValType::I32)]);
+
+    emit_empty_and_nan_guards(&mut func);
+
+    // Inclusive low clamp: index <= x[0] returns y[0].
+    func.instruction(&Ins::LocalGet(INDEX));
+    push_x_const0(&mut func);
+    func.instruction(&Ins::F64Le)
+        .instruction(&Ins::If(BlockType::Empty));
+    push_y_const0(&mut func);
+    func.instruction(&Ins::Return).instruction(&Ins::End);
+
+    // Inclusive high clamp: index >= x[count-1] returns y[count-1].
+    func.instruction(&Ins::LocalGet(INDEX));
+    push_last_x(&mut func);
+    func.instruction(&Ins::F64Ge)
+        .instruction(&Ins::If(BlockType::Empty));
+    push_last_y(&mut func);
+    func.instruction(&Ins::Return).instruction(&Ins::End);
+
+    // Upper-bound search over [0, count).
+    emit_init_search_bounds(&mut func);
+    let lower_bound = false;
+    emit_binary_search(&mut func, lower_bound);
+
+    // The result is y[low-1]; MID is free again, so it carries `low - 1`.
+    func.instruction(&Ins::LocalGet(LOW))
+        .instruction(&Ins::I32Const(1))
+        .instruction(&Ins::I32Sub)
+        .instruction(&Ins::LocalSet(MID));
+    push_y(&mut func, MID);
+    func.instruction(&Ins::End);
+
+    func
+}
+
+/// Load `x[0]` (`f64.load[data_off + 0]`). Knot 0 sits exactly at `data_off`,
+/// so the address needs no index arithmetic.
+fn push_x_const0(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(DATA_OFF))
+        .instruction(&Ins::F64Load(knot_memarg(0)));
+}
+
+/// Load `y[0]` (`f64.load[data_off + 8]`).
+fn push_y_const0(f: &mut Function) {
+    f.instruction(&Ins::LocalGet(DATA_OFF))
+        .instruction(&Ins::F64Load(knot_memarg(8)));
+}
+
+#[cfg(test)]
+mod tests {
+    use super::super::lower::build_helpers;
+    use checked::Store;
+    use wasm::validate;
+    use wasm_encoder::{
+        CodeSection, ConstExpr, DataSection, ExportKind, ExportSection, Function, FunctionSection,
+        Instruction, MemorySection, MemoryType, Module, TypeSection, ValType,
+    };
+
+    /// Which lookup helper a test module exports as `f`.
+    #[derive(Clone, Copy, Debug)]
+    enum Mode {
+        Interp,
+        Forward,
+        Backward,
+    }
+
+    /// Resolve a [`Mode`] to its helper function index in the assembled table.
+    fn helper_index(mode: Mode) -> u32 {
+        let h = build_helpers().fns;
+        match mode {
+            Mode::Interp => h.lookup_interp,
+            Mode::Forward => h.lookup_forward,
+            Mode::Backward => h.lookup_backward,
+        }
+    }
+
+    /// The byte offset the test harness writes the table to (one f64 in, so a
+    /// non-zero `data_off` is exercised rather than the degenerate 0).
+    const TABLE_BASE: u32 = 8;
+
+    /// Build a module containing *every* helper body (so `lookup_interp`'s
+    /// `call approx_eq` resolves) plus a thin exported wrapper
+    /// `f(data_off: i32, count: i32, index: f64) -> f64` forwarding to the
+    /// helper-under-test, and an exported `memory` seeded with `knots` at
+    /// [`TABLE_BASE`] via an active data segment. Mirrors `lower.rs`'s
+    /// production assembly: helpers occupy function indices `0..N`, the wrapper
+    /// follows at `N`.
+    ///
+    /// Returns the encoded module bytes.
+    fn build_lookup_module(mode: Mode, knots: &[(f64, f64)]) -> Vec<u8> {
+        let helpers = build_helpers();
+        let n_helpers = helpers.functions.len() as u32;
+        let target = helper_index(mode);
+
+        let mut module = Module::new();
+
+        // Type 0 is the wrapper `(i32, i32, f64) -> f64`; helper types follow.
+        let mut types = TypeSection::new();
+        types
+            .ty()
+            .function([ValType::I32, ValType::I32, ValType::F64], [ValType::F64]);
+        for hf in &helpers.functions {
+            types.ty().function(hf.params.clone(), hf.results.clone());
+        }
+        module.section(&types);
+
+        // Helper `i` uses type `1 + i`; the wrapper, declared last, uses type 0.
+        let mut functions = FunctionSection::new();
+        for (i, _) in helpers.functions.iter().enumerate() {
+            functions.function(1 + i as u32);
+        }
+        functions.function(0);
+        module.section(&functions);
+
+        let mut memories = MemorySection::new();
+        memories.memory(MemoryType {
+            minimum: 1,
+            maximum: None,
+            memory64: false,
+            shared: false,
+            page_size_log2: None,
+        });
+        module.section(&memories);
+
+        let mut exports = ExportSection::new();
+        exports.export("f", ExportKind::Func, n_helpers);
+        exports.export("memory", ExportKind::Memory, 0);
+        module.section(&exports);
+
+        // Code bodies must follow the same order as the function section.
+        let mut code = CodeSection::new();
+        for hf in &helpers.functions {
+            code.function(&hf.body);
+        }
+        // wrapper: forward (data_off, count, index) to the helper-under-test.
+        let mut wrapper = Function::new([]);
+        wrapper.instruction(&Instruction::LocalGet(0));
+        wrapper.instruction(&Instruction::LocalGet(1));
+        wrapper.instruction(&Instruction::LocalGet(2));
+        wrapper.instruction(&Instruction::Call(target));
+        wrapper.instruction(&Instruction::End);
+        code.function(&wrapper);
+        module.section(&code);
+
+        // Seed the table at TABLE_BASE as interleaved f64 LE x,y pairs.
+        let mut bytes: Vec<u8> = Vec::with_capacity(knots.len() * 16);
+        for &(x, y) in knots {
+            bytes.extend_from_slice(&x.to_le_bytes());
+            bytes.extend_from_slice(&y.to_le_bytes());
+        }
+        let mut data = DataSection::new();
+        data.active(0, &ConstExpr::i32_const(TABLE_BASE as i32), bytes);
+        module.section(&data);
+
+        module.finish()
+    }
+
+    /// Run the emitted lookup helper for `mode` over `knots` at `index` under
+    /// the DLR-FT interpreter. The module is (re)built per call; the tables are
+    /// tiny (a handful of knots) so this stays well under the per-test budget.
+    fn run_lookup(mode: Mode, knots: &[(f64, f64)], index: f64) -> f64 {
+        let wasm_bytes = build_lookup_module(mode, knots);
+        let validated = validate(&wasm_bytes).expect("lookup module must validate");
+        let mut store = Store::new(());
+        let instance = store
+            .module_instantiate(&validated, Vec::new(), None)
+            .expect("lookup module must instantiate")
+            .module_addr;
+        let wrapper = store
+            .instance_export(instance, "f")
+            .unwrap()
+            .as_func()
+            .unwrap();
+        // Wrapper arguments: (data_off, count, index).
+        let call_args = (TABLE_BASE as i32, knots.len() as i32, index);
+        store
+            .invoke_simple_typed::<(i32, i32, f64), f64>(wrapper, call_args)
+            .expect("invocation must succeed")
+    }
+
+    /// The reference result for `mode`: the VM function that the emitted helper
+    /// is meant to reproduce.
+    fn vm_lookup(mode: Mode, knots: &[(f64, f64)], index: f64) -> f64 {
+        let expected = match mode {
+            Mode::Interp => crate::vm::lookup(knots, index),
+            Mode::Forward => crate::vm::lookup_forward(knots, index),
+            Mode::Backward => crate::vm::lookup_backward(knots, index),
+        };
+        expected
+    }
+
+    /// Check that the emitted helper and the VM oracle agree exactly at `index`,
+    /// treating NaN as equal to NaN. No tolerance is applied: the interp helper
+    /// decides its at-knot case through the same `approx_eq` as the VM, and none
+    /// of the helpers performs transcendental math.
+    fn assert_matches_vm(mode: Mode, knots: &[(f64, f64)], index: f64) {
+        let got = run_lookup(mode, knots, index);
+        let want = vm_lookup(mode, knots, index);
+        // NaN never equals itself, so the NaN case is checked on its own.
+        if want.is_nan() {
+            assert!(
+                got.is_nan(),
+                "{mode:?} lookup at index {index}: expected NaN, got {got}"
+            );
+            return;
+        }
+        assert_eq!(
+            got, want,
+            "{mode:?} lookup at index {index}: got {got}, want {want}"
+        );
+    }
+
+    /// A monotonic-x table with non-uniform spacing and a non-monotone y, so
+    /// interpolation, forward, and backward all give distinguishable results.
+ const TABLE: &[(f64, f64)] = &[ + (0.0, 10.0), + (1.0, 20.0), + (2.5, 5.0), + (4.0, 40.0), + (10.0, 0.0), + ]; + + /// A representative set of probe indices spanning every regime: below + /// range, exactly on each knot, strictly between each pair of knots, and + /// above range. Shared by all three modes (each mode's oracle defines the + /// right answer). + fn probe_indices(knots: &[(f64, f64)]) -> Vec { + let mut idx = vec![knots[0].0 - 5.0, knots[knots.len() - 1].0 + 5.0]; + for w in knots.windows(2) { + let (a, b) = (w[0].0, w[1].0); + idx.push(a); // on a knot + idx.push((a + b) / 2.0); // strictly between + // a point near but not on the knot, to exercise the approx_eq edge + idx.push(a + (b - a) * 1e-3); + } + idx.push(knots[knots.len() - 1].0); // the final knot + idx + } + + #[test] + fn lookup_interp_matches_vm_over_domain() { + for &index in &probe_indices(TABLE) { + assert_matches_vm(Mode::Interp, TABLE, index); + } + } + + #[test] + fn lookup_forward_matches_vm_over_domain() { + for &index in &probe_indices(TABLE) { + assert_matches_vm(Mode::Forward, TABLE, index); + } + } + + #[test] + fn lookup_backward_matches_vm_over_domain() { + for &index in &probe_indices(TABLE) { + assert_matches_vm(Mode::Backward, TABLE, index); + } + } + + #[test] + fn lookup_all_modes_below_and_above_range() { + // The edge clamps differ (interp strict, forward/backward inclusive) but + // all three return the boundary y for an out-of-range index; assert each + // against its own oracle so the strict-vs-inclusive distinction is + // exercised at the boundary itself. 
+ for mode in [Mode::Interp, Mode::Forward, Mode::Backward] { + assert_matches_vm(mode, TABLE, -100.0); // below x[0] + assert_matches_vm(mode, TABLE, 1000.0); // above x[n-1] + assert_matches_vm(mode, TABLE, TABLE[0].0); // exactly x[0] + assert_matches_vm(mode, TABLE, TABLE[TABLE.len() - 1].0); // exactly x[n-1] + } + } + + #[test] + fn lookup_single_point_table() { + // A one-knot table: every index clamps to that knot's y for all modes. + let single: &[(f64, f64)] = &[(3.0, 7.0)]; + for mode in [Mode::Interp, Mode::Forward, Mode::Backward] { + for &index in &[-1.0, 3.0, 3.0 - 1e-9, 3.0 + 1e-9, 100.0] { + assert_matches_vm(mode, single, index); + } + } + } + + #[test] + fn lookup_backward_duplicate_x_returns_last() { + // Duplicate x-values: backward must return the y of the LAST knot with + // that x (the upper-bound search lands past every equal x, then steps + // back one). The interp/forward modes are also checked for consistency + // with their own oracle on the same table. + let dup: &[(f64, f64)] = &[ + (0.0, 0.0), + (2.0, 10.0), + (2.0, 20.0), + (2.0, 30.0), + (5.0, 50.0), + ]; + // Exactly on the duplicated x, and just inside either side of it. + for &index in &[2.0, 1.999, 2.001, 0.0, 5.0, 3.5] { + assert_matches_vm(Mode::Backward, dup, index); + assert_matches_vm(Mode::Forward, dup, index); + assert_matches_vm(Mode::Interp, dup, index); + } + } + + #[test] + fn lookup_nan_index_returns_nan_all_modes() { + for mode in [Mode::Interp, Mode::Forward, Mode::Backward] { + assert!( + run_lookup(mode, TABLE, f64::NAN).is_nan(), + "{mode:?} lookup of a NaN index must be NaN" + ); + } + } + + #[test] + fn lookup_empty_table_returns_nan_all_modes() { + // count == 0 -> NaN for every mode (matching the VM's table.is_empty()). + // The wrapper passes count = 0; data_off is irrelevant (never read). 
+ for mode in [Mode::Interp, Mode::Forward, Mode::Backward] { + assert!( + run_lookup(mode, &[], 1.0).is_nan(), + "{mode:?} lookup of an empty table must be NaN" + ); + } + } + + #[test] + fn lookup_interp_exact_knot_uses_approx_eq() { + // The interp helper returns y[i] exactly when approx_eq(x[i], index), + // matching the VM. A one-ULP-perturbed index at a knot is approx-equal, + // so it must return that knot's y exactly (NOT an interpolated value). + // The VM oracle encodes the same approx_eq decision. + let knot_x = TABLE[2].0; // 2.5 + let perturbed = f64::from_bits(knot_x.to_bits() + 1); + assert_matches_vm(Mode::Interp, TABLE, perturbed); + // And the exact knot returns its y exactly. + let got = run_lookup(Mode::Interp, TABLE, knot_x); + assert_eq!(got, TABLE[2].1, "interp at the exact knot returns y[i]"); + } +} diff --git a/src/simlin-engine/src/wasmgen/lower.rs b/src/simlin-engine/src/wasmgen/lower.rs new file mode 100644 index 000000000..8cb7ba378 --- /dev/null +++ b/src/simlin-engine/src/wasmgen/lower.rs @@ -0,0 +1,3256 @@ +// Copyright 2026 The Simlin Authors. All rights reserved. +// Use of this source code is governed by the Apache License, +// Version 2.0, that can be found in the LICENSE file. + +// pattern: Functional Core +// Pure transformation: bytecode + layout data in, wasm `Function` bodies / +// instruction sequences out. No I/O; the only side effect is in `#[cfg(test)]`, +// which executes the emitted modules under the DLR-FT interpreter. + +//! Lowering of the bytecode VM's scalar-core opcode set to WebAssembly. +//! +//! The runtime data model mirrors the bytecode VM (`crate::vm`): all variable +//! values live in one flat f64 "slab" in linear memory, addressed by slot +//! offset. A model runs over two chunks at a time -- `curr` (the values at the +//! current timestep) and `next` (the values being computed for the following +//! timestep). `LoadVar` reads from `curr`; `AssignCurr`/`AssignNext` store into +//! `curr`/`next`. +//! 
+//! Each `Opcode` lowers to a short, mostly 1:1 wasm instruction sequence over +//! the wasm operand stack, reproducing the matching arm of `eval_bytecode` +//! (`vm.rs:1257+`). +//! +//! Three compound assignment opcodes beyond the bare scalar set reach a +//! `CompiledSimulation` consumer, and they all lower here: +//! - `AssignConstCurr` arrives by *two* routes: `compiler::codegen` emits it +//! directly for any constant-RHS `AssignCurr` (`codegen.rs:1164`), and the +//! **peephole** pass also fuses a `LoadConstant; AssignCurr` pair into it +//! (`bytecode.rs:1830`). Either way it rides through the symbolic layer into +//! `CompiledSimulation`; every model with a constant initial/aux carries it. +//! - `BinOpAssignCurr` / `BinOpAssignNext` are *only* peephole output +//! (`bytecode.rs:1837`/`1841`, fusing `Op2; Assign{Curr,Next}`). The peephole +//! pass (`ByteCode::peephole_optimize`, run inside +//! `Module::compile`/`ByteCodeBuilder::finish`) runs per-variable-fragment in +//! the incremental pipeline *before* symbolization, so these ride through +//! too. Every scalar Euler stock integration (`stock + delta`) is one, so +//! they are part of the scalar core. +//! +//! The late **3-address** pass (`ByteCode::fuse_three_address`) instead runs +//! only on the VM's private execution copy (`vm.rs:395-398`), so its +//! `BinVarVar` / `AssignAddVarVarCurr` / ... family never reaches a consumer. +//! +//! Anything outside the supported scalar core -- an array/module/lookup opcode +//! or a late-fusion superinstruction that somehow appeared -- returns +//! `WasmGenError::Unsupported` rather than emitting a wrong module. (Every +//! `Op2` variant, including `Mod`/`Exp`, is supported as of Phase 2.) +//! +//! ## Emitted helper functions +//! +//! Equality and truthiness route through a single emitted wasm helper, +//! `approx_eq(a: f64, b: f64) -> i32`, that reproduces `crate::float::approx_eq` +//! 
(`float_cmp` 0.10 defaults) bit-faithfully, so the backend takes the same +//! branch the VM does. Helper functions are assembled into the module ahead of +//! the per-program functions ([`build_helpers`] returns their bodies and a +//! [`HelperFns`] index registry); `module.rs` places them at function indices +//! `0..N` and the per-program + `run` functions at `N..`. `emit_bytecode` +//! references a helper by its stable index (held in [`EmitCtx::helpers`]) via a +//! `call`. Subcomponent B (the transcendental + `pulse` helpers) and later +//! phases extend this by adding a field to [`HelperFns`] and pushing the +//! corresponding body in [`build_helpers`]; no helper index is hard-coded +//! elsewhere, so the per-program offset adjusts automatically. + +use std::collections::HashMap; + +use wasm_encoder::{Function, Instruction, MemArg, ValType}; + +use crate::bytecode::{ + BuiltinId, ByteCode, ByteCodeContext, GraphicalFunctionId, LookupMode, Op2, Opcode, +}; +use crate::vm::{StepPart, make_module_key}; + +use super::WasmGenError; +use super::views::ElementAddr; +use super::views::{ViewBase, ViewDesc}; + +/// Bytes per f64 slot. +pub(crate) const SLOT_SIZE: u32 = 8; +/// Alignment exponent for an 8-byte f64 access (log2(8)). +const F64_ALIGN: u32 = 3; +/// Bytes per GF directory entry (two i32: data byte offset + point count). Must +/// match `module.rs`'s `GF_DIRECTORY_ENTRY_BYTES`, the layout the `Lookup` +/// opcode reads. +pub(crate) const GF_DIRECTORY_ENTRY_BYTES: i32 = 8; + +/// Compile-time context for lowering a scalar opcode program over the f64 slab. +/// +/// `curr_base`/`next_base` are byte offsets of slot 0 of each chunk within the +/// linear memory. `module_off_local` is the wasm local index holding this +/// instance's `module_off` (the slot base of the module instance within a +/// chunk); the per-program functions take it as their single `i32` parameter. 
+/// In Phase 1 the root is the only module so `module_off` is always 0, but +/// emitting with it from the start avoids a Phase 7 rewrite. +pub(crate) struct EmitCtx<'a> { + pub curr_base: u32, + pub next_base: u32, + /// Byte offset of the GF directory region (8 bytes/entry, indexed by global + /// table index: `(data_byte_offset: i32, n_points: i32)`). The `Lookup` + /// opcode reads `directory_base + table_idx*8` to map a table index to its + /// data location. Both bases are run-invariant: every per-program function + /// reads the same read-only GF regions. + pub gf_directory_base: u32, + /// Byte offset of the GF data region (every table's `(x,y)` knots as f64 LE + /// pairs, concatenated). Retained for completeness/Phase-7 reuse; the + /// per-table absolute data offset the `Lookup` opcode passes to a helper is + /// read from the directory, so opcode lowering does not consult this field. + #[allow(dead_code)] + pub gf_data_base: u32, + /// Byte offset of slot 0 of the `initial_values` snapshot region (n_slots + /// wide). `LoadInitial` reads `initial_values[module_off + off]` when the + /// program being emitted is *not* the initials program. Mirrors the VM's + /// `initial_values` buffer (`vm.rs:617`). + pub initial_values_base: u32, + /// Byte offset of slot 0 of the `prev_values` snapshot region (n_slots + /// wide). `LoadPrev` reads `prev_values[module_off + off]` once the snapshot + /// has been taken. Mirrors the VM's `prev_values` buffer. + pub prev_values_base: u32, + /// Index of the mutable i32 wasm global `use_prev_fallback` (init 1). + /// `LoadPrev` gates on it: while set, it yields the caller-supplied fallback + /// rather than reading `prev_values`. The flag -- not a `TIME == start` + /// comparison -- is the sole gate, because RK stages move `curr[TIME]` to + /// trial points before the first snapshot is taken (`vm.rs:1314-1327`). + pub use_prev_fallback_global: u32, + /// Which opcode program is being lowered. 
`LoadInitial` resolves its + /// "during Initials read `curr`, else read `initial_values`" branch + /// (`vm.rs:1332-1340`) at compile time from this, since the emitter knows + /// the program statically. + pub step_part: StepPart, + // dt/start_time/final_time are the run-invariant time globals that back the + // seeds `run` writes into the TIME/DT/INITIAL_TIME/FINAL_TIME memory slots. + // Opcode lowering reads those values from memory via `LoadGlobalVar` (slots + // 0..4) rather than from these fields -- the XMILE time builtins lower to + // `LoadGlobalVar`, and the time-driven `Apply` arms (Step/Ramp/Pulse) read + // TIME/DT from memory -- so the fields stay unused here. They are retained + // because a later phase may fold them into compile-time constants. + #[allow(dead_code)] + pub dt: f64, + #[allow(dead_code)] + pub start_time: f64, + #[allow(dead_code)] + pub final_time: f64, + /// wasm local index holding this instance's `module_off` (i32). + pub module_off_local: u32, + /// wasm local index of a scratch f64, used by `AssignCurr`/`AssignNext` to + /// hold the value while the store address is pushed under it. + pub scratch_local: u32, + /// wasm local indices reserved for the `SetCond`/`If` condition register. + /// Used as a stack: `SetCond` writes the top, `If` reads (and pops) it. + /// Sized to the program's maximum `If` nesting depth (see + /// [`max_condition_depth`]). + pub condition_locals: Vec, + /// Three dedicated scratch f64 local indices `[a, b, c]` for the `Apply` + /// opcode, which always pops exactly three operands (codegen pads). They + /// are distinct from [`scratch_local`](Self::scratch_local) and the + /// [`condition_locals`](Self::condition_locals) so an `Apply` inside an + /// `If` arm (sharing the function) cannot clobber the condition register. + /// Reserved unconditionally by the function builders (3 unused f64 locals + /// in a non-`Apply` function are free). 
+ pub apply_locals: [u32; 3], + /// Function indices of the module's emitted helper functions, so + /// value-producing opcodes that need the VM's `approx_eq`/transcendental + /// semantics can `call` them. The same registry is shared by every + /// per-program function in a module. + pub helpers: HelperFns, + /// Byte offset of slot 0 of the `temp_storage` region (`temp_total_size` + /// f64 wide). The array view machinery addresses temp element `index` of + /// temp `temp_id` at `temp_storage_base + (temp_offsets[temp_id] + index)*8`, + /// mirroring the VM's `temp_storage[temp_offsets[temp_id] + index]` + /// (`vm.rs:584-586`). + pub temp_storage_base: u32, + /// First wasm local index reserved for the dynamic-subscript scratch i32 + /// locals (Task 4): the runtime-offset addend and validity flag a + /// `ViewSubscriptDynamic` / `PushSubscriptIndex` accumulation draws from. The + /// function's local declarations reserve `count_extra_i32_locals(bc)` i32s + /// starting here, past the scratch f64 / condition i32s / `Apply` f64s / the + /// fixed vector-op i32 scratch block, so these never overlap + /// [`apply_locals`](Self::apply_locals) or + /// [`vector_i32_locals`](Self::vector_i32_locals). A program with no dynamic + /// subscripts reserves none and this base is unused. + pub extra_i32_local_base: u32, + /// The fixed [`VECTOR_F64_LOCAL_COUNT`] scratch f64 local indices the Phase-6 + /// vector opcodes draw from (`VectorSelect`'s reduction accumulators, the + /// per-element value scratch). Reserved unconditionally by the function + /// builders; a non-vector function's unused f64 locals are free. + pub vector_f64_locals: [u32; VECTOR_F64_LOCAL_COUNT as usize], + /// The fixed [`VECTOR_I32_LOCAL_COUNT`] scratch i32 local indices the Phase-6 + /// vector opcodes draw from (`VectorSelect`'s action/count/reduce-index, + /// `Rank`'s ascending flag + runtime store address, `VectorElmMap`'s runtime + /// flat index). 
Reserved unconditionally by the function builders (a + /// non-vector function's unused i32 locals are free) and placed before the + /// dynamic-subscript [`extra_i32_local_base`], so it never disturbs the + /// `apply_locals` indices. + pub vector_i32_locals: [u32; VECTOR_I32_LOCAL_COUNT as usize], + /// Byte offset of slot 0 of the vector-op scratch region. `VectorSelect` + /// collects its selected expr values here (`size` f64 worst case); the + /// `stable_sort` helper (`VectorSortOrder`/`Rank`) sorts `(value, idx)` pairs + /// here (`2 * size` f64). The two uses are never live simultaneously within a + /// single opcode, so they share the region. Sized by `module.rs` to the + /// largest view a vector op could process; the test harness sets a fixed + /// high offset within its single memory page. + pub vector_scratch_base: u32, + /// Byte offset of slot 0 of the allocation scratch region. The Phase-6 + /// `AllocateAvailable`/`AllocateByPriority` arms stage the gathered request + /// values, the per-requester profile tuples (4 f64 each), and the output + /// allocations here before/after `call`ing the `allocate_available` helper. + /// The three sub-regions (`requests` (n) ++ `profiles` (4n) ++ `out` (n)) + /// are laid out consecutively and are all live across the helper call. + /// Sized by `module.rs` to `6 * max(temp_total_size, n_slots)` f64 (a + /// requester count is bounded by a view's element count); reserved + /// unconditionally (a model without allocators never reads it). The test + /// harness sets a fixed high offset within its single memory page. + pub alloc_scratch_base: u32, + /// First wasm local index of the `EvalModule` reverse-pop scratch f64s (Phase + /// 7). An `EvalModule { n_inputs }` pops its `n_inputs` operands into the first + /// `n_inputs` of these (in reverse, matching the VM's `for j in + /// (0..n_inputs).rev()`), then pushes `child_module_off` followed by them in + /// order before the child `call`. 
Sized by `module.rs` to the max `n_inputs` + /// over the program's `EvalModule` sites; 0 (and unused) for a program with no + /// submodule instantiation. See [`module_input_scratch_base`]. + pub module_input_scratch_base: u32, + /// Byte offset of slot 0 of the constants-override region (Phase 7 Task 2), + /// an `n_slots`-wide f64 region indexed by *absolute* slab offset and + /// initialized to the compiled-default literals at every overridable slot. A + /// redirected `AssignConstCurr { off }` (one whose `off` is in + /// [`flows_const_offsets`](Self::flows_const_offsets)) sources its value from + /// `const_region_base + (module_off + off) * 8` instead of an immediate + /// `f64.const`, so the exported `set_value` override takes effect every step + /// -- exactly as the VM mutating the bytecode literal does (`vm.rs:994-1008`). + /// Indexing by absolute slot (the same `module_off`-relative addressing the + /// slab uses) is what lets one shared `CompiledModule` running at several + /// `module_off`s pick up each instance's distinct override. + pub const_region_base: u32, + /// The set of *relative* offsets this instance's module assigns via an + /// `AssignConstCurr` in its flows phase -- i.e. the overridable constants of + /// this module (mirroring `collect_constant_info`'s flows-only overridability + /// rule, `vm.rs:436-450`, computed per module so it is compile-time even for a + /// shared module run at several offsets). An `AssignConstCurr { off }` in *any* + /// phase (initials/flows/stocks) whose `off` is in this set sources from the + /// constants region; one whose `off` is absent emits its immediate literal + /// unchanged. This matches the VM applying the override at every location of an + /// overridable offset (`collect_constant_info` collects flows + stocks + + /// initials locations for each flows-overridable offset). 
+ pub flows_const_offsets: &'a std::collections::HashSet, + /// Resolves an `EvalModule { id }` site to the child instance's wasm function + /// index for the program being emitted: `module_fn_index[(child_key, part)]`, + /// where `child_key = make_module_key(&ctx.modules[id].model_name, + /// &ctx.modules[id].input_set)` and `part == step_part`. Built once by + /// `module.rs` before any program function is emitted (the module + /// instantiation graph is acyclic, so every child index exists by the time its + /// caller is emitted). Empty for a single-module (no-submodule) program. + pub module_fn_index: &'a HashMap<(crate::vm::ModuleKey, StepPart), u32>, + /// The module instance's `ByteCodeContext`, holding the compile-time array + /// tables the view opcodes reference by index (`static_views`, `dim_lists`, + /// `dimensions`, `subdim_relations`, `temp_offsets`) *and* the `modules` + /// declaration table the `EvalModule` arm resolves child keys from. This is + /// the *per-instance* context (Phase 7): each instance's functions are emitted + /// with its own context, so an `EvalModule`'s `ctx.modules[id]` and the array + /// tables refer to the instance whose function is being lowered. + pub ctx: &'a ByteCodeContext, +} + +// Reserved global slots (absolute, module-independent), mirroring `crate::vm`. +// `Apply` reads `curr[TIME_OFF]` / `curr[DT_OFF]` for the time-driven builtins. +const TIME_OFF: u16 = 0; +const DT_OFF: u16 = 1; + +pub(crate) fn memarg(addr: u64) -> MemArg { + MemArg { + offset: addr, + align: F64_ALIGN, + memory_index: 0, + } +} + +/// `.into()` keeps this robust to whether `wasm-encoder`'s `F64Const` field is +/// a bare `f64` or an `Ieee64` wrapper across versions. 
pub(crate) fn f64_const(v: f64) -> Instruction<'static> {
    Instruction::F64Const(v.into())
}

// ============================================================================
// Emitted helper functions
// ============================================================================

/// Function indices of a module's emitted helper functions.
///
/// Helpers occupy the module's first function slots (`0..N`), so their indices
/// are fixed and known before any per-program function is emitted. This is what
/// lets a value-producing opcode in `emit_bytecode` reference a helper by index
/// (`call`). [`build_helpers`] both emits the bodies and assigns these indices,
/// keeping the index assignment and the body emission in one place.
///
/// To add a helper (Subcomponent B's transcendentals + `pulse`, later phases'
/// lookup/array/allocation helpers): add a field here and push its body in
/// [`build_helpers`], assigning the field from the pre-push `functions.len()`.
/// Do not hard-code a helper's index anywhere else.
#[derive(Clone, Copy)]
pub(crate) struct HelperFns {
    /// `approx_eq(a: f64, b: f64) -> i32` (1 = approximately equal, else 0),
    /// reproducing `crate::float::approx_eq` (`float_cmp` 0.10 defaults).
    pub approx_eq: u32,
    /// `mod_euclid(l: f64, r: f64) -> f64`, reproducing `f64::rem_euclid` (the
    /// VM's `Op2::Mod`): a result in `[0, |r|)`. A self-contained helper (rather
    /// than an inline sequence) because the euclidean remainder needs both
    /// operands live across several uses, exceeding the single assign-scratch
    /// local available to `emit_op2`.
    pub mod_euclid: u32,
    /// `pulse(time, dt, volume, first_pulse, interval) -> f64`, reproducing the
    /// VM's `pulse` (`vm.rs:3036`) including its `while` loop. A helper because
    /// of the loop (an inline expansion would need a wasm `loop`/`br_if` in the
    /// middle of `Apply`'s operand handling).
    pub pulse: u32,
    /// Open-coded transcendental helpers (`super::math`), each `(f64) -> f64`
    /// except [`pow`](Self::pow) which is `(f64, f64) -> f64`. The bodies are
    /// emitted in `super::math`; the composed ones (`tan`/`asin`/`acos`/
    /// `log10`/`pow`) `call` the leaf ones by the indices recorded here, so the
    /// leaves are pushed first in [`build_helpers`]. `pow` is consumed by
    /// `Op2::Exp`; the rest by the `Apply` arm.
    pub exp: u32,
    /// `(f64) -> f64` transcendental helper; see [`exp`](Self::exp) for the
    /// group contract.
    pub ln: u32,
    /// `(f64) -> f64` transcendental helper; see [`exp`](Self::exp).
    pub sin: u32,
    /// `(f64) -> f64` transcendental helper; see [`exp`](Self::exp).
    pub cos: u32,
    /// `(f64) -> f64` composed transcendental helper (it `call`s leaf helpers);
    /// see [`exp`](Self::exp).
    pub tan: u32,
    /// `(f64) -> f64` transcendental helper; see [`exp`](Self::exp).
    pub atan: u32,
    /// `(f64) -> f64` composed transcendental helper; see [`exp`](Self::exp).
    pub asin: u32,
    /// `(f64) -> f64` composed transcendental helper; see [`exp`](Self::exp).
    pub acos: u32,
    /// `(f64) -> f64` composed transcendental helper; see [`exp`](Self::exp).
    pub log10: u32,
    /// `(f64, f64) -> f64` composed helper, the only binary transcendental;
    /// consumed by `Op2::Exp`. See [`exp`](Self::exp).
    pub pow: u32,
    /// Graphical-function lookup helpers (`super::lookup`), each
    /// `(data_off: i32, count: i32, index: f64) -> f64`, reproducing the VM's
    /// `lookup`/`lookup_forward`/`lookup_backward` (`vm.rs:3055-3186`). The
    /// `Lookup` opcode (`emit_bytecode`) reads `(data_off, count)` from the GF
    /// directory and `call`s the mode's helper. [`lookup_interp`](Self::lookup_interp)
    /// `call`s [`approx_eq`](Self::approx_eq) for its at-knot exact-hit test, so
    /// `approx_eq` is pushed before it in [`build_helpers`].
    pub lookup_interp: u32,
    /// `(data_off: i32, count: i32, index: f64) -> f64` lookup helper
    /// reproducing the VM's `lookup_forward`; see
    /// [`lookup_interp`](Self::lookup_interp).
    pub lookup_forward: u32,
    /// `(data_off: i32, count: i32, index: f64) -> f64` lookup helper
    /// reproducing the VM's `lookup_backward`; see
    /// [`lookup_interp`](Self::lookup_interp).
    pub lookup_backward: u32,
    /// `stable_sort(pairs_ptr: i32, n: i32, ascending: i32) -> ()`
    /// (`super::vector`), an in-place stable comparison sort of `n` `(value: f64,
    /// idx: f64)` pairs by `value`, used by `VectorSortOrder`/`Rank`. A runtime
    /// loop (insertion sort) -- never unrolled -- because the element count is a
    /// runtime view size and an unrolled O(n^2) body would blow up. NaN
    /// comparisons sort as `Equal` (the comparison is a strict `f64.lt`/`f64.gt`,
    /// which is false for NaN), reproducing the VM's stable
    /// `sort_by(partial_cmp(..).unwrap_or(Equal))`.
    pub stable_sort: u32,
    /// Allocation helpers (`super::alloc`), porting `crate::alloc`
    /// bit-faithfully for the `AllocateAvailable`/`AllocateByPriority` opcodes:
    /// - `erfc_approx(z: f64) -> f64` (Abramowitz-Stegun 26.2.17; `call`s
    ///   [`exp`](Self::exp) for the `(-z*z).exp()` factor),
    /// - `normal_cdf(x: f64) -> f64` (`0.5 * erfc_approx(-x / SQRT_2)`; `call`s
    ///   [`erfc_approx`](Self::erfc_approx)),
    /// - `alloc_curve(p, request, ptype, ppriority, pwidth, pextra) -> f64`
    ///   (all six `ptype % 10` curve branches + the `ptype >= 10` floor flag;
    ///   `call`s [`normal_cdf`](Self::normal_cdf)/[`exp`](Self::exp)/
    ///   [`pow`](Self::pow)),
    /// - `allocate_available(requests_ptr: i32, n: i32, profiles_ptr: i32,
    ///   avail: f64, out_ptr: i32) -> ()` -- the bisection market-clearing solve
    ///   over scratch memory (a runtime loop; never unrolled), `call`s
    ///   [`alloc_curve`](Self::alloc_curve).
    ///
    /// Pushed after `exp`/`pow`/`erfc_approx`/`normal_cdf`/`alloc_curve` (in that
    /// dependency order) in [`build_helpers`], so each inter-helper `call`
    /// resolves to an already-recorded index.
    ///
    /// `erfc_approx`/`normal_cdf`/`alloc_curve` are only consumed *during*
    /// helper construction (each is passed to the next helper's emitter so its
    /// `call` resolves) and by the `#[cfg(test)]` parity harness; only
    /// `allocate_available` is `call`ed from an opcode arm. They are kept as
    /// named registry fields for discoverability and so the tests can target
    /// each helper by index, mirroring the rest of `HelperFns`.
    #[allow(dead_code)]
    pub erfc_approx: u32,
    /// `normal_cdf(x: f64) -> f64`; construction/test-only, see
    /// [`erfc_approx`](Self::erfc_approx).
    #[allow(dead_code)]
    pub normal_cdf: u32,
    /// `alloc_curve(p, request, ptype, ppriority, pwidth, pextra) -> f64`;
    /// construction/test-only, see [`erfc_approx`](Self::erfc_approx).
    #[allow(dead_code)]
    pub alloc_curve: u32,
    /// `allocate_available(requests_ptr: i32, n: i32, profiles_ptr: i32,
    /// avail: f64, out_ptr: i32) -> ()`; the one allocation helper `call`ed
    /// from an opcode arm. See [`erfc_approx`](Self::erfc_approx).
    pub allocate_available: u32,
}

/// One emitted helper function: its signature (so the assembler can register a
/// wasm type for it) and its body (the terminating `End` is included).
+pub(crate) struct HelperFn { + pub params: Vec, + pub results: Vec, + pub body: Function, +} + +/// The emitted helper functions plus the [`HelperFns`] index registry that +/// names them. `functions[i]` is the body for function index `i`. +pub(crate) struct BuiltHelpers { + pub fns: HelperFns, + pub functions: Vec, +} + +/// Emit every helper function a module needs, assigning each a stable function +/// index starting at 0. +/// +/// The returned [`HelperFns`] records the indices; the caller (`module.rs`) +/// places `functions` at module function indices `0..functions.len()` and emits +/// the per-program + `run` functions after them, threading [`BuiltHelpers::fns`] +/// into each [`EmitCtx`]. +pub(crate) fn build_helpers() -> BuiltHelpers { + let mut functions: Vec = Vec::new(); + + // Push a `(f64, ...) -> f64`-shaped helper and return its assigned index. + // The index is `functions.len()` *before* the push, so it stays valid no + // matter how many helpers precede it. Used for every transcendental. + let push_unary = |functions: &mut Vec, body: Function| -> u32 { + let idx = functions.len() as u32; + functions.push(HelperFn { + params: vec![ValType::F64], + results: vec![ValType::F64], + body, + }); + idx + }; + + let approx_eq = functions.len() as u32; + functions.push(HelperFn { + params: vec![ValType::F64, ValType::F64], + results: vec![ValType::I32], + body: emit_approx_eq(), + }); + + let mod_euclid = functions.len() as u32; + functions.push(HelperFn { + params: vec![ValType::F64, ValType::F64], + results: vec![ValType::F64], + body: emit_mod_euclid(), + }); + + let pulse = functions.len() as u32; + functions.push(HelperFn { + params: vec![ + ValType::F64, + ValType::F64, + ValType::F64, + ValType::F64, + ValType::F64, + ], + results: vec![ValType::F64], + body: emit_pulse(), + }); + + // Leaf transcendentals (no inter-helper calls). 
+ let exp = push_unary(&mut functions, super::math::emit_exp()); + let ln = push_unary(&mut functions, super::math::emit_ln()); + let sin = push_unary(&mut functions, super::math::emit_sin()); + let cos = push_unary(&mut functions, super::math::emit_cos()); + let atan = push_unary(&mut functions, super::math::emit_atan()); + + // Composed transcendentals, referencing the leaves by their recorded index. + let tan = push_unary(&mut functions, super::math::emit_tan(sin, cos)); + let asin = push_unary(&mut functions, super::math::emit_asin(atan)); + let acos = push_unary(&mut functions, super::math::emit_acos(asin)); + let log10 = push_unary(&mut functions, super::math::emit_log10(ln)); + + // `pow` is the only binary helper. + let pow = functions.len() as u32; + functions.push(HelperFn { + params: vec![ValType::F64, ValType::F64], + results: vec![ValType::F64], + body: super::math::emit_pow(exp, ln), + }); + + // GF lookup helpers, each `(data_off: i32, count: i32, index: f64) -> f64`. + // `lookup_interp` `call`s `approx_eq` (assigned above), so its body is built + // with that index. + let push_lookup = |functions: &mut Vec, body: Function| -> u32 { + let idx = functions.len() as u32; + functions.push(HelperFn { + params: vec![ValType::I32, ValType::I32, ValType::F64], + results: vec![ValType::F64], + body, + }); + idx + }; + let lookup_interp = push_lookup(&mut functions, super::lookup::emit_lookup_interp(approx_eq)); + let lookup_forward = push_lookup(&mut functions, super::lookup::emit_lookup_forward()); + let lookup_backward = push_lookup(&mut functions, super::lookup::emit_lookup_backward()); + + // `stable_sort(pairs_ptr: i32, n: i32, ascending: i32) -> ()` -- the runtime + // insertion sort backing `VectorSortOrder`/`Rank` (`super::vector`). 
+ let stable_sort = functions.len() as u32; + functions.push(HelperFn { + params: vec![ValType::I32, ValType::I32, ValType::I32], + results: vec![], + body: super::vector::emit_stable_sort(), + }); + + // Allocation helpers (`super::alloc`). Pushed in dependency order so each + // inter-helper `call` resolves to an already-recorded index: + // `erfc_approx` -> `exp`; `normal_cdf` -> `erfc_approx`; `alloc_curve` -> + // `normal_cdf`/`exp`/`pow`; `allocate_available` -> `alloc_curve`. + let erfc_approx = functions.len() as u32; + functions.push(HelperFn { + params: vec![ValType::F64], + results: vec![ValType::F64], + body: super::alloc::emit_erfc_approx(exp), + }); + let normal_cdf = functions.len() as u32; + functions.push(HelperFn { + params: vec![ValType::F64], + results: vec![ValType::F64], + body: super::alloc::emit_normal_cdf(erfc_approx), + }); + let alloc_curve = functions.len() as u32; + functions.push(HelperFn { + params: vec![ + ValType::F64, + ValType::F64, + ValType::F64, + ValType::F64, + ValType::F64, + ValType::F64, + ], + results: vec![ValType::F64], + body: super::alloc::emit_alloc_curve(normal_cdf, exp, pow), + }); + let allocate_available = functions.len() as u32; + functions.push(HelperFn { + params: vec![ + ValType::I32, + ValType::I32, + ValType::I32, + ValType::F64, + ValType::I32, + ], + results: vec![], + body: super::alloc::emit_allocate_available(alloc_curve), + }); + + BuiltHelpers { + fns: HelperFns { + approx_eq, + mod_euclid, + pulse, + exp, + ln, + sin, + cos, + tan, + atan, + asin, + acos, + log10, + pow, + lookup_interp, + lookup_forward, + lookup_backward, + stable_sort, + erfc_approx, + normal_cdf, + alloc_curve, + allocate_available, + }, + functions, + } +} + +// `approx_eq` helper local layout. Params 0/1 are `a`/`b`; the rest are declared +// i64 scratch locals. 
+const AE_A: u32 = 0; +const AE_B: u32 = 1; +const AE_BITS: u32 = 2; // scratch for one operand's raw bits +const AE_ORD_A: u32 = 3; // ordered(a) +const AE_ORD_B: u32 = 4; // ordered(b) +const AE_DIFF: u32 = 5; // ordered(a) - ordered(b) +const AE_ABS: u32 = 6; // abs(diff) before saturation +const AE_LOCAL_COUNT: u32 = 5; // declared i64 locals (indices 2..=6) + +/// Build the body of the `approx_eq(a: f64, b: f64) -> i32` helper, reproducing +/// `crate::float::approx_eq` (`float_cmp` 0.10, `f64`, default margin +/// `epsilon = f64::EPSILON`, `ulps = 4`) bit-faithfully. +/// +/// The Rust reference (`float_cmp` `eq.rs`) is the short-circuiting OR of three +/// total, trap-free checks (exact equality / ±inf, absolute-epsilon, ULP): +/// +/// ```text +/// a == b || f64abs(a - b) <= f64::EPSILON || saturating_abs(ulps(a, b)) <= 4 +/// ``` +/// +/// where `ulps(a, b) = ordered(a).wrapping_sub(ordered(b))` over `i64` and +/// `ordered(f) = { let bits = f.to_bits() as i64; if bits < 0 { !bits } else +/// { bits ^ i64::MIN } }` maps the sign-magnitude bit pattern to a monotonic +/// integer. Because all three checks are pure and total (no division, no traps), +/// evaluating them eagerly and OR-ing the i32 results is bit-identical to the +/// Rust short-circuit; the fast path is only a performance shortcut, not a +/// semantic difference. Notably this makes `approx_eq(NaN, NaN) == true` +/// (identical bits -> 0 ULPs) and keeps the finite `crate::float::NA` sentinel +/// distinct from ordinary values (its exponent is far from theirs). 
+fn emit_approx_eq() -> Function { + use Instruction as Ins; + let mut f = Function::new([(AE_LOCAL_COUNT, ValType::I64)]); + + // check 1: a == b -> i32 + f.instruction(&Ins::LocalGet(AE_A)); + f.instruction(&Ins::LocalGet(AE_B)); + f.instruction(&Ins::F64Eq); + + // check 2: f64.abs(a - b) <= f64::EPSILON -> i32 + f.instruction(&Ins::LocalGet(AE_A)); + f.instruction(&Ins::LocalGet(AE_B)); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::F64Abs); + f.instruction(&f64_const(f64::EPSILON)); + f.instruction(&Ins::F64Le); + + // check 3: saturating_abs(ordered(a) - ordered(b)) <= 4 -> i32. + emit_ordered_bits(&mut f, AE_A, AE_BITS); + f.instruction(&Ins::LocalSet(AE_ORD_A)); + emit_ordered_bits(&mut f, AE_B, AE_BITS); + f.instruction(&Ins::LocalSet(AE_ORD_B)); + + // diff = wrapping_sub(ordered_a, ordered_b) (i64.sub wraps) + f.instruction(&Ins::LocalGet(AE_ORD_A)); + f.instruction(&Ins::LocalGet(AE_ORD_B)); + f.instruction(&Ins::I64Sub); + f.instruction(&Ins::LocalSet(AE_DIFF)); + + // abs = if diff < 0 { 0 - diff } else { diff } (the wrapping negate; for + // diff == i64::MIN this stays negative, handled by the saturation below). + f.instruction(&Ins::I64Const(0)); + f.instruction(&Ins::LocalGet(AE_DIFF)); + f.instruction(&Ins::I64Sub); // 0 - diff + f.instruction(&Ins::LocalGet(AE_DIFF)); // [neg, diff] + f.instruction(&Ins::LocalGet(AE_DIFF)); + f.instruction(&Ins::I64Const(0)); + f.instruction(&Ins::I64LtS); // diff < 0 + f.instruction(&Ins::Select); // neg if diff<0 else diff + f.instruction(&Ins::LocalSet(AE_ABS)); + + // sat = if abs < 0 { i64::MAX } else { abs } (saturating_abs: the only abs + // still negative is the i64::MIN overflow, which saturates to i64::MAX). 
+ f.instruction(&Ins::I64Const(i64::MAX)); + f.instruction(&Ins::LocalGet(AE_ABS)); // [i64::MAX, abs] + f.instruction(&Ins::LocalGet(AE_ABS)); + f.instruction(&Ins::I64Const(0)); + f.instruction(&Ins::I64LtS); // abs < 0 + f.instruction(&Ins::Select); // i64::MAX if abs<0 else abs + + // sat <= 4 -> i32 + f.instruction(&Ins::I64Const(4)); + f.instruction(&Ins::I64LeS); + + // Combine the three i32 booleans: (check1 | check2 | check3). Stack holds + // [c1, c2, c3]; two i32.or reduce it to one result. + f.instruction(&Ins::I32Or); + f.instruction(&Ins::I32Or); + + f.instruction(&Ins::End); + f +} + +/// Append the wasm sequence that pushes `ordered(local)` onto the stack, where +/// `ordered(f) = { let bits = f.to_bits() as i64; if bits < 0 { !bits } else +/// { bits ^ i64::MIN } }` (float_cmp's sign-magnitude -> monotonic map). `bits` +/// is materialized once into `bits_local` (i64) and reused for the two branch +/// values and the sign test; `select` chooses between them. `i64::MIN` is the +/// `1 << 63` sign mask as a signed `i64`, and `!bits` is `bits ^ -1`. +fn emit_ordered_bits(f: &mut Function, src_local: u32, bits_local: u32) { + use Instruction as Ins; + f.instruction(&Ins::LocalGet(src_local)); + f.instruction(&Ins::I64ReinterpretF64); + f.instruction(&Ins::LocalSet(bits_local)); + // neg case: !bits = bits ^ -1 + f.instruction(&Ins::LocalGet(bits_local)); + f.instruction(&Ins::I64Const(-1)); + f.instruction(&Ins::I64Xor); + // pos case: bits ^ i64::MIN (flip the sign bit) + f.instruction(&Ins::LocalGet(bits_local)); + f.instruction(&Ins::I64Const(i64::MIN)); + f.instruction(&Ins::I64Xor); + // cond: bits < 0 (the sign bit is set) + f.instruction(&Ins::LocalGet(bits_local)); + f.instruction(&Ins::I64Const(0)); + f.instruction(&Ins::I64LtS); + // select(neg, pos, cond): neg if cond != 0 else pos + f.instruction(&Ins::Select); +} + +// `mod_euclid` helper local layout. Params 0/1 are `l`/`r`; local 2 is the +// truncated remainder `r0`. 
+const ME_L: u32 = 0; +const ME_R: u32 = 1; +const ME_R0: u32 = 2; + +/// Build the body of `mod_euclid(l: f64, r: f64) -> f64`, reproducing +/// `f64::rem_euclid` (the VM's `Op2::Mod`) exactly. +/// +/// `rem_euclid` is `let r0 = l % r; if r0 < 0 { r0 + r.abs() } else { r0 }`, +/// where the truncated remainder `l % r` is `l - r * (l / r).trunc()` (wasm has +/// no `f64.rem`, so it is computed from `f64.div`/`f64.trunc`/`f64.mul`/ +/// `f64.sub`). The branch is a `select`. The result lies in `[0, |r|)` for a +/// non-zero divisor; this trunc-then-adjust form is correct for negative +/// divisors too (where a `floor`-based form would not be). +fn emit_mod_euclid() -> Function { + use Instruction as Ins; + let mut f = Function::new([(1, ValType::F64)]); + + // r0 = l - r * trunc(l / r) + f.instruction(&Ins::LocalGet(ME_L)); + f.instruction(&Ins::LocalGet(ME_R)); + f.instruction(&Ins::LocalGet(ME_L)); + f.instruction(&Ins::LocalGet(ME_R)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::F64Trunc); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::LocalSet(ME_R0)); + + // select(r0 + |r|, r0, r0 < 0): the adjusted value when r0 is negative, + // else r0 unchanged. wasm `select` yields the deeper operand when the cond + // is true, so push `r0 + |r|` first. + f.instruction(&Ins::LocalGet(ME_R0)); + f.instruction(&Ins::LocalGet(ME_R)); + f.instruction(&Ins::F64Abs); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::LocalGet(ME_R0)); + f.instruction(&Ins::LocalGet(ME_R0)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::Select); + + f.instruction(&Ins::End); + f +} + +// `pulse` helper local layout. Params 0..4 are time/dt/volume/first_pulse/ +// interval; local 5 is the running `next_pulse`. 
+const PU_TIME: u32 = 0; +const PU_DT: u32 = 1; +const PU_VOLUME: u32 = 2; +const PU_FIRST: u32 = 3; +const PU_INTERVAL: u32 = 4; +const PU_NEXT: u32 = 5; + +/// Build the body of `pulse(time, dt, volume, first_pulse, interval) -> f64`, +/// reproducing the VM's `pulse` (`vm.rs:3036`) including its `while` loop. +/// +/// ```text +/// if time < first_pulse { return 0.0 } +/// next_pulse = first_pulse +/// loop { // while time >= next_pulse +/// if time < next_pulse { break } +/// if time < next_pulse + dt { return volume / dt } +/// if interval <= 0.0 { break } +/// next_pulse += interval +/// } +/// 0.0 +/// ``` +/// +/// The `while time >= next_pulse` head is realized as a `br $exit` when +/// `time < next_pulse`, inside a `block $exit { loop $top { ... br $top } }`. +fn emit_pulse() -> Function { + use Instruction as Ins; + use wasm_encoder::BlockType; + let mut f = Function::new([(1, ValType::F64)]); + + // if time < first_pulse { return 0.0 } + f.instruction(&Ins::LocalGet(PU_TIME)); + f.instruction(&Ins::LocalGet(PU_FIRST)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // next_pulse = first_pulse + f.instruction(&Ins::LocalGet(PU_FIRST)); + f.instruction(&Ins::LocalSet(PU_NEXT)); + + // block $exit { loop $top { ... 
} } + f.instruction(&Ins::Block(BlockType::Empty)); + f.instruction(&Ins::Loop(BlockType::Empty)); + + // while-head: if time < next_pulse { break } (br depth 1 -> $exit) + f.instruction(&Ins::LocalGet(PU_TIME)); + f.instruction(&Ins::LocalGet(PU_NEXT)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::BrIf(1)); + + // if time < next_pulse + dt { return volume / dt } + f.instruction(&Ins::LocalGet(PU_TIME)); + f.instruction(&Ins::LocalGet(PU_NEXT)); + f.instruction(&Ins::LocalGet(PU_DT)); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&Ins::LocalGet(PU_VOLUME)); + f.instruction(&Ins::LocalGet(PU_DT)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // else if interval <= 0.0 { break } (br depth 1 -> $exit) + f.instruction(&Ins::LocalGet(PU_INTERVAL)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Le); + f.instruction(&Ins::BrIf(1)); + + // else next_pulse += interval ; continue (br depth 0 -> $top) + f.instruction(&Ins::LocalGet(PU_NEXT)); + f.instruction(&Ins::LocalGet(PU_INTERVAL)); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::LocalSet(PU_NEXT)); + f.instruction(&Ins::Br(0)); + + f.instruction(&Ins::End); // end loop + f.instruction(&Ins::End); // end block + + // fell out of the loop -> 0.0 + f.instruction(&f64_const(0.0)); + + f.instruction(&Ins::End); // end function + f +} + +/// Push `call approx_eq` for two f64 operands already on the wasm stack +/// (`[a, b]`); leaves an i32 (1 = approximately equal) on the stack. Mirrors a +/// `crate::float::approx_eq(a, b)` call. +fn emit_call_approx_eq(ctx: &EmitCtx, f: &mut Function) { + f.instruction(&Instruction::Call(ctx.helpers.approx_eq)); +} + +/// Push the i32 truthiness of the f64 already on the wasm stack, reproducing the +/// VM's `is_truthy(n) = !approx_eq(n, 0.0)` (`vm.rs:89`): `approx_eq(n, 0.0)` +/// gives `is_false`, and `i32.eqz` negates it to `is_truthy`. 
+pub(crate) fn emit_is_truthy(ctx: &EmitCtx, f: &mut Function) { + f.instruction(&f64_const(0.0)); + emit_call_approx_eq(ctx, f); + f.instruction(&Instruction::I32Eqz); +} + +/// The maximum number of simultaneously-live `SetCond` condition registers a +/// program needs. +/// +/// `compiler::codegen` lowers an `Expr::If` by walking the *condition* sub-tree +/// to completion before emitting the pair's own `SetCond`/`If` +/// (`codegen.rs:1153-1159`: push `t`, push `f`, walk `cond`, then `SetCond`, +/// `If`). So even when a condition itself contains a nested `If`, the inner +/// pair is fully emitted before the outer `SetCond`, and the stream is +/// *sequential* -- `SetCond If SetCond If` -- never interleaved. With current +/// codegen the condition register therefore never needs to hold more than one +/// live value (this returns 1 for any model with a conditional). +/// +/// We still model the register as a LIFO stack and size it from the actual +/// opcode stream rather than hard-coding 1: it costs one cheap pass, it is +/// robust if codegen ever emits a genuinely interleaved pair, and it keeps the +/// emitter's `SetCond`-pushes-/`If`-pops logic symmetric. The depth is computed +/// here so the caller can reserve exactly that many wasm locals. +/// Number of dedicated scratch f64 locals the `Apply` opcode reserves +/// (`a`/`b`/`c`). +const APPLY_LOCAL_COUNT: u32 = 3; + +/// Number of dedicated scratch f64 locals the Phase-6 vector opcodes reserve. +/// `VectorSelect`'s single-pass reduction needs four running accumulators +/// (sum/product/min/max) plus one to hold the current value; the other vector +/// ops draw their f64 scratch from the same block. Reserved unconditionally -- +/// unused f64 locals in a non-vector function are free. +pub(crate) const VECTOR_F64_LOCAL_COUNT: u32 = 5; + +/// Number of dedicated scratch i32 locals the Phase-6 vector opcodes reserve. 
+/// `VectorSelect` uses one for the action selector, one for the selected-value +/// count, and one for its reduce loop index; `Rank` uses one for the +/// `ascending` flag and one for a runtime store address; `VectorElmMap` uses one +/// for the runtime flat source index. Three covers every Phase-6 vector op. +/// Reserved unconditionally -- unused i32 locals in a non-vector function are +/// free. +pub(crate) const VECTOR_I32_LOCAL_COUNT: u32 = 3; + +/// The first declared-local wasm index of an opcode-program function with +/// `n_inputs` f64 module-input parameters. Param 0 is `module_off`; params +/// `1..=n_inputs` are the module inputs (`LoadModuleInput { input }` reads param +/// `input + 1`); declared locals begin at `1 + n_inputs`. For the root (and every +/// Phase 1-6 single-module function) `n_inputs == 0`, so this is the historical +/// index 1 (the scratch f64). +fn first_local_index(n_inputs: u32) -> u32 { + 1 + n_inputs +} + +/// The local-declaration list for an opcode-program `Function` carrying +/// `cond_depth` condition locals, `extra_i32` dynamic-subscript scratch locals, +/// and `module_input_scratch` f64 locals for the `EvalModule` reverse-pop: one +/// scratch f64, then `cond_depth` i32 condition locals, then +/// [`APPLY_LOCAL_COUNT`] f64 `Apply` scratch locals, then +/// [`VECTOR_F64_LOCAL_COUNT`] f64 vector-op scratch locals, then +/// [`VECTOR_I32_LOCAL_COUNT`] i32 vector-op scratch locals, then `extra_i32` i32 +/// locals (Task 4 dynamic subscripts; 0 when the program has none), then +/// `module_input_scratch` f64 locals (Phase 7 `EvalModule`; 0 when the program +/// instantiates no submodule). +/// +/// Defined once (and consumed by both `module.rs`'s function builders and the +/// `#[cfg(test)]` harness) so the declared local *types and order* match the +/// indices [`apply_locals_for`], [`vector_f64_locals_for`], +/// [`vector_i32_locals_for`], [`extra_i32_local_base`], and +/// [`module_input_scratch_base`] hand out. 
The declared locals always start at +/// `1 + n_inputs` (past `module_off` and the f64 input params); the vector locals +/// sit at a *fixed* offset (independent of `extra_i32`) so the dynamic-subscript +/// extra i32s -- and the module-input scratch after them -- shift by a constant +/// and never disturb the `apply_locals` indices. +pub(crate) fn opcode_fn_locals( + cond_depth: usize, + extra_i32: u32, + module_input_scratch: u32, +) -> Vec<(u32, ValType)> { + vec![ + (1, ValType::F64), + (cond_depth as u32, ValType::I32), + (APPLY_LOCAL_COUNT, ValType::F64), + (VECTOR_F64_LOCAL_COUNT, ValType::F64), + (VECTOR_I32_LOCAL_COUNT, ValType::I32), + (extra_i32, ValType::I32), + (module_input_scratch, ValType::F64), + ] +} + +/// The [`VECTOR_F64_LOCAL_COUNT`] vector-op scratch f64 local indices for a +/// function with `n_inputs` module-input params and `cond_depth` condition +/// locals. They follow the declared scratch f64 (index `1 + n_inputs`), the +/// `cond_depth` i32 condition locals, and the [`APPLY_LOCAL_COUNT`] `Apply` f64s. +/// Threaded into [`EmitCtx::vector_f64_locals`]. +pub(crate) fn vector_f64_locals_for( + n_inputs: u32, + cond_depth: usize, +) -> [u32; VECTOR_F64_LOCAL_COUNT as usize] { + let base = first_local_index(n_inputs) + 1 + cond_depth as u32 + APPLY_LOCAL_COUNT; + [base, base + 1, base + 2, base + 3, base + 4] +} + +/// The [`VECTOR_I32_LOCAL_COUNT`] vector-op scratch i32 local indices for a +/// function with `n_inputs` module-input params and `cond_depth` condition +/// locals. They follow the [`VECTOR_F64_LOCAL_COUNT`] vector-op f64s. Threaded +/// into [`EmitCtx::vector_i32_locals`]. 
+pub(crate) fn vector_i32_locals_for( + n_inputs: u32, + cond_depth: usize, +) -> [u32; VECTOR_I32_LOCAL_COUNT as usize] { + let base = first_local_index(n_inputs) + + 1 + + cond_depth as u32 + + APPLY_LOCAL_COUNT + + VECTOR_F64_LOCAL_COUNT; + [base, base + 1, base + 2] +} + +/// First wasm local index of the `extra_i32` dynamic-subscript scratch locals for +/// a function with `n_inputs` module-input params and `cond_depth` condition +/// locals: past `module_off` + the `n_inputs` f64 input params, the scratch f64 +/// (index `1 + n_inputs`), the `cond_depth` i32 condition locals, the +/// [`APPLY_LOCAL_COUNT`] `Apply` f64s, the [`VECTOR_F64_LOCAL_COUNT`] vector-op +/// f64s, and the [`VECTOR_I32_LOCAL_COUNT`] vector-op i32s. Threaded into +/// [`EmitCtx::extra_i32_local_base`] so the dynamic-subscript local allocator +/// draws from exactly the declared range. +pub(crate) fn extra_i32_local_base(n_inputs: u32, cond_depth: usize) -> u32 { + first_local_index(n_inputs) + + 1 + + cond_depth as u32 + + APPLY_LOCAL_COUNT + + VECTOR_F64_LOCAL_COUNT + + VECTOR_I32_LOCAL_COUNT +} + +/// First wasm local index of the `module_input_scratch` f64 locals (Phase 7 +/// `EvalModule` reverse-pop) for a function with `n_inputs` module-input params, +/// `cond_depth` condition locals, and `extra_i32` dynamic-subscript i32 locals. +/// They follow the `extra_i32` block (the last i32 run), so this is +/// [`extra_i32_local_base`]`+ extra_i32`. The `EvalModule` arm pops a child's +/// inputs into the first `n` of these (where `n` is that call's `n_inputs`), +/// matching the VM's reverse pop into `module_inputs`. Threaded into +/// [`EmitCtx::module_input_scratch_base`]. +pub(crate) fn module_input_scratch_base(n_inputs: u32, cond_depth: usize, extra_i32: u32) -> u32 { + extra_i32_local_base(n_inputs, cond_depth) + extra_i32 +} + +/// The three `Apply` scratch f64 local indices `[a, b, c]` for a function with +/// `n_inputs` module-input params and `cond_depth` condition locals. 
They follow +/// `module_off` + the `n_inputs` f64 input params, the scratch f64 +/// (index `1 + n_inputs`), and the `cond_depth` i32 condition locals, so they +/// start at `1 + n_inputs + 1 + cond_depth`. Mirrors the declaration order in +/// [`opcode_fn_locals`]. +pub(crate) fn apply_locals_for(n_inputs: u32, cond_depth: usize) -> [u32; 3] { + let base = first_local_index(n_inputs) + 1 + cond_depth as u32; + [base, base + 1, base + 2] +} + +/// The wasm local index of the assign-scratch f64 for a function with `n_inputs` +/// module-input params: the first declared local (`1 + n_inputs`), past +/// `module_off` and the f64 input params. Threaded into [`EmitCtx::scratch_local`]. +pub(crate) fn scratch_local_for(n_inputs: u32) -> u32 { + first_local_index(n_inputs) +} + +/// The `cond_depth` condition-register local indices for a function with +/// `n_inputs` module-input params. They follow the scratch f64 (index +/// `1 + n_inputs`), so the first is `2 + n_inputs`. Threaded into +/// [`EmitCtx::condition_locals`]. +pub(crate) fn condition_locals_for(n_inputs: u32, cond_depth: usize) -> Vec { + let base = first_local_index(n_inputs) + 1; + (0..cond_depth as u32).map(|i| base + i).collect() +} + +pub(crate) fn max_condition_depth(bc: &ByteCode) -> usize { + let mut depth: usize = 0; + let mut max_depth: usize = 0; + for op in &bc.code { + match op { + Opcode::SetCond {} => { + depth += 1; + max_depth = max_depth.max(depth); + } + // `If` consumes the most-recently-set condition. Guard against an + // unbalanced program (which would indicate malformed bytecode) + // with a saturating decrement rather than an underflow panic. + Opcode::If {} => { + depth = depth.saturating_sub(1); + } + _ => {} + } + } + max_depth +} + +/// Push the dynamic part of a module-relative slot address: `module_off * 8`. 
+/// Combined with a constant `memarg.offset` of `chunk_base + off*8`, this +/// addresses `chunk_base + (module_off + off) * 8`, matching the VM's +/// `curr[module_off + off]` / `next[module_off + off]`. +pub(crate) fn push_module_relative_base(ctx: &EmitCtx, f: &mut Function) { + f.instruction(&Instruction::LocalGet(ctx.module_off_local)); + f.instruction(&Instruction::I32Const(SLOT_SIZE as i32)); + f.instruction(&Instruction::I32Mul); +} + +/// Byte offset of a slot within a chunk: `chunk_base + off*8`. +fn slot_byte_offset(chunk_base: u32, off: u16) -> u64 { + u64::from(chunk_base) + u64::from(off) * u64::from(SLOT_SIZE) +} + +/// Emit-time analogue of the VM's per-`eval_bytecode` mutable state +/// (`vm.rs:1277-1288`): the compile-time view stack, the iteration / broadcast +/// contexts, and the condition-register stack pointer. Threaded through +/// [`emit_ops`] so an unrolled iteration body can be re-emitted at each +/// compile-time index without re-deriving the view stack. +struct EmitState { + /// Emit-time stack pointer into `ctx.condition_locals`, mirroring the VM's + /// single `condition` register but generalized to nested `If`s. + cond_sp: usize, + /// Compile-time analogue of the VM's runtime `view_stack`: the `Push*View` / + /// `View*` opcodes push/transform/pop `ViewDesc`s here, and the reducers read + /// the top descriptor. Because every static view's geometry is known at + /// compile time, this never materializes anything at runtime -- element + /// addresses are folded into the emitted reads. + view_stack: Vec, + /// Active (unrolled) iteration contexts, one per nested `BeginIter`. The + /// `current` field is the compile-time iteration index the unroller is + /// emitting (Task 3). + iter_stack: Vec, + /// Active broadcast-iteration contexts (`BeginBroadcastIter`, Task 3). 
+ broadcast_stack: Vec, + /// The legacy scalar dynamic-subscript accumulator (`PushSubscriptIndex` / + /// `LoadSubscript`, Task 4), mirroring the VM's `subscript_index` + + /// `subscript_index_valid` (`vm.rs:1287-1288`). Cleared by each + /// `LoadSubscript`. + subscript: SubscriptAccum, + /// Bump cursor for the function's extra i32 locals (Task 4). A dynamic + /// subscript draws fresh i32 locals from here; the count is pre-sized by + /// [`count_extra_i32_locals`], so this never exceeds the declared count. + next_i32_local: u32, + /// Cumulative count of unrolled element-emit "units" for the function being + /// lowered, checked against [`MAX_UNROLL_UNITS`] (see [`EmitState::charge_unroll`]). + /// Every full unroll -- a reducer fold, a `BeginIter`/`BeginBroadcastIter` + /// body re-emission -- charges its iteration count here. Nested iterations + /// multiply naturally: an inner site is reached once per outer iteration, so + /// each inner charge already reflects the outer multiplier. When the running + /// total would exceed the cap, lowering aborts with `Unsupported` so the + /// model cleanly falls back to the VM instead of emitting a multi-megabyte + /// function body that a wasm engine would reject. + unroll_units: usize, +} + +/// Upper bound on the cumulative number of unrolled element-emit "units" per +/// wasm function (one reducer-fold element, or one `BeginIter`/`BeginBroadcastIter` +/// body re-emission, is one unit). +/// +/// Every array reducer and iteration loop is fully unrolled at compile time +/// (each element address becomes a wasm constant -- see [`emit_reduce_fold`] and +/// the `BeginIter`/`BeginBroadcastIter` arms). Without a bound, a large arrayed +/// model -- especially nested iterations whose counts multiply -- could emit a +/// function body exceeding what wasm engines accept (V8, for instance, caps a +/// single function near ~7.6 MB of bytecode; the spec's 4 GiB ceiling is +/// academic). 
At a generous ~50 bytes of emitted code per unit, this cap bounds +/// unroll-driven code at roughly 3 MB, comfortably under the strictest engine +/// limit. +/// +/// The value `65_536` (2^16) is the natural ceiling of a single `u16` array +/// dimension (`ViewDesc::dims` entries are `u16`, so one dimension tops out at +/// 65_535). Real system-dynamics arrays are tiny -- the test corpus's largest +/// single dimension is 9, and even a region x sector x cohort nest is on the +/// order of 10^3 elements -- so this leaves >60x headroom for legitimate models +/// while rejecting pathological products (e.g. a `[300, 300]` view, 90_000 +/// elements) before any code is emitted. +/// +/// future: a runtime wasm loop driven by a precomputed offset table (per the +/// Phase 5 design's non-contiguous path) would lift this cap entirely, trading a +/// constant-size loop body for the current fully-unrolled form. +const MAX_UNROLL_UNITS: usize = 65_536; + +impl EmitState { + /// Charge `units` against the per-function unroll budget, returning + /// `Unsupported` (so the model falls back to the VM) if the running total + /// would exceed [`MAX_UNROLL_UNITS`]. Called *before* an unroll site emits + /// its body, so an over-budget model is rejected without ever materializing + /// the oversized function. `units` saturates rather than wrapping, so a + /// pathological multi-dimensional product can never alias back under the cap. + fn charge_unroll(&mut self, units: usize) -> Result<(), WasmGenError> { + self.unroll_units = self.unroll_units.saturating_add(units); + if self.unroll_units > MAX_UNROLL_UNITS { + return Err(WasmGenError::Unsupported(format!( + "wasmgen: array unrolling exceeds the per-function budget of \ + {MAX_UNROLL_UNITS} elements (a large arrayed model); falling back to the VM" + ))); + } + Ok(()) + } +} + +/// The legacy scalar dynamic-subscript accumulator (Task 4). 
`PushSubscriptIndex` +/// appends a `(runtime_index_local, bounds)` and folds OOB into `valid_local`; +/// `LoadSubscript` collapses the indices into a flat offset and reads the slot +/// (or NaN). Mirrors the VM's `subscript_index` SmallVec + `subscript_index_valid` +/// flag (`vm.rs:1287-1288`, `1341-1366`). +#[derive(Default)] +struct SubscriptAccum { + /// `(runtime_index_local, bounds)` for each pushed index, in push order. The + /// local holds the *0-based* runtime index (i32); `bounds` is the dimension + /// size for the row-major fold. + indices: Vec<(u32, u16)>, + /// wasm i32 local that is 0 once any pushed index was out of bounds, else 1. + /// `None` until the first `PushSubscriptIndex` of an accumulation allocates + /// it (then seeded to 1). + valid_local: Option, +} + +/// One active iteration context for the unrolled `BeginIter` loop (Task 3). +struct IterCtx { + /// The view captured as the iteration source/geometry at `BeginIter` + /// (`view_stack.last()` then). + iter_view: ViewDesc, + /// Destination temp id for `StoreIterElement`, when `has_write_temp`. + write_temp_id: Option, + /// The compile-time iteration index currently being emitted (the unroller + /// re-emits the body once per `0..size`). + current: usize, +} + +/// One active broadcast-iteration context (`BeginBroadcastIter`, Task 3), +/// mirroring the VM's `BroadcastState` (`vm.rs:68-81`) but with the result +/// geometry + per-source dim maps resolved at compile time. +struct BroadcastCtx { + /// Per source (deepest-first): the source view and its `dim_map` (one entry + /// per result dimension; `Some(src_dim)` or `None` for a broadcast axis). + sources: Vec<(ViewDesc, Vec>)>, + /// Destination temp id for `StoreBroadcastElement`. + dest_temp_id: u8, + /// Result dimension sizes (the union of all sources' dims, first-encounter + /// order), used to decompose `current` into per-result-dim indices. + result_dims: Vec, + /// The compile-time result index currently being emitted. 
+ current: usize, +} + +/// Lower one opcode program. Value-producing opcodes leave their f64 result on +/// the wasm operand stack; the assignment opcodes emit a store and leave the +/// stack empty, exactly as the VM's stack-machine arms do. `Ret` is a no-op +/// here: the wasm function's terminating `End` is emitted by the caller. +pub(crate) fn emit_bytecode( + bc: &ByteCode, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let mut state = EmitState { + cond_sp: 0, + view_stack: Vec::new(), + iter_stack: Vec::new(), + broadcast_stack: Vec::new(), + subscript: SubscriptAccum::default(), + next_i32_local: ctx.extra_i32_local_base, + unroll_units: 0, + }; + emit_ops(&bc.code, &bc.literals, ctx, &mut state, f) +} + +/// An upper bound on the extra i32 wasm locals a program's dynamic subscripts +/// need (Task 4), so the function-builder can reserve them past the scratch / +/// condition / `Apply` locals. +/// +/// Each `ViewSubscriptDynamic` draws at most two fresh locals (a runtime-offset +/// addend + a validity flag, allocated once per dynamically-subscripted view); +/// each `PushSubscriptIndex` draws at most two (a 0-based index local + the +/// shared validity flag of its accumulation). Counting two per opcode is a +/// generous bound -- a real accumulation reuses one view's pair across several +/// subscripts and one validity flag across several pushed indices -- but +/// over-reserving unused i32 locals is free, and the bound keeps the reservation +/// a single cheap pass with no dataflow. +pub(crate) fn count_extra_i32_locals(bc: &ByteCode) -> u32 { + bc.code + .iter() + .filter(|op| { + matches!( + op, + Opcode::ViewSubscriptDynamic { .. } | Opcode::PushSubscriptIndex { .. } + ) + }) + .count() as u32 + * 2 +} + +/// The number of scratch f64 wasm locals a program needs for the `EvalModule` +/// reverse-pop (Phase 7): the maximum `n_inputs` over its `EvalModule` sites. 
+/// +/// Each `EvalModule { n_inputs }` pops its `n_inputs` operands into scratch f64 +/// locals (in reverse, matching the VM) before pushing `child_module_off` and +/// re-pushing them in order. Because the sites are emitted sequentially (each +/// fully consumes its scratch before the next runs), reserving the *max* per-site +/// count -- not the sum -- suffices, and successive sites reuse the same locals. +/// Returns 0 for a program that instantiates no submodule. +pub(crate) fn count_module_input_scratch(bc: &ByteCode) -> u32 { + bc.code + .iter() + .filter_map(|op| match op { + Opcode::EvalModule { n_inputs, .. } => Some(u32::from(*n_inputs)), + _ => None, + }) + .max() + .unwrap_or(0) +} + +/// Lower a (sub-)slice of opcodes, threading the emit-time [`EmitState`]. The +/// top-level program is one call over the whole `code`; an unrolled `BeginIter` +/// loop body (Task 3) re-enters here over the body sub-slice once per iteration +/// index. A `pc`-based loop (rather than `for`) lets the iteration arms consume +/// their structured `BeginIter..NextIterOrJump..EndIter` span and re-emit the +/// body, mirroring the VM's `pc` loop without needing the `jump_back` delta. +/// +/// `literals` is the program's shared literal pool (`LoadConstant` / +/// `AssignConstCurr` index it); it is the same across every body re-emission. 
+fn emit_ops( + code: &[Opcode], + literals: &[f64], + ctx: &EmitCtx, + state: &mut EmitState, + f: &mut Function, +) -> Result<(), WasmGenError> { + let mut pc = 0usize; + while pc < code.len() { + let op = &code[pc]; + match op { + Opcode::LoadConstant { id } => { + let v = *literals.get(*id as usize).ok_or_else(|| { + WasmGenError::Unsupported(format!( + "wasmgen: LoadConstant literal id {id} out of range" + )) + })?; + f.instruction(&f64_const(v)); + } + Opcode::LoadVar { off } => { + push_module_relative_base(ctx, f); + f.instruction(&Instruction::F64Load(memarg(slot_byte_offset( + ctx.curr_base, + *off, + )))); + } + Opcode::LoadGlobalVar { off } => { + // Absolute slot: ignore module_off (slots 0..4 are global). + f.instruction(&Instruction::I32Const(0)); + f.instruction(&Instruction::F64Load(memarg(slot_byte_offset( + ctx.curr_base, + *off, + )))); + } + Opcode::Op2 { op } => emit_op2(*op, ctx, f)?, + Opcode::Not {} => { + // The VM's `Not` is `(!is_truthy(r)) as f64`, which simplifies to + // `approx_eq(r, 0.0) as f64` (since `is_truthy = !approx_eq(·,0.0)`, + // the double negation cancels). So push `approx_eq(r, 0.0)` and + // widen the i32 1/0 to f64. + f.instruction(&f64_const(0.0)); + emit_call_approx_eq(ctx, f); + f.instruction(&Instruction::F64ConvertI32U); + } + Opcode::SetCond {} => { + let local = *ctx.condition_locals.get(state.cond_sp).ok_or_else(|| { + WasmGenError::Unsupported( + "wasmgen: SetCond nesting exceeded reserved condition locals".to_string(), + ) + })?; + // Reduce the f64 condition to i32 truthiness, routing through + // `approx_eq` so a near-zero / ULP-adjacent condition takes the + // same branch the VM's `is_truthy(pop)` takes. 
+ emit_is_truthy(ctx, f); + f.instruction(&Instruction::LocalSet(local)); + state.cond_sp += 1; + } + Opcode::If {} => { + if state.cond_sp == 0 { + return Err(WasmGenError::Unsupported( + "wasmgen: If without a preceding SetCond".to_string(), + )); + } + state.cond_sp -= 1; + let local = ctx.condition_locals[state.cond_sp]; + // Stack holds [t, f] (the VM pops f then t and yields + // `if condition { t } else { f }`); wasm `select` pops + // [t, f, cond_i32] and yields t when cond != 0 else f -- exact. + f.instruction(&Instruction::LocalGet(local)); + f.instruction(&Instruction::Select); + } + Opcode::AssignCurr { off } => { + emit_assign(ctx.curr_base, *off, ctx, f); + } + Opcode::AssignNext { off } => { + emit_assign(ctx.next_base, *off, ctx, f); + } + // `AssignConstCurr` reaches a `CompiledSimulation` by two routes + // (see the module docstring): `compiler::codegen` emits it directly + // for any constant-RHS `AssignCurr` (`codegen.rs:1164`), and the + // peephole pass also fuses `LoadConstant; AssignCurr` into it + // (`bytecode.rs:1830`). It is *not* a late-3-address fusion artifact, + // so it is part of the scalar core, not an Unsupported case. Every + // model with a constant initial/aux carries it. Mirrors the VM's + // `curr[module_off + off] = literals[literal_id]` (`vm.rs:1453`). + Opcode::AssignConstCurr { off, literal_id } => { + let v = *literals.get(*literal_id as usize).ok_or_else(|| { + WasmGenError::Unsupported(format!( + "wasmgen: AssignConstCurr literal id {literal_id} out of range" + )) + })?; + // Nothing is on the stack; push the store address then the value + // (f64.store wants [addr_i32, value_f64]). 
+ push_module_relative_base(ctx, f); + // Phase 7 Task 2: an overridable constant sources its value from + // the constants-override region (initialized to `v`, mutable via + // the exported `set_value`) instead of the immediate literal, so + // an override takes effect on every assignment -- exactly as the + // VM rewrites the bytecode literal. The const region is indexed by + // absolute slot, so the read uses the same `module_off`-relative + // addressing the slab does (`const_region_base + (module_off + + // off) * 8`); a shared module run at several `module_off`s thus + // picks up each instance's distinct override. A non-overridable + // constant emits its literal unchanged. + if ctx.flows_const_offsets.contains(off) { + f.instruction(&Instruction::LocalGet(ctx.module_off_local)); + f.instruction(&Instruction::I32Const(SLOT_SIZE as i32)); + f.instruction(&Instruction::I32Mul); + f.instruction(&Instruction::F64Load(memarg(slot_byte_offset( + ctx.const_region_base, + *off, + )))); + } else { + f.instruction(&f64_const(v)); + } + f.instruction(&Instruction::F64Store(memarg(slot_byte_offset( + ctx.curr_base, + *off, + )))); + } + // Peephole fusions of `Op2; Assign{Curr,Next}`. Operands `[l, r]` + // are on the stack; apply the op (which errors cleanly on an + // unsupported operator) then store the result. Mirrors the VM's + // `curr/next[module_off + off] = eval_op2(op, l, r)` (`vm.rs:1457`, + // `vm.rs:1463`). + Opcode::BinOpAssignCurr { op, off } => { + emit_op2(*op, ctx, f)?; + emit_assign(ctx.curr_base, *off, ctx, f); + } + Opcode::BinOpAssignNext { op, off } => { + emit_op2(*op, ctx, f)?; + emit_assign(ctx.next_base, *off, ctx, f); + } + // `Apply` always pops exactly three operands (codegen pads short + // builtins with `LoadConstant 0.0` / `LoadGlobalVar{FINAL_TIME}`), + // mirroring the VM (`vm.rs:1701`). See [`emit_apply`]. 
+ Opcode::Apply { func } => emit_apply(*func, ctx, f), + // `Lookup` pops `index` then `element_offset`, bounds-checks the + // offset, and dispatches to the mode's helper over the table at + // `base_gf + element_offset` (`vm.rs:1710`). See [`emit_lookup`]. + Opcode::Lookup { + base_gf, + table_count, + mode, + } => emit_lookup(*base_gf, *table_count, *mode, ctx, f), + // `LoadPrev` mirrors the VM (`vm.rs:1320-1328`): a fallback is + // already on the stack (codegen pushes it just before this opcode); + // yield it while `use_prev_fallback` is set, otherwise read + // `prev_values[module_off + off]`. The gate is the global flag, never + // a TIME comparison (RK moves TIME to trial points). + Opcode::LoadPrev { off } => emit_load_prev(*off, ctx, f), + // `LoadInitial` mirrors the VM (`vm.rs:1332-1340`), but its + // `part == Initials` branch is resolved at compile time from + // `ctx.step_part`: in the initials program read `curr[module_off+off]` + // (the value being computed); elsewhere read the post-initials + // `initial_values[module_off+off]` snapshot. + Opcode::LoadInitial { off } => emit_load_initial(*off, ctx, f), + + // ── View-stack construction (Phase 5 Task 1) ────────────────── + // Each opcode pushes/transforms a compile-time `ViewDesc`, mirroring + // the VM's `view_stack` arms (`vm.rs:1739-1855`). No wasm is emitted: + // the geometry is folded into later element reads. + Opcode::PushStaticView { view_id } => { + let view = ctx.ctx.get_static_view(*view_id).ok_or_else(|| { + WasmGenError::Unsupported(format!( + "wasmgen: PushStaticView view_id {view_id} out of range" + )) + })?; + state.view_stack.push(ViewDesc::from_static(view)); + } + // `PushVarView` builds a full contiguous view over a variable array; + // the VM folds `module_off` into the base (`vm.rs:1749`), so the base + // is module-relative. 
+ Opcode::PushVarView { + base_off, + dim_list_id, + } => { + let (dims, dim_ids) = resolve_dim_list_dims(ctx, *dim_list_id)?; + state.view_stack.push(ViewDesc::contiguous( + u32::from(*base_off), + ViewBase::CurrModuleRelative, + dims, + dim_ids, + )); + } + // `PushTempView` builds a full contiguous view over a temp array + // (`vm.rs:1757`). + Opcode::PushTempView { + temp_id, + dim_list_id, + } => { + let (dims, dim_ids) = resolve_dim_list_dims(ctx, *dim_list_id)?; + state.view_stack.push(ViewDesc::contiguous( + u32::from(*temp_id), + ViewBase::Temp, + dims, + dim_ids, + )); + } + // `PushVarViewDirect` builds a contiguous view from raw dim sizes + // (dim_ids all 0), the base for a dynamic subscript (`vm.rs:1776`). + // Module-relative, like `PushVarView`. + Opcode::PushVarViewDirect { + base_off, + dim_list_id, + } => { + let dims = resolve_dim_list_raw(ctx, *dim_list_id)?; + let n = dims.len(); + state.view_stack.push(ViewDesc::contiguous( + u32::from(*base_off), + ViewBase::CurrModuleRelative, + dims, + vec![0u16; n], + )); + } + + // ── View-stack transforms (Phase 5 Task 1) ──────────────────── + Opcode::ViewSubscriptConst { dim_idx, index } => { + view_top_mut(&mut state.view_stack)? 
+ .apply_single_subscript(*dim_idx as usize, *index); + } + Opcode::ViewRange { + dim_idx, + start, + end, + } => { + view_top_mut(&mut state.view_stack)?.apply_range(*dim_idx as usize, *start, *end); + } + Opcode::ViewStarRange { + dim_idx, + subdim_relation_id, + } => { + let rel = ctx + .ctx + .subdim_relations + .get(*subdim_relation_id as usize) + .ok_or_else(|| { + WasmGenError::Unsupported(format!( + "wasmgen: ViewStarRange subdim_relation_id {subdim_relation_id} \ + out of range" + )) + })?; + let parent_offsets = rel.parent_offsets.to_vec(); + let child_dim_id = rel.child_dim_id; + view_top_mut(&mut state.view_stack)?.apply_sparse( + *dim_idx as usize, + parent_offsets, + child_dim_id, + ); + } + // `ViewWildcard` is a no-op in the VM (`vm.rs:1839`): the dimension + // stays as-is. + Opcode::ViewWildcard { dim_idx: _ } => {} + Opcode::ViewTranspose {} => { + view_top_mut(&mut state.view_stack)?.transpose(); + } + Opcode::PopView {} => { + state.view_stack.pop().ok_or_else(|| { + WasmGenError::Unsupported("wasmgen: PopView on empty view stack".to_string()) + })?; + } + Opcode::DupView {} => { + let top = view_top(&state.view_stack)?.clone(); + state.view_stack.push(top); + } + + // ── Dynamic view subscript (Phase 5 Task 4) ─────────────────── + // `ViewSubscriptDynamic` pops a 1-based runtime index, bounds-checks + // it against the top view's `dims[dim_idx]`, and folds + // `(index-1)*strides[dim_idx]` into the descriptor's runtime offset + // local; OOB sets the validity flag to 0 so later reads yield NaN. + // Mirrors `RuntimeView::apply_single_subscript_checked` (`vm.rs:1797`, + // `bytecode.rs:242`). + Opcode::ViewSubscriptDynamic { dim_idx } => { + emit_view_subscript_dynamic(*dim_idx as usize, ctx, state, f)?; + } + // `ViewRangeDynamic` (`vm.rs:1815`) clamps a runtime `[start:end]` + // range, which yields a runtime *size*. 
The unrolled element + // addressing here folds every address at compile time, so a runtime + // range cannot be expressed; returning `Unsupported` keeps such a + // model `Skipped`. A literal range is constant-folded by codegen into + // the static `ViewRange` arm, so this is only reached by a true + // runtime range. + Opcode::ViewRangeDynamic { dim_idx } => { + return Err(WasmGenError::Unsupported(format!( + "wasmgen: ViewRangeDynamic (dim {dim_idx}) needs a runtime view size; \ + not supported" + ))); + } + + // ── Legacy scalar dynamic subscript (Phase 5 Task 4) ────────── + // `PushSubscriptIndex` pops a 1-based index, range-checks it against + // `bounds`, and accumulates the 0-based runtime index; OOB clears the + // accumulation's validity flag. `LoadSubscript` folds the accumulated + // indices into a flat offset and reads `curr[module_off+off+flat]` + // (NaN when invalid). Mirrors `vm.rs:1341-1366`. + Opcode::PushSubscriptIndex { bounds } => { + emit_push_subscript_index(*bounds, state, f); + } + Opcode::LoadSubscript { off } => { + emit_load_subscript(*off, ctx, state, f); + } + + // ── Temp element reads (Phase 5 Task 1) ─────────────────────── + // `temp_storage[temp_offsets[temp_id] + index]` (`vm.rs:1860`). + Opcode::LoadTempConst { temp_id, index } => { + let addr = temp_element_byte_addr(ctx, *temp_id, u32::from(*index))?; + f.instruction(&Instruction::I32Const(0)); + f.instruction(&Instruction::F64Load(memarg(addr))); + } + // `temp_storage[temp_offsets[temp_id] + index]` with a runtime index + // (`vm.rs:1866`): the VM does `stack.pop().floor() as usize`. + Opcode::LoadTempDynamic { temp_id } => { + emit_load_temp_dynamic(ctx, *temp_id, f)?; + } + + // ── Array reducers (Phase 5 Task 2) ─────────────────────────── + // Reduce over the TOP view descriptor (the production pattern is + // `PushStaticView; Array; PopView`, so the descriptor stays + // for the trailing `PopView`). 
+ Opcode::ArraySum {} + | Opcode::ArrayMax {} + | Opcode::ArrayMin {} + | Opcode::ArrayMean {} + | Opcode::ArrayStddev {} + | Opcode::ArraySize {} => { + let view = view_top(&state.view_stack)?.clone(); + // `ArraySize` emits no element reads (just `size() as f64`), so it + // is free; every other reducer unrolls a fold over `size()` + // elements, and `ArrayStddev` makes two passes (sum, then squared + // deviations). Charge that many units before emitting the fold. + if !matches!(op, Opcode::ArraySize {}) { + let passes = if matches!(op, Opcode::ArrayStddev {}) { + 2 + } else { + 1 + }; + state.charge_unroll(view.size().saturating_mul(passes))?; + } + emit_array_reduce(op, &view, ctx, f)?; + } + + // ── Body element reads inside an unrolled iteration (Task 3) ─── + // Each reads view element `current` (the compile-time iteration index + // the unroller set on the active iter context) and pushes the f64. + Opcode::LoadIterElement {} => { + let iter = state.iter_stack.last().ok_or_else(|| { + WasmGenError::Unsupported( + "wasmgen: LoadIterElement outside an iteration".to_string(), + ) + })?; + // The iteration view is also the source: read element `current`. + let view = iter.iter_view.clone(); + let current = iter.current; + emit_view_element_load(&view, current, ctx, f)?; + } + // `temp_storage[temp_offsets[temp_id] + current]` (`vm.rs:1939`). + Opcode::LoadIterTempElement { temp_id } => { + let current = current_iter_index(state)?; + let addr = temp_element_byte_addr(ctx, *temp_id, current as u32)?; + f.instruction(&Instruction::I32Const(0)); + f.instruction(&Instruction::F64Load(memarg(addr))); + } + // Read `view_stack.last()` at `current`, broadcasting against the + // iteration view (`vm.rs:1946`). `LoadIterViewAt{offset}` reads + // `view_stack[len-offset]` instead (`vm.rs:2068`). 
+ Opcode::LoadIterViewTop {} => { + emit_load_iter_view(state, 1, ctx, f)?; + } + Opcode::LoadIterViewAt { offset } => { + emit_load_iter_view(state, *offset as usize, ctx, f)?; + } + // Store the popped value into `temp_storage[temp_offsets[write_temp] + // + current]` (`vm.rs:2184`). + Opcode::StoreIterElement {} => { + let iter = state.iter_stack.last().ok_or_else(|| { + WasmGenError::Unsupported( + "wasmgen: StoreIterElement outside an iteration".to_string(), + ) + })?; + let write_temp_id = iter.write_temp_id.ok_or_else(|| { + WasmGenError::Unsupported( + "wasmgen: StoreIterElement without a write temp".to_string(), + ) + })?; + let current = iter.current; + emit_store_iter_element(ctx, write_temp_id, current, f)?; + } + + // ── Iteration loop (Task 3): unroll BeginIter..EndIter ──────── + // The body span between `BeginIter` and its `NextIterOrJump` is + // structured (codegen.rs:1183-1378) and well-nested, so rather than a + // runtime wasm loop with the `jump_back` PC delta, the body is fully + // unrolled over the compile-time `size()` -- every element address is + // then a compile-time constant via `emit_view_element_load`, matching + // the array reducer's unrolled fold (Task 2) and the VM element-for- + // element. The captured iter view is `view_stack.last()` at `BeginIter` + // (`vm.rs:1880`). + Opcode::BeginIter { + write_temp_id, + has_write_temp, + } => { + let iter_view = view_top(&state.view_stack)?.clone(); + let write_temp_id = if *has_write_temp { + Some(*write_temp_id) + } else { + None + }; + let size = iter_view.size(); + let (body, end_pc) = iter_span(code, pc, IterKind::Iter)?; + // Re-emitting the body once per element is `size` units of + // unrolling; charge it before the loop so an over-budget model is + // rejected without materializing the oversized body. Nested + // iterations multiply naturally: this arm is reached once per + // outer iteration, so each inner charge already carries the outer + // multiplier. 
+ state.charge_unroll(size)?; + for current in 0..size { + state.iter_stack.push(IterCtx { + iter_view: iter_view.clone(), + write_temp_id, + current, + }); + emit_ops(body, literals, ctx, state, f)?; + state.iter_stack.pop(); + } + pc = end_pc; + continue; + } + // `NextIterOrJump`/`EndIter` are consumed by the `BeginIter` unroll + // (the body slice excludes the `NextIterOrJump`, and `pc` is advanced + // past `EndIter`), so reaching one here means malformed bytecode. + Opcode::NextIterOrJump { .. } | Opcode::EndIter {} => { + return Err(WasmGenError::Unsupported( + "wasmgen: NextIterOrJump/EndIter without a matching BeginIter".to_string(), + )); + } + + // ── Broadcast iteration (Task 3): unroll over the union geometry ── + // `BeginBroadcastIter` unions the `n_sources` views' dim_ids into the + // result geometry, building a per-source dim map (`vm.rs:2314`); the + // body is then unrolled over the result size, mirroring + // `LoadBroadcastElement` / `StoreBroadcastElement`. + Opcode::BeginBroadcastIter { + n_sources, + dest_temp_id, + } => { + let bctx = build_broadcast_ctx(state, *n_sources as usize, *dest_temp_id)?; + let size: usize = bctx.result_dims.iter().map(|&d| d as usize).product(); + let (body, end_pc) = iter_span(code, pc, IterKind::Broadcast)?; + // Same unroll accounting as `BeginIter`: the body is re-emitted + // once per element of the broadcast result geometry. 
+ state.charge_unroll(size)?; + for current in 0..size { + state.broadcast_stack.push(BroadcastCtx { + sources: bctx.sources.clone(), + dest_temp_id: bctx.dest_temp_id, + result_dims: bctx.result_dims.clone(), + current, + }); + emit_ops(body, literals, ctx, state, f)?; + state.broadcast_stack.pop(); + } + pc = end_pc; + continue; + } + Opcode::LoadBroadcastElement { source_idx } => { + emit_load_broadcast_element(state, *source_idx as usize, ctx, f)?; + } + Opcode::StoreBroadcastElement {} => { + let bc_ctx = state.broadcast_stack.last().ok_or_else(|| { + WasmGenError::Unsupported( + "wasmgen: StoreBroadcastElement outside a broadcast iteration".to_string(), + ) + })?; + let dest_temp_id = bc_ctx.dest_temp_id; + let current = bc_ctx.current; + emit_store_iter_element(ctx, dest_temp_id, current, f)?; + } + Opcode::NextBroadcastOrJump { .. } | Opcode::EndBroadcastIter {} => { + return Err(WasmGenError::Unsupported( + "wasmgen: NextBroadcastOrJump/EndBroadcastIter without a matching \ + BeginBroadcastIter" + .to_string(), + )); + } + + // ── Vector operations (Phase 6) ─────────────────────────────── + // Each reads its inputs from the compile-time view stack (top = + // last) + the operand stack and writes its result array to its + // `write_temp_id` temp region -- except `VectorSelect`, which + // reduces to ONE scalar pushed on the stack. The view-stack reads + + // unroll-budget charging happen here (where `EmitState` lives); the + // wasm emission lives in `super::vector`, mirroring the matching VM + // arm element-for-element. The reducers leave the view descriptor on + // the stack for the trailing `PopView`, exactly like the Task-2 + // reducers (the production pattern is `Push*View; ; PopView`). + Opcode::VectorSelect {} => { + // expr_view = top, sel_view = top-1 (vm.rs:2448-2449). 
+ let n = state.view_stack.len(); + if n < 2 { + return Err(WasmGenError::Unsupported( + "wasmgen: VectorSelect needs two views on the stack".to_string(), + )); + } + let expr_view = state.view_stack[n - 1].clone(); + let sel_view = state.view_stack[n - 2].clone(); + // The gather unrolls over `min(sel, expr)` elements. + let size = sel_view.size().min(expr_view.size()); + state.charge_unroll(size)?; + super::vector::emit_vector_select(&sel_view, &expr_view, ctx, f)?; + } + Opcode::VectorElmMap { + write_temp_id, + full_source_len, + } => { + // offset_view = top, source_view = top-1 (vm_vector_elm_map.rs). + let n = state.view_stack.len(); + if n < 2 { + return Err(WasmGenError::Unsupported( + "wasmgen: VectorElmMap needs two views on the stack".to_string(), + )); + } + let offset_view = state.view_stack[n - 1].clone(); + let source_view = state.view_stack[n - 2].clone(); + // The per-element map unrolls over the offset view's size. + state.charge_unroll(offset_view.size())?; + super::vector::emit_vector_elm_map( + &source_view, + &offset_view, + *write_temp_id, + *full_source_len, + ctx, + f, + )?; + } + Opcode::VectorSortOrder { write_temp_id } => { + let input_view = view_top(&state.view_stack)?.clone(); + // Gather + scatter both unroll over `size`; the sort itself is a + // runtime loop in the `stable_sort` helper. 
+ state.charge_unroll(input_view.size())?; + super::vector::emit_vector_sort_order(&input_view, *write_temp_id, ctx, f)?; + } + Opcode::Rank { write_temp_id } => { + let input_view = view_top(&state.view_stack)?.clone(); + state.charge_unroll(input_view.size())?; + super::vector::emit_rank(&input_view, *write_temp_id, ctx, f)?; + } + Opcode::LookupArray { + base_gf, + table_count, + mode, + write_temp_id, + } => { + let input_view = view_top(&state.view_stack)?.clone(); + state.charge_unroll(input_view.size())?; + super::vector::emit_lookup_array( + &input_view, + *base_gf, + *table_count, + *mode, + *write_temp_id, + ctx, + f, + )?; + } + Opcode::AllocateAvailable { write_temp_id } => { + // profile_view = top, requests_view = top-1 (vm.rs:2634-2635). + let n_views = state.view_stack.len(); + if n_views < 2 { + return Err(WasmGenError::Unsupported( + "wasmgen: AllocateAvailable needs two views on the stack".to_string(), + )); + } + let profile_view = state.view_stack[n_views - 1].clone(); + let requests_view = state.view_stack[n_views - 2].clone(); + // Gather (requests) + profile reads + output copy unroll over + // n + 4n + n element-emits. + let n = requests_view.size(); + state.charge_unroll(n.saturating_mul(6))?; + super::alloc::emit_allocate_available_op( + &requests_view, + &profile_view, + *write_temp_id, + ctx, + f, + )?; + } + Opcode::AllocateByPriority { write_temp_id } => { + // priority_view = top, requests_view = top-1 (vm.rs:2728-2729). 
+ let n_views = state.view_stack.len(); + if n_views < 2 { + return Err(WasmGenError::Unsupported( + "wasmgen: AllocateByPriority needs two views on the stack".to_string(), + )); + } + let priority_view = state.view_stack[n_views - 1].clone(); + let requests_view = state.view_stack[n_views - 2].clone(); + let n = requests_view.size(); + state.charge_unroll(n.saturating_mul(6))?; + super::alloc::emit_allocate_by_priority_op( + &requests_view, + &priority_view, + *write_temp_id, + ctx, + f, + )?; + } + // `LoadModuleInput { input }` mirrors the VM (`vm.rs:1376-1378`: + // `stack.push(module_inputs[input])`). The instance's inputs are wasm + // params `1..=n_inputs` (param 0 is `module_off`), so input `input` is + // at local `input + 1`. + Opcode::LoadModuleInput { input } => { + f.instruction(&Instruction::LocalGet(u32::from(*input) + 1)); + } + // `EvalModule { id, n_inputs }` mirrors the VM (`vm.rs:1379-1443`): + // pop the `n_inputs` operands into scratch (in reverse), resolve the + // child instance, and `call` its function for the current `StepPart`, + // passing `module_off + decl.off` and the inputs in order. + Opcode::EvalModule { id, n_inputs } => { + emit_eval_module(*id, *n_inputs, ctx, f)?; + } + Opcode::Ret => { + // The caller emits the function's terminating `End`. + } + other => return Err(WasmGenError::Unsupported(unsupported_opcode(other))), + } + pc += 1; + } + Ok(()) +} + +impl EmitState { + /// Hand out the next fresh i32 wasm local (Task 4 dynamic subscripts). The + /// count is pre-reserved by [`count_extra_i32_locals`], so this never exceeds + /// the function's declared locals. + fn alloc_i32_local(&mut self) -> u32 { + let idx = self.next_i32_local; + self.next_i32_local += 1; + idx + } +} + +/// The compile-time iteration index of the innermost active iteration context, +/// erroring on a body opcode that appeared outside any iteration. 
+fn current_iter_index(state: &EmitState) -> Result { + state.iter_stack.last().map(|it| it.current).ok_or_else(|| { + WasmGenError::Unsupported("wasmgen: iteration body opcode outside an iteration".to_string()) + }) +} + +/// Which structured iteration the body span belongs to: a `BeginIter` loop or a +/// `BeginBroadcastIter` loop. Each has its own begin/next/end opcode triple, but +/// the well-nested span scan is identical. +#[derive(Clone, Copy, PartialEq, Eq)] +enum IterKind { + Iter, + Broadcast, +} + +/// Given the `pc` of a `BeginIter` / `BeginBroadcastIter`, return the body slice +/// (the opcodes after the begin, up to but excluding its `NextIterOrJump` / +/// `NextBroadcastOrJump`) and the pc *after* the matching `EndIter` / +/// `EndBroadcastIter` (where the outer loop resumes). +/// +/// The span is well-nested (codegen always emits `begin .. next .. end`), so a +/// nested loop of the *same* kind is skipped by depth tracking: `begin` raises +/// the depth and `end` lowers it; the matching `next` is the one at depth 0. +/// A loop of the *other* kind cannot appear inside (codegen never interleaves +/// the two families), but its begin/end would not affect this kind's depth, and +/// its `next` is not this kind's `next`, so the scan is still correct. +fn iter_span( + code: &[Opcode], + begin_pc: usize, + kind: IterKind, +) -> Result<(&[Opcode], usize), WasmGenError> { + let is_begin = |op: &Opcode| match kind { + IterKind::Iter => matches!(op, Opcode::BeginIter { .. }), + IterKind::Broadcast => matches!(op, Opcode::BeginBroadcastIter { .. }), + }; + let is_next = |op: &Opcode| match kind { + IterKind::Iter => matches!(op, Opcode::NextIterOrJump { .. }), + IterKind::Broadcast => matches!(op, Opcode::NextBroadcastOrJump { .. 
}), + }; + let is_end = |op: &Opcode| match kind { + IterKind::Iter => matches!(op, Opcode::EndIter {}), + IterKind::Broadcast => matches!(op, Opcode::EndBroadcastIter {}), + }; + + let body_start = begin_pc + 1; + let mut depth = 0usize; + let mut i = body_start; + let mut body_end: Option = None; + while i < code.len() { + let op = &code[i]; + if is_begin(op) { + depth += 1; + } else if is_next(op) { + if depth == 0 { + body_end = Some(i); + break; + } + } else if is_end(op) { + // `end` closes the most recent nested `begin` of this kind. The + // outermost (depth-0) `end` is reached only *after* our `next`, so a + // saturating decrement is safe. + depth = depth.saturating_sub(1); + } + i += 1; + } + let body_end = body_end.ok_or_else(|| { + WasmGenError::Unsupported("wasmgen: iteration with no matching Next opcode".to_string()) + })?; + // The `end` opcode immediately follows the (depth-0) `next`. + let end_idx = body_end + 1; + if end_idx >= code.len() || !is_end(&code[end_idx]) { + return Err(WasmGenError::Unsupported( + "wasmgen: iteration Next not immediately followed by End".to_string(), + )); + } + Ok((&code[body_start..body_end], end_idx + 1)) +} + +/// Lower `LoadIterViewTop` (`stack_offset == 1`) / `LoadIterViewAt { offset }`: +/// read `view_stack[len - stack_offset]` at the innermost iteration's `current`, +/// broadcasting against the captured iteration view (`vm.rs:1946-2182`). An +/// invalid source view, a source smaller than the iteration, or an unmatched +/// dimension pushes NaN, exactly as the VM does. 
+fn emit_load_iter_view( + state: &EmitState, + stack_offset: usize, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let iter = state.iter_stack.last().ok_or_else(|| { + WasmGenError::Unsupported("wasmgen: LoadIterView* outside an iteration".to_string()) + })?; + if stack_offset == 0 || stack_offset > state.view_stack.len() { + return Err(WasmGenError::Unsupported( + "wasmgen: LoadIterView* stack offset out of range".to_string(), + )); + } + let source = &state.view_stack[state.view_stack.len() - stack_offset]; + // The broadcast index mapping is resolved at compile time; `None` means the + // VM would push NaN for this (source-element, iteration-index) pair. + match source.iter_broadcast_offset(&iter.iter_view, iter.current, ctx.ctx) { + Some(flat) => emit_view_offset_load(source, flat, ctx, f), + None => { + f.instruction(&f64_const(f64::NAN)); + Ok(()) + } + } +} + +/// Store the f64 already on the wasm stack into `temp_storage[temp_offsets[ +/// temp_id] + index]` (the `StoreIterElement` / `StoreBroadcastElement` write). +/// `f64.store` wants `[addr_i32, value_f64]`, so park the value in the scratch +/// local, push the constant address, then reload the value. +fn emit_store_iter_element( + ctx: &EmitCtx, + temp_id: u8, + index: usize, + f: &mut Function, +) -> Result<(), WasmGenError> { + let addr = temp_element_byte_addr(ctx, temp_id, index as u32)?; + f.instruction(&Instruction::LocalSet(ctx.scratch_local)); + f.instruction(&Instruction::I32Const(0)); + f.instruction(&Instruction::LocalGet(ctx.scratch_local)); + f.instruction(&Instruction::F64Store(memarg(addr))); + Ok(()) +} + +/// Build the compile-time broadcast context for a `BeginBroadcastIter`, +/// mirroring the VM's `BeginBroadcastIter` arm (`vm.rs:2314-2373`): union the +/// `n_sources` deepest views' dim_ids into the result geometry (first-encounter +/// order), then build each source's `dim_map` (result dim -> source dim, or +/// `None` for a broadcast axis). 
+fn build_broadcast_ctx( + state: &EmitState, + n_sources: usize, + dest_temp_id: u8, +) -> Result { + if n_sources == 0 || n_sources > state.view_stack.len() { + return Err(WasmGenError::Unsupported( + "wasmgen: BeginBroadcastIter source count out of range".to_string(), + )); + } + let base = state.view_stack.len() - n_sources; + let sources_slice = &state.view_stack[base..]; + + // Result dim ids/sizes: the union over all sources, first-encounter order. + let mut result_dim_ids: Vec = Vec::new(); + let mut result_dims: Vec = Vec::new(); + for view in sources_slice { + for (d, &dim_id) in view.dim_ids.iter().enumerate() { + if !result_dim_ids.contains(&dim_id) { + result_dim_ids.push(dim_id); + result_dims.push(view.dims[d]); + } + } + } + + // Per source: dim_map[result_dim] = Some(src_dim) by exact dim-id match, else + // None (the source broadcasts along that axis). + let mut sources: Vec<(ViewDesc, Vec>)> = Vec::with_capacity(n_sources); + for view in sources_slice { + let dim_map: Vec> = result_dim_ids + .iter() + .map(|&rid| view.dim_ids.iter().position(|&id| id == rid)) + .collect(); + sources.push((view.clone(), dim_map)); + } + + Ok(BroadcastCtx { + sources, + dest_temp_id, + result_dims, + current: 0, + }) +} + +/// Lower `LoadBroadcastElement { source_idx }`, mirroring the VM +/// (`vm.rs:2375-2414`): decompose the broadcast `current` into per-result-dim +/// indices, scatter them into the source's dimension order through its +/// `dim_map`, then read the source element. An invalid source view pushes NaN. 
+fn emit_load_broadcast_element( + state: &EmitState, + source_idx: usize, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let bc_ctx = state.broadcast_stack.last().ok_or_else(|| { + WasmGenError::Unsupported( + "wasmgen: LoadBroadcastElement outside a broadcast iteration".to_string(), + ) + })?; + let (source, dim_map) = bc_ctx.sources.get(source_idx).ok_or_else(|| { + WasmGenError::Unsupported( + "wasmgen: LoadBroadcastElement source_idx out of range".to_string(), + ) + })?; + + // Decompose the result `current` into per-result-dim indices (row-major). + let n_result = bc_ctx.result_dims.len(); + let mut result_indices = vec![0u16; n_result]; + let mut remaining = bc_ctx.current; + for d in (0..n_result).rev() { + let dim = bc_ctx.result_dims[d] as usize; + result_indices[d] = (remaining % dim) as u16; + remaining /= dim; + } + + // Scatter into the source's dimension order: ordered[src_dim] = + // result_indices[result_dim] for each mapped axis (`vm.rs:2395-2402`). + let mut ordered = vec![0u16; source.dims.len()]; + for (result_dim, mapped) in dim_map.iter().enumerate() { + if let Some(src_dim) = mapped { + ordered[*src_dim] = result_indices[result_dim]; + } + } + let flat = source.flat_offset_for_indices(&ordered); + let source = source.clone(); + emit_view_offset_load(&source, flat, ctx, f) +} + +/// Lower `ViewSubscriptDynamic { dim_idx }` (Task 4): pop the 1-based runtime +/// index off the wasm stack, bounds-check it against the top view's +/// `dims[dim_idx]`, and fold `(index-1) * strides[dim_idx]` into the view's +/// runtime-offset local; an out-of-bounds index clears the view's validity flag. +/// The *shape* change (dropping `dim_idx`) is compile-time; only the offset +/// addend and validity are runtime. Mirrors `apply_single_subscript_checked` +/// (`bytecode.rs:242`) + `apply_single_subscript` (`bytecode.rs:326`). 
fn emit_view_subscript_dynamic(
    dim_idx: usize,
    ctx: &EmitCtx,
    state: &mut EmitState,
    f: &mut Function,
) -> Result<(), WasmGenError> {
    use Instruction as Ins;

    // Read the geometry (stride/bound) before mutating the descriptor's shape.
    // Both lookups report the same "out of range" error when `dim_idx` does not
    // name a dimension of the top view.
    let view = view_top(&state.view_stack)?;
    let dim_size = view.dim_at(dim_idx).ok_or_else(|| {
        WasmGenError::Unsupported(format!(
            "wasmgen: ViewSubscriptDynamic dim {dim_idx} out of range"
        ))
    })?;
    let stride = view.stride_at(dim_idx).ok_or_else(|| {
        WasmGenError::Unsupported(format!(
            "wasmgen: ViewSubscriptDynamic dim {dim_idx} out of range"
        ))
    })?;
    // Snapshot the (Copy) runtime-offset/validity locals so the borrow of `view`
    // ends here, freeing `state` for the mutable re-borrow in the allocate path.
    let existing_locals = (view.runtime_off_local, view.valid_local);

    // Lazily allocate (and initialize) the view's runtime-offset + validity
    // locals on its first dynamic subscript: offset 0, valid 1. The two locals
    // are always set together (below), so once one is present so is the other --
    // the `unreachable!` arm makes that invariant explicit rather than relying
    // on a bare `.unwrap()` pair.
    let (off_local, valid_local) = match existing_locals {
        (Some(off), Some(valid)) => (off, valid),
        (Some(_), None) | (None, Some(_)) => unreachable!(
            "wasmgen: a dynamically-subscripted view sets runtime_off_local and \
             valid_local together; exactly one was present"
        ),
        (None, None) => {
            let off_local = state.alloc_i32_local();
            let valid_local = state.alloc_i32_local();
            f.instruction(&Ins::I32Const(0));
            f.instruction(&Ins::LocalSet(off_local));
            f.instruction(&Ins::I32Const(1));
            f.instruction(&Ins::LocalSet(valid_local));
            let view = view_top_mut(&mut state.view_stack)?;
            view.runtime_off_local = Some(off_local);
            view.valid_local = Some(valid_local);
            (off_local, valid_local)
        }
    };

    // Park the popped f64 index in the scratch f64 local (free at an opcode
    // boundary) so it can be read three times (two bound comparisons + offset).
    f.instruction(&Ins::LocalSet(ctx.scratch_local));

    // in_bounds = (floor(idx) >= 1.0) & (floor(idx) <= dim_size). The index is
    // floored (`F64Floor`) ahead of each comparison, so the test is on the
    // floored value, matching the VM's
    // `index_1based == 0 || index_1based > dims[dim_idx]` rejection on the
    // floored u16; a floored index below 1 (zero or negative) fails `>= 1.0`.
    // valid &= in_bounds (validity is sticky-false, like the VM).
    // NOTE(review): a NaN index should fail both comparisons under wasm float
    // comparison semantics -- confirm this matches the VM's handling.
    f.instruction(&Ins::LocalGet(valid_local));
    f.instruction(&Ins::LocalGet(ctx.scratch_local));
    f.instruction(&Ins::F64Floor);
    f.instruction(&f64_const(1.0));
    f.instruction(&Ins::F64Ge); // floor(idx) >= 1
    f.instruction(&Ins::LocalGet(ctx.scratch_local));
    f.instruction(&Ins::F64Floor);
    f.instruction(&f64_const(f64::from(dim_size)));
    f.instruction(&Ins::F64Le); // floor(idx) <= dim_size
    f.instruction(&Ins::I32And);
    f.instruction(&Ins::I32And); // valid & in_bounds
    f.instruction(&Ins::LocalSet(valid_local));

    // off_local += (floor(idx) as i32 - 1) * stride. Folded unconditionally: when
    // invalid the read is NaN-gated, so the (possibly bogus) offset is never used.
    f.instruction(&Ins::LocalGet(off_local));
    f.instruction(&Ins::LocalGet(ctx.scratch_local));
    f.instruction(&Ins::F64Floor);
    f.instruction(&Ins::I32TruncSatF64S);
    f.instruction(&Ins::I32Const(1));
    f.instruction(&Ins::I32Sub); // index - 1 (0-based)
    f.instruction(&Ins::I32Const(stride));
    f.instruction(&Ins::I32Mul);
    f.instruction(&Ins::I32Add);
    f.instruction(&Ins::LocalSet(off_local));

    // Drop the subscripted dimension from the compile-time shape. A `None` from
    // the descriptor is surfaced as `Unsupported` rather than emitting a read
    // against a shape that was not updated.
    let view = view_top_mut(&mut state.view_stack)?;
    view.apply_single_subscript_dynamic(dim_idx)
        .ok_or_else(|| {
            WasmGenError::Unsupported(
                "wasmgen: ViewSubscriptDynamic on a sparse/out-of-range dimension".to_string(),
            )
        })?;
    Ok(())
}

/// Lower `PushSubscriptIndex { bounds }` (Task 4, legacy scalar subscript): pop
/// the 1-based runtime index, range-check it against `bounds`, and accumulate
/// its 0-based value in a fresh i32 local for the eventual `LoadSubscript` fold.
/// An out-of-bounds index clears the accumulation's shared validity flag.
/// Mirrors `vm.rs:1341-1349`.
fn emit_push_subscript_index(bounds: u16, state: &mut EmitState, f: &mut Function) {
    use Instruction as Ins;

    // Allocate the shared validity flag on the first index of an accumulation
    // (init 1 = valid). Subsequent indices reuse it.
    let valid_local = match state.subscript.valid_local {
        Some(v) => v,
        None => {
            let v = state.alloc_i32_local();
            f.instruction(&Ins::I32Const(1));
            f.instruction(&Ins::LocalSet(v));
            state.subscript.valid_local = Some(v);
            v
        }
    };

    // A fresh i32 local holds this index's 0-based value until LoadSubscript
    // folds it (several PushSubscriptIndex precede one LoadSubscript).
    let idx_local = state.alloc_i32_local();

    // idx_i32 = floor(pop) as i32 (the 1-based index).
    f.instruction(&Ins::F64Floor);
    f.instruction(&Ins::I32TruncSatF64S);
    // Keep a copy for the bounds check (LocalTee leaves it on the stack).
    f.instruction(&Ins::LocalTee(idx_local));

    // in_bounds = (idx >= 1) & (idx <= bounds). The VM's test is
    // `index == 0 || index > bounds` on a u16 (so a 0 or negative index, which
    // `floor as i32` yields <= 0, also fails `>= 1`). valid &= in_bounds.
    f.instruction(&Ins::I32Const(1));
    f.instruction(&Ins::I32GeS); // idx >= 1
    f.instruction(&Ins::LocalGet(idx_local));
    f.instruction(&Ins::I32Const(i32::from(bounds)));
    f.instruction(&Ins::I32LeS); // idx <= bounds
    f.instruction(&Ins::I32And);
    f.instruction(&Ins::LocalGet(valid_local));
    f.instruction(&Ins::I32And);
    f.instruction(&Ins::LocalSet(valid_local));

    // Store the 0-based index (idx - 1) for the fold.
    f.instruction(&Ins::LocalGet(idx_local));
    f.instruction(&Ins::I32Const(1));
    f.instruction(&Ins::I32Sub);
    f.instruction(&Ins::LocalSet(idx_local));

    // Record (local, bound) in push order; `emit_load_subscript` consumes the
    // list front-to-back for its row-major fold.
    state.subscript.indices.push((idx_local, bounds));
}

/// Lower `LoadSubscript { off }` (Task 4, legacy scalar subscript): fold the
/// accumulated 0-based runtime indices into a row-major flat offset and push
/// `curr[module_off + off + flat]`, or NaN when the accumulation is invalid.
/// Mirrors `vm.rs:1351-1366`: `flat = 0; for (i, b) in indices { flat = flat*b
/// + i }`. Clears the accumulator.
fn emit_load_subscript(off: u16, ctx: &EmitCtx, state: &mut EmitState, f: &mut Function) {
    use Instruction as Ins;
    use wasm_encoder::BlockType;

    // Take both pieces of accumulator state so the next subscript starts clean
    // (empty index list, no validity local).
    let indices = std::mem::take(&mut state.subscript.indices);
    let valid_local = state.subscript.valid_local.take();

    let emit_load = |f: &mut Function| {
        // Dynamic address part = (module_off + flat) * 8, where the row-major
        // fold is `flat = (((i0)*b1 + i1)*b2 + i2)...` (the VM multiplies the
        // running index by each entry's bound then adds the entry's index).
        f.instruction(&Ins::LocalGet(ctx.module_off_local));
        // flat fold:
        if indices.is_empty() {
            f.instruction(&Ins::I32Const(0));
        } else {
            // Start with i0.
            f.instruction(&Ins::LocalGet(indices[0].0));
            for (idx_local, bounds) in &indices[1..] {
                f.instruction(&Ins::I32Const(i32::from(*bounds)));
                f.instruction(&Ins::I32Mul);
                f.instruction(&Ins::LocalGet(*idx_local));
                f.instruction(&Ins::I32Add);
            }
        }
        f.instruction(&Ins::I32Add); // module_off + flat
        f.instruction(&Ins::I32Const(SLOT_SIZE as i32));
        f.instruction(&Ins::I32Mul); // (module_off + flat) * 8
        // The static part (`curr_base` + `off`) rides in the load's memarg.
        f.instruction(&Ins::F64Load(memarg(slot_byte_offset(ctx.curr_base, off))));
    };

    match valid_local {
        Some(valid_local) => {
            // if valid == 0 { NaN } else { load }
            f.instruction(&Ins::LocalGet(valid_local));
            f.instruction(&Ins::I32Eqz);
            f.instruction(&Ins::If(BlockType::Result(ValType::F64)));
            f.instruction(&f64_const(f64::NAN));
            f.instruction(&Ins::Else);
            emit_load(f);
            f.instruction(&Ins::End);
        }
        // No PushSubscriptIndex preceded this (a 0-dim subscript): always valid.
        None => emit_load(f),
    }
}

/// Emit a store of the f64 already on the wasm stack into the module-relative
/// slot `off` of `chunk_base`. `f64.store` wants `[addr_i32, value_f64]`, but
/// the value is on top, so stash it in the scratch local, push the address,
/// then reload the value.
fn emit_assign(chunk_base: u32, off: u16, ctx: &EmitCtx, f: &mut Function) {
    // value -> scratch, so the address operand can be pushed underneath it.
    f.instruction(&Instruction::LocalSet(ctx.scratch_local));
    push_module_relative_base(ctx, f);
    f.instruction(&Instruction::LocalGet(ctx.scratch_local));
    // The static part of the address (`chunk_base` + `off`) rides in the memarg.
    f.instruction(&Instruction::F64Store(memarg(slot_byte_offset(
        chunk_base, off,
    ))));
}

/// Lower a supported binary op. Operands are already on the wasm stack in push
/// order `[l, r]`; the VM pops `r` then `l` and computes `l op r`, so the
/// non-commutative wasm ops (`f64.sub`/`f64.div`) are already correct.
+/// Comparisons yield an i32 0/1 which is converted to f64 1.0/0.0 because +/// downstream opcodes consume booleans as f64 (matching `eval_op2`). +fn emit_op2(op: Op2, ctx: &EmitCtx, f: &mut Function) -> Result<(), WasmGenError> { + match op { + Op2::Add => { + f.instruction(&Instruction::F64Add); + } + Op2::Sub => { + f.instruction(&Instruction::F64Sub); + } + Op2::Mul => { + f.instruction(&Instruction::F64Mul); + } + Op2::Div => { + f.instruction(&Instruction::F64Div); + } + Op2::Gt => emit_cmp(f, &Instruction::F64Gt), + Op2::Gte => emit_cmp(f, &Instruction::F64Ge), + Op2::Lt => emit_cmp(f, &Instruction::F64Lt), + Op2::Lte => emit_cmp(f, &Instruction::F64Le), + // `Eq` is `approx_eq(l, r) as f64`: the operands `[l, r]` are already in + // call order, so `call approx_eq` then widen the i32 1/0 to f64. + Op2::Eq => { + emit_call_approx_eq(ctx, f); + f.instruction(&Instruction::F64ConvertI32U); + } + // `And`/`Or` are `(is_truthy(l) OP is_truthy(r)) as f64`. + Op2::And => emit_logical(ctx, f, Instruction::I32And), + Op2::Or => emit_logical(ctx, f, Instruction::I32Or), + // `Exp` is `l.powf(r)`: the operands `[l, r]` are already in call + // order, so `call pow` directly. Matches `powf` for a positive base + // (a negative base diverges -- see `super::math::emit_pow`). + Op2::Exp => { + f.instruction(&Instruction::Call(ctx.helpers.pow)); + } + // `Mod` is `l.rem_euclid(r)` (result in [0, |r|)), routed through the + // `mod_euclid` helper (`[l, r]` already in call order). + Op2::Mod => { + f.instruction(&Instruction::Call(ctx.helpers.mod_euclid)); + } + } + Ok(()) +} + +/// Lower `Op2::And`/`Op2::Or`: `(is_truthy(l) OP is_truthy(r)) as f64`, with +/// `combine` the bitwise `i32.and`/`i32.or` that realizes `OP`. +/// +/// The operands are on the stack as `[l, r]` (`r` on top), and the wasm operand +/// stack is strict LIFO, so `l` cannot be reduced while `r` sits above it. 
+/// Park `r` in the scratch f64 local (the same local `emit_assign` uses; it is +/// free here and -- in the `BinOpAssign*` callers -- is overwritten by +/// `emit_assign` before its next read), reduce `is_truthy(l)`, push `r` back and +/// reduce `is_truthy(r)`, then combine. Each `is_truthy` yields an i32 that is +/// exactly 0 or 1, so the bitwise `combine` equals the logical operator; and +/// because `is_truthy` is pure and total, evaluating both operands is +/// bit-identical to the VM's short-circuiting `&&`/`||`. +fn emit_logical(ctx: &EmitCtx, f: &mut Function, combine: Instruction) { + // stack: [l, r] -> scratch = r; stack: [l] + f.instruction(&Instruction::LocalSet(ctx.scratch_local)); + // is_truthy(l); stack: [t_l] + emit_is_truthy(ctx, f); + // bring r back; is_truthy(r); stack: [t_l, t_r] + f.instruction(&Instruction::LocalGet(ctx.scratch_local)); + emit_is_truthy(ctx, f); + // combine and widen to f64 1.0/0.0 + f.instruction(&combine); + f.instruction(&Instruction::F64ConvertI32U); +} + +/// Emit an f64 comparison and convert its i32 result to the f64 0.0/1.0 the +/// VM's `eval_op2` produces for comparisons. +fn emit_cmp(f: &mut Function, cmp: &Instruction) { + f.instruction(cmp); + f.instruction(&Instruction::F64ConvertI32U); +} + +/// Lower the `Apply { func }` opcode, mirroring the VM's `apply()` +/// (`vm.rs:2938`). The three operands are on the wasm stack in push order +/// `[a, b, c]` (`c` on top, matching the VM popping `c` then `b` then `a`); +/// they are parked in the dedicated `ctx.apply_locals` so each builtin can read +/// them any number of times in any order. The result is left on the stack. +/// +/// `time`/`dt` for the time-driven builtins are read from `curr[TIME_OFF]` / +/// `curr[DT_OFF]` (absolute global slots, like `LoadGlobalVar`), matching the +/// VM's `time = curr[TIME_OFF]; dt = curr[DT_OFF]`. 
+fn emit_apply(func: BuiltinId, ctx: &EmitCtx, f: &mut Function) { + use Instruction as Ins; + let [a, b, c] = ctx.apply_locals; + + // Pop the three padded operands. The stack top is `c`, so set c, then b, + // then a (the VM pops in the same order). + f.instruction(&Ins::LocalSet(c)); + f.instruction(&Ins::LocalSet(b)); + f.instruction(&Ins::LocalSet(a)); + + let get = |f: &mut Function, l: u32| { + f.instruction(&Ins::LocalGet(l)); + }; + + match func { + // ── Native f64 instructions on `a` ──────────────────────────────── + BuiltinId::Abs => { + get(f, a); + f.instruction(&Ins::F64Abs); + } + BuiltinId::Sqrt => { + get(f, a); + f.instruction(&Ins::F64Sqrt); + } + // `Int = a.floor()` -- floor, NOT trunc (the VM's choice; they differ + // for negative arguments). + BuiltinId::Int => { + get(f, a); + f.instruction(&Ins::F64Floor); + } + // `Max`/`Min` use the wasm instructions per AC7.3. These differ from the + // VM's compare form (`if a>b {a} else {b}`) only on NaN/±0; if a corpus + // model ever surfaces such a divergence, switch the offending op to the + // compare-and-select form. + BuiltinId::Max => { + get(f, a); + get(f, b); + f.instruction(&Ins::F64Max); + } + BuiltinId::Min => { + get(f, a); + get(f, b); + f.instruction(&Ins::F64Min); + } + + // ── Compare/arithmetic composed ─────────────────────────────────── + // `Sign = if a>0 {1} else if a<0 {-1} else {0}`, i.e. + // `a>0 ? 1 : (a<0 ? -1 : 0)`, via two selects. wasm `select` yields its + // *deeper* operand when the condition is true, so the outer select is + // expressed with the inverted test `a<=0` (deeper = inner). 
+ BuiltinId::Sign => { + // inner = select(-1.0, 0.0, a < 0) -> -1 if a<0 else 0 + f.instruction(&f64_const(-1.0)); + f.instruction(&f64_const(0.0)); + get(f, a); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::Select); + // result = select(inner, 1.0, a <= 0) -> inner if a<=0 else 1 + f.instruction(&f64_const(1.0)); + get(f, a); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Le); + f.instruction(&Ins::Select); + } + // `Quantum = if b==0.0 {a} else {(a/b).trunc()*b}` (exact `==`). + BuiltinId::Quantum => { + // select(a, (a/b).trunc()*b, b == 0.0) + get(f, a); + // (a/b).trunc() * b + get(f, a); + get(f, b); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::F64Trunc); + get(f, b); + f.instruction(&Ins::F64Mul); + // cond: b == 0.0 + get(f, b); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Eq); + f.instruction(&Ins::Select); + } + // `SafeDiv = if b != 0.0 {a/b} else {c}` (exact `!=`, NOT approx). + BuiltinId::SafeDiv => { + // select(a/b, c, b != 0.0) + get(f, a); + get(f, b); + f.instruction(&Ins::F64Div); + get(f, c); + get(f, b); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Ne); + f.instruction(&Ins::Select); + } + // `Sshape = b + (c-b)/(1.0 + exp(-4.0*(2.0*a-1.0)))`. + BuiltinId::Sshape => { + get(f, b); + // (c - b) + get(f, c); + get(f, b); + f.instruction(&Ins::F64Sub); + // denom = 1.0 + exp(-4.0 * (2.0*a - 1.0)) + f.instruction(&f64_const(1.0)); + // exp arg: -4.0 * (2.0*a - 1.0) + f.instruction(&f64_const(-4.0)); + f.instruction(&f64_const(2.0)); + get(f, a); + f.instruction(&Ins::F64Mul); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::Call(ctx.helpers.exp)); + f.instruction(&Ins::F64Add); // 1.0 + exp(..) + f.instruction(&Ins::F64Div); // (c-b) / denom + f.instruction(&Ins::F64Add); // b + .. 
+ } + + // ── Transcendentals on `a` (Task 2 helpers) ─────────────────────── + BuiltinId::Exp => emit_call_unary(ctx.helpers.exp, a, ctx, f), + BuiltinId::Ln => emit_call_unary(ctx.helpers.ln, a, ctx, f), + BuiltinId::Log10 => emit_call_unary(ctx.helpers.log10, a, ctx, f), + BuiltinId::Sin => emit_call_unary(ctx.helpers.sin, a, ctx, f), + BuiltinId::Cos => emit_call_unary(ctx.helpers.cos, a, ctx, f), + BuiltinId::Tan => emit_call_unary(ctx.helpers.tan, a, ctx, f), + BuiltinId::Arcsin => emit_call_unary(ctx.helpers.asin, a, ctx, f), + BuiltinId::Arccos => emit_call_unary(ctx.helpers.acos, a, ctx, f), + BuiltinId::Arctan => emit_call_unary(ctx.helpers.atan, a, ctx, f), + + // ── Time-driven ─────────────────────────────────────────────────── + // `Step = step(time, dt, a, b) = if time + dt/2 > b {a} else {0.0}`. + BuiltinId::Step => { + // select(a, 0.0, time + dt/2 > b) + get(f, a); + f.instruction(&f64_const(0.0)); + // time + dt/2.0 + emit_load_global(ctx, f, TIME_OFF); + emit_load_global(ctx, f, DT_OFF); + f.instruction(&f64_const(2.0)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::F64Add); + get(f, b); + f.instruction(&Ins::F64Gt); + f.instruction(&Ins::Select); + } + // `Ramp = ramp(time, slope=a, start=b, end=Some(c))`: + // if time > b { if time >= c { a*(c-b) } else { a*(time-b) } } else 0. + // The Apply form always supplies an end time, so `end.is_some()` is true. 
+ BuiltinId::Ramp => { + // done_value = a * (c - b) + get(f, a); + get(f, c); + get(f, b); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::F64Mul); + // ramping_value = a * (time - b) + get(f, a); + emit_load_global(ctx, f, TIME_OFF); + get(f, b); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::F64Mul); + // inner = select(done_value, ramping_value, time >= c) + emit_load_global(ctx, f, TIME_OFF); + get(f, c); + f.instruction(&Ins::F64Ge); + f.instruction(&Ins::Select); + // result = select(inner, 0.0, time > b) + f.instruction(&f64_const(0.0)); + emit_load_global(ctx, f, TIME_OFF); + get(f, b); + f.instruction(&Ins::F64Gt); + f.instruction(&Ins::Select); + } + // `Pulse = pulse(time, dt, volume=a, first=b, interval=c)` (helper). + BuiltinId::Pulse => { + emit_load_global(ctx, f, TIME_OFF); + emit_load_global(ctx, f, DT_OFF); + get(f, a); + get(f, b); + get(f, c); + f.instruction(&Ins::Call(ctx.helpers.pulse)); + } + + // ── Constants ───────────────────────────────────────────────────── + BuiltinId::Inf => { + f.instruction(&f64_const(f64::INFINITY)); + } + BuiltinId::Pi => { + f.instruction(&f64_const(std::f64::consts::PI)); + } + } +} + +/// Lower the `Lookup { base_gf, table_count, mode }` opcode, mirroring the VM's +/// `Lookup` arm (`vm.rs:1710-1731`). The two operands are on the wasm stack as +/// `[element_offset, index]` (`index` on top, matching the VM popping +/// `lookup_index` then `element_offset`). +/// +/// Bounds check: `element_offset < 0.0 || element_offset >= table_count as f64` +/// pushes NaN (the VM's `*table_count as usize as f64` widens the compile-time +/// `u16` count to f64). 
Otherwise the table index is +/// `base_gf + (element_offset as i32)` (the VM's `as usize` truncation; the +/// bounds check guarantees `0 <= element_offset < table_count`, so +/// `i32.trunc_sat` is exact and non-negative); its `(data_off, count)` is read +/// from the GF directory at `gf_directory_base + table_idx*8`, and the result +/// comes from a static `call` to the mode's helper (the mode is known at +/// compile time). The result is left on the stack. +/// +/// `index`/`element_offset` are parked in [`scratch_local`](EmitCtx::scratch_local) +/// and `apply_locals[0]` -- both free f64 scratch locals at an opcode boundary +/// (nothing from a prior opcode is live there; `Lookup` and `Apply` never share +/// a live operand within one opcode). The i32 directory address carries no +/// dedicated local (the opcode-program function reserves none), so it is +/// recomputed for the `count` read; the recompute is a handful of cheap integer +/// ops. +fn emit_lookup( + base_gf: GraphicalFunctionId, + table_count: u16, + mode: LookupMode, + ctx: &EmitCtx, + f: &mut Function, +) { + use Instruction as Ins; + use wasm_encoder::BlockType; + + let index_local = ctx.scratch_local; + let elem_off_local = ctx.apply_locals[0]; + + // Pop the operands. `index` is on top, then `element_offset`. 
+ f.instruction(&Ins::LocalSet(index_local)); + f.instruction(&Ins::LocalSet(elem_off_local)); + + // bounds = (element_offset < 0.0) | (element_offset >= table_count as f64) + f.instruction(&Ins::LocalGet(elem_off_local)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::LocalGet(elem_off_local)); + f.instruction(&f64_const(table_count as f64)); + f.instruction(&Ins::F64Ge); + f.instruction(&Ins::I32Or); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + // out of range -> NaN + f.instruction(&f64_const(f64::NAN)); + f.instruction(&Ins::Else); + + let helper_idx = match mode { + LookupMode::Interpolate => ctx.helpers.lookup_interp, + LookupMode::Forward => ctx.helpers.lookup_forward, + LookupMode::Backward => ctx.helpers.lookup_backward, + }; + + // data_off = i32.load[dir_addr + 0]; count = i32.load[dir_addr + 4], where + // dir_addr = gf_directory_base + (base_gf + (element_offset as i32)) * 8. + push_gf_directory_addr(ctx, f, base_gf, elem_off_local); + f.instruction(&Ins::I32Load(i32_memarg(0))); + push_gf_directory_addr(ctx, f, base_gf, elem_off_local); + f.instruction(&Ins::I32Load(i32_memarg(4))); + // index, then call the mode's helper -> f64 result. + f.instruction(&Ins::LocalGet(index_local)); + f.instruction(&Ins::Call(helper_idx)); + + f.instruction(&Ins::End); // end if +} + +/// Push the byte address of table `base_gf + (element_offset as i32)`'s GF +/// directory entry: `gf_directory_base + (base_gf + elem_off_i32) * 8`. +/// `element_offset` is in `elem_off_local` (f64); `i32.trunc_sat_f64_s` matches +/// the VM's `as usize` for the bounds-checked non-negative offset. 
+fn push_gf_directory_addr( + ctx: &EmitCtx, + f: &mut Function, + base_gf: GraphicalFunctionId, + elem_off_local: u32, +) { + use Instruction as Ins; + f.instruction(&Ins::I32Const(ctx.gf_directory_base as i32)); + f.instruction(&Ins::I32Const(base_gf as i32)); + f.instruction(&Ins::LocalGet(elem_off_local)); + f.instruction(&Ins::I32TruncSatF64S); + f.instruction(&Ins::I32Add); // table_idx = base_gf + elem_off + f.instruction(&Ins::I32Const(GF_DIRECTORY_ENTRY_BYTES)); + f.instruction(&Ins::I32Mul); // table_idx * 8 + f.instruction(&Ins::I32Add); // gf_directory_base + table_idx*8 +} + +/// A 4-byte (i32) memory access with a static byte `offset` (for reading a GF +/// directory entry's two i32 fields). The directory is 8-byte aligned, so a +/// 4-byte access at offset 0 or 4 is naturally aligned. +pub(crate) fn i32_memarg(offset: u64) -> MemArg { + MemArg { + offset, + align: 2, // log2(4): a 4-byte i32 access + memory_index: 0, + } +} + +// ============================================================================ +// Array view stack + reducers (Phase 5 Tasks 1-2) +// ============================================================================ + +/// Borrow the top view descriptor, erroring (rather than panicking) on an empty +/// stack -- malformed bytecode rather than a wrong module. +fn view_top(view_stack: &[ViewDesc]) -> Result<&ViewDesc, WasmGenError> { + view_stack.last().ok_or_else(|| { + WasmGenError::Unsupported("wasmgen: view opcode on empty view stack".to_string()) + }) +} + +/// Mutably borrow the top view descriptor for a transform opcode. 
+fn view_top_mut(view_stack: &mut [ViewDesc]) -> Result<&mut ViewDesc, WasmGenError> { + view_stack.last_mut().ok_or_else(|| { + WasmGenError::Unsupported("wasmgen: view transform on empty view stack".to_string()) + }) +} + +/// Resolve a dim-list id to `(dim sizes, dim ids)` for `PushVarView`/ +/// `PushTempView`: each entry is a `DimId`, and the size comes from +/// `ctx.dimensions[DimId].size` (`vm.rs:1745`). +/// +/// Returns `Unsupported` when `dim_list_id` or any referenced `DimId` is out of +/// range. +// NOTE(review): the return type below reads `Result<(Vec, Vec), WasmGenError>` +// with no element types -- the generic arguments look stripped in transit; +// restore them from the original file. +fn resolve_dim_list_dims( + ctx: &EmitCtx, + dim_list_id: u16, +) -> Result<(Vec, Vec), WasmGenError> { + let (n_dims, dim_ids) = ctx + .ctx + .dim_lists + .get(dim_list_id as usize) + .map(|(n, ids)| (*n as usize, *ids)) + .ok_or_else(|| { + WasmGenError::Unsupported(format!("wasmgen: dim_list_id {dim_list_id} out of range")) + })?; + let mut dims = Vec::with_capacity(n_dims); + for &dim_id in dim_ids.iter().take(n_dims) { + let size = ctx + .ctx + .dimensions + .get(dim_id as usize) + .map(|d| d.size) + .ok_or_else(|| { + WasmGenError::Unsupported(format!("wasmgen: DimId {dim_id} out of range")) + })?; + dims.push(size); + } + // NOTE(review): `[..n_dims]` panics if `n_dims` exceeds the entry's length, + // whereas the loop above merely stops early via `take` -- presumably the + // compiler never emits such a count; confirm. + let dim_id_vec = dim_ids[..n_dims].to_vec(); + Ok((dims, dim_id_vec)) +} + +/// Resolve a dim-list id to its raw dimension sizes for `PushVarViewDirect`, +/// where each entry is a literal dimension size, not a `DimId` (`vm.rs:1780`). +/// The caller supplies the view's `dim_ids` itself (all zero -- this view is the +/// base for a dynamic subscript, which does not broadcast), so only the sizes +/// are returned here. +/// +/// Returns `Unsupported` when `dim_list_id` is out of range. +// NOTE(review): the return type below reads `Result, WasmGenError>` -- the +// `Vec<..>` generic arguments look stripped in transit; restore them from the +// original file. The `[..n_dims]` slice also panics if `n_dims` exceeds the +// entry's length -- presumably never emitted; confirm. +fn resolve_dim_list_raw(ctx: &EmitCtx, dim_list_id: u16) -> Result, WasmGenError> { + let (n_dims, sizes) = ctx + .ctx + .dim_lists + .get(dim_list_id as usize) + .map(|(n, ids)| (*n as usize, *ids)) + .ok_or_else(|| { + WasmGenError::Unsupported(format!("wasmgen: dim_list_id {dim_list_id} out of range")) + })?; + Ok(sizes[..n_dims].to_vec()) +} + +/// The absolute byte address of temp element `index` of temp `temp_id`: +/// `temp_storage_base + (temp_offsets[temp_id] + index) * 8`.
+pub(crate) fn temp_element_byte_addr( + ctx: &EmitCtx, + temp_id: u8, + index: u32, +) -> Result { + let temp_off = *ctx.ctx.temp_offsets.get(temp_id as usize).ok_or_else(|| { + WasmGenError::Unsupported(format!("wasmgen: temp id {temp_id} out of range")) + })? as u64; + Ok(u64::from(ctx.temp_storage_base) + (temp_off + u64::from(index)) * u64::from(SLOT_SIZE)) +} + +/// Emit the wasm analogue of the VM's `fill_temp_nan` (`vm.rs:2866-2881`): store +/// IEEE `f64::NAN` (NOT the finite `crate::float::NA` sentinel) into every slot +/// of temp `temp_id`'s region, `temp_storage[temp_offsets[temp_id] .. +/// temp_offsets[temp_id + 1]]` (or `.. temp_total_size` for the last temp). +/// +/// The span is compile-time-known and small (one temp's worth of slots), so the +/// stores are unrolled. Used by the Phase-6 vector ops (`super::vector`) for the +/// invalid-input-view branch. +pub(crate) fn emit_fill_temp_nan( + ctx: &EmitCtx, + temp_id: u8, + f: &mut Function, +) -> Result<(), WasmGenError> { + let idx = temp_id as usize; + let start = *ctx.ctx.temp_offsets.get(idx).ok_or_else(|| { + WasmGenError::Unsupported(format!("wasmgen: temp id {temp_id} out of range")) + })?; + let end = ctx + .ctx + .temp_offsets + .get(idx + 1) + .copied() + .unwrap_or(ctx.ctx.temp_total_size); + for slot in start..end { + // f64.store wants [addr_i32, value_f64]; the per-slot byte offset rides + // in the constant memarg, so the dynamic address is a constant 0. + f.instruction(&Instruction::I32Const(0)); + f.instruction(&f64_const(f64::NAN)); + f.instruction(&Instruction::F64Store(memarg( + u64::from(ctx.temp_storage_base) + (slot as u64) * u64::from(SLOT_SIZE), + ))); + } + Ok(()) +} + +/// Lower `LoadTempDynamic { temp_id }`: pop a runtime index (the VM does +/// `stack.pop().floor() as usize`), compute the temp element address, and load. 
+/// +/// The address is `temp_storage_base + temp_offsets[temp_id]*8 + index*8`; the +/// constant base/offset ride in the `memarg.offset`, so only `index*8` is +/// computed at runtime. `i32.trunc_sat_f64_s` of `floor(index)` reproduces the +/// VM's `floor() as usize` for a non-negative in-range index. +fn emit_load_temp_dynamic( + ctx: &EmitCtx, + temp_id: u8, + f: &mut Function, +) -> Result<(), WasmGenError> { + use Instruction as Ins; + // Constant part of the address: element 0 of this temp. + let base = temp_element_byte_addr(ctx, temp_id, 0)?; + // index (f64, on top) -> floor -> i32 -> *8 (byte stride) + f.instruction(&Ins::F64Floor); + f.instruction(&Ins::I32TruncSatF64S); + f.instruction(&Ins::I32Const(SLOT_SIZE as i32)); + f.instruction(&Ins::I32Mul); + f.instruction(&Ins::F64Load(memarg(base))); + Ok(()) +} + +/// Push the f64 value of view element `iter_idx` onto the wasm stack, reading +/// from the byte address [`ViewDesc::element_addr`] computes. This is the single +/// element-read primitive the reducers (Task 2) and -- for static/temp/var +/// views -- the iteration loop (Task 3) build on. +/// +/// The constant part of the address rides in the `memarg.offset`; the dynamic +/// part of the wasm address is `module_off * 8` for a module-relative view (0 in +/// the current single-root scope, but emitted for Phase 7 generality) and a bare +/// `0` otherwise. A dynamically-subscripted view (Task 4) returns `Unsupported` +/// here. +/// +/// NOTE(review): `emit_addr_load`, which performs the read, also folds in a +/// dynamic view's `runtime_off_local` and NaN-gates on its `valid_local`, so the +/// `Unsupported` claim above may be stale -- confirm against +/// `ViewDesc::element_addr`. +/// +/// Landed with the view machinery (Task 1) as the single element-read primitive; +/// its consumers are the array reducer (Task 2), the iteration loop (Task 3), +/// and the Phase-6 vector ops (`super::vector`).
+pub(crate) fn emit_view_element_load( + desc: &ViewDesc, + iter_idx: usize, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let addr = desc + .element_addr(iter_idx, ctx.curr_base, ctx.temp_storage_base, ctx.ctx) + .ok_or_else(bad_temp_view)?; + emit_addr_load(addr, ctx, f); + Ok(()) +} + +/// Push the f64 value of the view element at an *already-computed* flat slot +/// offset (the broadcast paths -- `LoadIterViewTop` / `LoadBroadcastElement` -- +/// build the flat offset themselves rather than from an iteration index). +/// +/// Returns the `bad_temp_view` error when `element_addr_for_flat` yields `None`. +fn emit_view_offset_load( + desc: &ViewDesc, + flat: usize, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let addr = desc + .element_addr_for_flat(flat, ctx.curr_base, ctx.temp_storage_base, ctx.ctx) + .ok_or_else(bad_temp_view)?; + emit_addr_load(addr, ctx, f); + Ok(()) +} + +/// Emit the f64 load for a resolved [`ElementAddr`]: the constant part rides in +/// the `memarg.offset`; the dynamic part is `module_off * 8` for a module- +/// relative view plus, for a dynamically-subscripted view (Task 4), the +/// `runtime_off_local * 8` runtime addend (matching the VM's +/// `curr[module_off + base_off + flat + dynamic]`). When the view carries a +/// validity flag (`valid_local`), the whole load is wrapped in a guard that +/// yields NaN when the flag is 0 -- the VM's out-of-bounds-subscript NaN. +fn emit_addr_load(addr: ElementAddr, ctx: &EmitCtx, f: &mut Function) { + use Instruction as Ins; + use wasm_encoder::BlockType; + + // Validity gate (dynamic subscript only): `if valid == 0 { NaN } else { load }`.
+ if let Some(valid_local) = addr.valid_local { + f.instruction(&Ins::LocalGet(valid_local)); + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&f64_const(f64::NAN)); + f.instruction(&Ins::Else); + emit_addr_load_unguarded(addr, ctx, f); + f.instruction(&Ins::End); + } else { + emit_addr_load_unguarded(addr, ctx, f); + } +} + +/// The bare load half of [`emit_addr_load`] (no validity guard): push the +/// dynamic address part, then `f64.load` with the constant `memarg.offset`. The +/// dynamic part sums `module_off * 8` (module-relative views) and +/// `runtime_off_local * 8` (a dynamic subscript's accumulated offset); if +/// neither is present it is a bare `0`. +fn emit_addr_load_unguarded(addr: ElementAddr, ctx: &EmitCtx, f: &mut Function) { + use Instruction as Ins; + let mut pushed = false; + if addr.module_relative { + push_module_relative_base(ctx, f); + pushed = true; + } + if let Some(off_local) = addr.runtime_off_local { + // runtime_off_local is a slot offset; convert to bytes. + f.instruction(&Ins::LocalGet(off_local)); + f.instruction(&Ins::I32Const(SLOT_SIZE as i32)); + f.instruction(&Ins::I32Mul); + if pushed { + f.instruction(&Ins::I32Add); + } + pushed = true; + } + if !pushed { + f.instruction(&Ins::I32Const(0)); + } + f.instruction(&Ins::F64Load(memarg(addr.const_byte_offset))); +} + +/// The `Unsupported` error for a temp-backed view whose `base_off` is not a +/// valid temp id (`temp_offsets[base_off]` out of range) -- malformed bytecode +/// rather than a wrong module. +fn bad_temp_view() -> WasmGenError { + WasmGenError::Unsupported( + "wasmgen: array element read references an out-of-range temp id".to_string(), + ) +} + +/// Lower one array reducer over the top `ViewDesc` (the descriptor stays on the +/// stack; the production pattern is `PushStaticView; Array; PopView`). 
+/// +/// Reproduces `reduce_view` (`vm.rs:2802-2840`) and the per-reducer arms +/// (`vm.rs:2216-2309`) exactly, including the asymmetry: +/// - an **invalid** view (`valid_local` present and 0) yields NaN for *every* +/// reducer, including `ArraySum` (`reduce_view`'s `if !is_valid { NaN }`); +/// - an **empty-but-valid** view (`size() == 0`) yields `0.0` for `ArraySum`, +/// `NaN` for Max/Min/Mean/Stddev, and `0` for `ArraySize`. +/// +/// The fold is fully unrolled over the compile-time `size()`: reducer arrays are +/// small, and unrolling reads each element at its compile-time-known address via +/// [`emit_view_element_load`], so no runtime loop or precomputed offset table is +/// needed for the static/temp views the reducer path produces. `ArrayMax`/ +/// `ArrayMin` use the VM's compare-and-select form (`if v > acc { v } else +/// { acc }`), not `f64.max`/`f64.min`, matching the reduce path (AC7.3). +fn emit_array_reduce( + op: &Opcode, + desc: &ViewDesc, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + use Instruction as Ins; + + // ArraySize is always defined (the size of the view), independent of + // validity, and needs no element reads. The VM pushes `view.size() as f64` + // unconditionally (`vm.rs:2306`). + if matches!(op, Opcode::ArraySize {}) { + f.instruction(&f64_const(desc.size() as f64)); + return Ok(()); + } + + let size = desc.size(); + let is_sum = matches!(op, Opcode::ArraySum {}); + + // The empty-but-valid result, before accounting for an invalid view: 0.0 for + // Sum, NaN for the others. + let empty_result = if is_sum { 0.0 } else { f64::NAN }; + + if size == 0 { + // No element reads. For a static view (always valid) this is the final + // answer; a dynamic view's validity is folded in below. 
+ f.instruction(&f64_const(empty_result)); + } else { + emit_reduce_fold(op, desc, size, ctx, f)?; + } + + // An invalid view (Task 4 dynamic subscript out of bounds) overrides the + // computed value with NaN for ALL reducers, mirroring `reduce_view`'s + // leading `if !is_valid { return NaN }`. For static views `valid_local` is + // `None`, so this is a no-op and the static result stands. + if let Some(valid_local) = desc.valid_local { + // Build `select(NaN, computed, valid == 0)`. wasm `select` pops + // `[a, b, cond]` and yields `a` when `cond != 0`, so `a` must be NaN and + // `b` the computed value. The computed value is currently on top, so + // park it (the fold has released `scratch_local` by now), push NaN, push + // the parked value, then `cond = (valid == 0)`. + f.instruction(&Ins::LocalSet(ctx.scratch_local)); + f.instruction(&f64_const(f64::NAN)); // a = NaN + f.instruction(&Ins::LocalGet(ctx.scratch_local)); // b = computed + f.instruction(&Ins::LocalGet(valid_local)); + f.instruction(&Ins::I32Eqz); // cond = 1 when invalid + f.instruction(&Ins::Select); + } + + Ok(()) +} + +/// Emit the unrolled fold body for a non-empty reducer (size >= 1). Leaves the +/// reduced f64 on the wasm stack. Split out so [`emit_array_reduce`] reads +/// linearly. +fn emit_reduce_fold( + op: &Opcode, + desc: &ViewDesc, + size: usize, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + use Instruction as Ins; + match op { + // Sum / Mean / Stddev all begin with the running sum over the elements. + Opcode::ArraySum {} | Opcode::ArrayMean {} | Opcode::ArrayStddev {} => { + // sum = e0 + e1 + ... (init 0.0, matching reduce_view's `0.0` init). + f.instruction(&f64_const(0.0)); + for i in 0..size { + emit_view_element_load(desc, i, ctx, f)?; + f.instruction(&Ins::F64Add); + } + match op { + Opcode::ArraySum {} => {} + Opcode::ArrayMean {} => { + // mean = sum / size (size > 0 here). 
+ f.instruction(&f64_const(size as f64)); + f.instruction(&Ins::F64Div); + } + Opcode::ArrayStddev {} => { + // Two-pass population variance: mean = sum/size (computed + // above and on the stack), then variance = mean of + // (v - mean)^2, then sqrt. Park the mean so each squared + // deviation can reference it. + f.instruction(&f64_const(size as f64)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::LocalSet(ctx.scratch_local)); // scratch = mean + // variance_sum = Σ (v - mean)^2 + f.instruction(&f64_const(0.0)); + for i in 0..size { + emit_view_element_load(desc, i, ctx, f)?; + f.instruction(&Ins::LocalGet(ctx.scratch_local)); + f.instruction(&Ins::F64Sub); // v - mean + // (v - mean)^2 via self-multiply. This equals `x * x` on + // the host libm and agrees with the VM's `.powf(2.0)` + // within floating-point tolerance regardless (`f64::powf` + // is libm-dependent, so the two are not guaranteed + // bit-identical on every platform). + f.instruction(&Ins::LocalTee(ctx.apply_locals[0])); + f.instruction(&Ins::LocalGet(ctx.apply_locals[0])); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Add); + } + // stddev = sqrt(variance_sum / size) + f.instruction(&f64_const(size as f64)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::F64Sqrt); + } + _ => unreachable!(), + } + } + // Max / Min: fold with the VM's compare-and-select (`if v > acc { v } + // else { acc }`), init NEG_INFINITY / INFINITY (`vm.rs:2228`/`2245`). + Opcode::ArrayMax {} | Opcode::ArrayMin {} => { + let init = if matches!(op, Opcode::ArrayMax {}) { + f64::NEG_INFINITY + } else { + f64::INFINITY + }; + f.instruction(&f64_const(init)); // acc + for i in 0..size { + // stack: [acc]; load v -> [acc, v]; select(v, acc, cmp). + emit_view_element_load(desc, i, ctx, f)?; + // Compute the comparison then select. wasm `select` pops + // [a, b, cond] and yields a when cond != 0. We want + // `if v CMP acc { v } else { acc }`, so push v then acc and + // test `v CMP acc` (CMP is `>` for Max, `<` for Min).
Park acc/v in scratch f64 locals so they + // can be reused for both the select operands and the compare. + f.instruction(&Ins::LocalSet(ctx.apply_locals[1])); // b1 = v + f.instruction(&Ins::LocalSet(ctx.apply_locals[0])); // b0 = acc + f.instruction(&Ins::LocalGet(ctx.apply_locals[1])); // v (select arg a) + f.instruction(&Ins::LocalGet(ctx.apply_locals[0])); // acc (select arg b) + f.instruction(&Ins::LocalGet(ctx.apply_locals[1])); // v + f.instruction(&Ins::LocalGet(ctx.apply_locals[0])); // acc + if matches!(op, Opcode::ArrayMax {}) { + f.instruction(&Ins::F64Gt); // v > acc + } else { + f.instruction(&Ins::F64Lt); // v < acc + } + f.instruction(&Ins::Select); // v if (cmp) else acc -> new acc + } + } + _ => unreachable!("emit_reduce_fold called with non-reducer opcode"), + } + Ok(()) +} + +/// Push `helper(local)` for a unary `(f64) -> f64` helper: load the f64 local, +/// then `call`. +fn emit_call_unary(helper_idx: u32, src: u32, _ctx: &EmitCtx, f: &mut Function) { + f.instruction(&Instruction::LocalGet(src)); + f.instruction(&Instruction::Call(helper_idx)); +} + +/// Push the absolute (module-independent) global slot `off` from `curr`, +/// matching `LoadGlobalVar` (slots 0..4 are reserved globals: TIME/DT/...). +fn emit_load_global(ctx: &EmitCtx, f: &mut Function, off: u16) { + f.instruction(&Instruction::I32Const(0)); + f.instruction(&Instruction::F64Load(memarg(slot_byte_offset( + ctx.curr_base, + off, + )))); +} + +/// Lower `LoadPrev { off }`, mirroring the VM (`vm.rs:1320-1328`). A fallback +/// f64 is already on the wasm stack (codegen pushes it immediately before this +/// opcode). Park it in the scratch local, then build `select(fallback, +/// prev_values[module_off+off], use_prev_fallback)`: wasm `select` yields its +/// *deeper* operand when the condition is non-zero, so pushing +/// `[fallback, prev_value, use_prev_fallback]` yields the fallback while the +/// flag is set and the snapshot value once it is cleared. 
+fn emit_load_prev(off: u16, ctx: &EmitCtx, f: &mut Function) {
+    use Instruction as Ins;
+    // Round-trip the fallback (top of stack) through the scratch local; the
+    // stack is unchanged, and the prev_values operand is then pushed above it.
+    f.instruction(&Ins::LocalSet(ctx.scratch_local));
+    f.instruction(&Ins::LocalGet(ctx.scratch_local)); // [fallback]
+    // prev_values[module_off + off]
+    push_module_relative_base(ctx, f);
+    f.instruction(&Ins::F64Load(memarg(slot_byte_offset(
+        ctx.prev_values_base,
+        off,
+    )))); // [fallback, prev_value]
+    f.instruction(&Ins::GlobalGet(ctx.use_prev_fallback_global)); // [fallback, prev_value, cond]
+    f.instruction(&Ins::Select);
+}
+
+/// Lower `LoadInitial { off }`, mirroring the VM (`vm.rs:1332-1340`) with the
+/// `part == Initials` branch resolved at compile time from `ctx.step_part`. In
+/// the initials program the snapshot is not yet taken, so read
+/// `curr[module_off+off]` (the value being computed); in the flows/stocks
+/// programs read the post-initials `initial_values[module_off+off]` snapshot.
+fn emit_load_initial(off: u16, ctx: &EmitCtx, f: &mut Function) {
+    let chunk_base = if ctx.step_part == StepPart::Initials {
+        ctx.curr_base
+    } else {
+        ctx.initial_values_base
+    };
+    push_module_relative_base(ctx, f);
+    f.instruction(&Instruction::F64Load(memarg(slot_byte_offset(
+        chunk_base, off,
+    ))));
+}
+
+/// Lower `EvalModule { id, n_inputs }`, mirroring the VM (`vm.rs:1379-1443`).
+///
+/// The `n_inputs` operands are already on the wasm stack (the parent's bytecode
+/// pushed them, top = the last input). The VM pops them in reverse into
+/// `module_inputs[j]` (`for j in (0..n_inputs).rev()`), computes
+/// `child_module_off = module_off + context.modules[id].off`, then evaluates the
+/// child for the current `part`. Here:
+/// 1. pop the operands in reverse into the function's `module_input_scratch`
+///    f64 locals (`scratch[j]` for `j` from `n_inputs-1` down to 0), so
+///    `scratch[j]` holds input `j` -- identical to the VM's `module_inputs[j]`;
+/// 2. resolve the child instance's function index for `ctx.step_part` (the
+///    `EvalModule` site in the initials/flows/stocks program calls the child's
+///    initials/flows/stocks function -- the `StepPart` is compile-time per
+///    program; the instantiation graph is acyclic, so the index already exists);
+/// 3. push `child_module_off` (`module_off + decl.off`) then `scratch[0..k]` in
+///    order -- the child's `(module_off, in_0, .., in_{k-1})` argument list --
+///    and `call` it. The child reads/writes the shared slab at `module_off +
+///    off`, so threading the runtime `child_module_off` is what lets one
+///    `CompiledModule` run at several base offsets.
+fn emit_eval_module(
+    id: u16,
+    n_inputs: u8,
+    ctx: &EmitCtx,
+    f: &mut Function,
+) -> Result<(), WasmGenError> {
+    let decl = ctx.ctx.modules.get(id as usize).ok_or_else(|| {
+        WasmGenError::Unsupported(format!("wasmgen: EvalModule module id {id} out of range"))
+    })?;
+    let child_key = make_module_key(&decl.model_name, &decl.input_set);
+    let &fn_index = ctx
+        .module_fn_index
+        .get(&(child_key, ctx.step_part))
+        .ok_or_else(|| {
+            WasmGenError::Unsupported(format!(
+                "wasmgen: EvalModule child instance for module id {id} has no compiled function"
+            ))
+        })?;
+    let decl_off = i32::try_from(decl.off).map_err(|_| {
+        WasmGenError::Unsupported("wasmgen: module offset too large to lower".to_string())
+    })?;
+
+    // Pop the operands in reverse into the reverse-pop scratch f64 locals, so
+    // `scratch[j]` ends holding input `j` (exactly the VM's `module_inputs[j]`).
+    let n = u32::from(n_inputs);
+    for j in (0..n).rev() {
+        f.instruction(&Instruction::LocalSet(ctx.module_input_scratch_base + j));
+    }
+    // Push `child_module_off = module_off + decl.off`.
+    f.instruction(&Instruction::LocalGet(ctx.module_off_local));
+    f.instruction(&Instruction::I32Const(decl_off));
+    f.instruction(&Instruction::I32Add);
+    // Push the inputs back in order, then call the child's `part` function.
+    for j in 0..n {
+        f.instruction(&Instruction::LocalGet(ctx.module_input_scratch_base + j));
+    }
+    f.instruction(&Instruction::Call(fn_index));
+    Ok(())
+}
+
+/// Name an unsupported opcode without depending on `Debug` (feature-gated via
+/// `debug-derive`).
+fn unsupported_opcode(op: &Opcode) -> String {
+    let name = match op {
+        Opcode::PushSubscriptIndex { .. } => "PushSubscriptIndex",
+        Opcode::LoadSubscript { .. } => "LoadSubscript",
+        Opcode::LoadModuleInput { .. } => "LoadModuleInput",
+        Opcode::EvalModule { .. } => "EvalModule",
+        Opcode::Apply { .. } => "Apply",
+        Opcode::Lookup { .. } => "Lookup",
+        // Fused / superinstruction / array opcodes never reach a
+        // CompiledSimulation consumer, but name them defensively.
+        _ => "opcode",
+    };
+    format!("wasmgen: unsupported Opcode::{name}")
+}
+
+#[cfg(test)]
+#[path = "lower_tests.rs"]
+mod tests;
diff --git a/src/simlin-engine/src/wasmgen/lower_tests.rs b/src/simlin-engine/src/wasmgen/lower_tests.rs
new file mode 100644
index 000000000..f671d694d
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/lower_tests.rs
@@ -0,0 +1,5355 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+//! Tests for the bytecode-to-WebAssembly lowering ([`super`]). Split out of
+//! `lower.rs` to keep that file under the project line-count lint; this is the
+//! `#[cfg(test)] mod tests` body, included via `#[path]` so `use super::*`
+//! still resolves the lowering module's private items.
+
+use super::*;
+use checked::Store;
+use wasm::validate;
+use wasm_encoder::{
+    CodeSection, ExportKind, ExportSection, FunctionSection, MemorySection, MemoryType, Module,
+    TypeSection, ValType,
+};
+
+use crate::bytecode::ByteCodeContext;
+use std::sync::OnceLock;
+
+/// Local layout for the test harness function. The function takes `module_off`
+/// as param 0 (no f64 module-input params -- these are root-only lowering tests);
+/// the scratch f64 and condition i32(s) are declared locals, whose indices come
+/// from the production helpers (`scratch_local_for` / `condition_locals_for`).
+const L_MODULE_OFF: u32 = 0;
+
+/// A shared empty `ByteCodeContext` for the scalar-opcode tests, which never
+/// touch the array tables. Array-view tests build their own context (with
+/// `static_views`/`temp_offsets`) and an `EmitCtx` borrowing it locally.
+fn empty_ctx() -> &'static ByteCodeContext {
+    static EMPTY: OnceLock<ByteCodeContext> = OnceLock::new();
+    EMPTY.get_or_init(ByteCodeContext::default)
+}
+
+/// A shared empty `(ModuleKey, StepPart) -> fn index` map. These lowering unit
+/// tests build single root-only functions (0 module inputs) and never emit an
+/// `EvalModule`, so the map is never consulted. The whole-model `EvalModule` /
+/// `LoadModuleInput` parity is exercised end-to-end in `module.rs`'s tests.
+fn empty_module_fn_index()
+-> &'static std::collections::HashMap<(crate::vm::ModuleKey, StepPart), u32> {
+    static EMPTY: OnceLock<std::collections::HashMap<(crate::vm::ModuleKey, StepPart), u32>> =
+        OnceLock::new();
+    EMPTY.get_or_init(std::collections::HashMap::new)
+}
+
+/// A shared empty set of overridable constant offsets. These lowering unit
+/// tests never exercise an `AssignConstCurr` redirect (the set is empty, so
+/// every `AssignConstCurr` emits its immediate literal -- the pre-Task-2
+/// behavior these tests pin); the constants-region redirect is exercised
+/// end-to-end in `module.rs`'s `set_value`/`reset` tests.
+fn empty_const_offsets() -> &'static std::collections::HashSet { + static EMPTY: OnceLock> = OnceLock::new(); + EMPTY.get_or_init(std::collections::HashSet::new) +} + +fn ctx_with_cond_depth(depth: usize) -> EmitCtx<'static> { + // These tests build a root-only function: `module_off` is param 0, there are + // no f64 module-input params, so `n_inputs == 0` reproduces the historical + // (pre-Phase-7) local indices exactly (scratch at 1, conditions at 2..). + let n_inputs = 0; + EmitCtx { + curr_base: 0, + next_base: 4096, + // The non-Lookup opcode tests place no GF regions; these bases are + // unused by the opcodes they exercise. The Lookup-opcode tests + // (which do read these) build their own ctx with real GF bases. + gf_directory_base: 0, + gf_data_base: 0, + // The PREVIOUS/INIT opcode tests build their own ctx with real + // snapshot bases + flag; the rest never touch these fields. + initial_values_base: 0, + prev_values_base: 0, + use_prev_fallback_global: 0, + step_part: StepPart::Flows, + dt: 0.5, + start_time: 1.0, + final_time: 25.0, + module_off_local: L_MODULE_OFF, + scratch_local: scratch_local_for(n_inputs), + condition_locals: condition_locals_for(n_inputs, depth), + apply_locals: apply_locals_for(n_inputs, depth), + // The helper-function indices are deterministic (helpers occupy the + // module's first function slots), and `build_module` emits exactly + // these helper bodies ahead of `eval`, so the indices agree. + helpers: build_helpers().fns, + // The scalar-opcode tests place no temp region; the array-view tests + // build their own ctx with a real temp base + context. + temp_storage_base: 0, + // Dynamic-subscript scratch i32 locals (Task 4) follow the scratch + // f64 / condition i32s / Apply f64s / the vector-op scratch blocks; + // `build_module` declares exactly `count_extra_i32_locals(bc)` of them + // at this base. + extra_i32_local_base: extra_i32_local_base(n_inputs, depth), + // The fixed Phase-6 vector-op scratch local blocks. 
+ vector_f64_locals: vector_f64_locals_for(n_inputs, depth), + vector_i32_locals: vector_i32_locals_for(n_inputs, depth), + // The vector-op scratch region: well past TEMP_BASE (8192) but within + // the harness's single 64 KiB memory page, so the small test views' + // sort-pair / collected-value staging never collides with temp_storage. + vector_scratch_base: VECTOR_SCRATCH_BASE, + // The allocation scratch region: a separate high band, past the vector + // scratch and clear of temp_storage, sized for the tiny test views' + // request/profile/out staging. + alloc_scratch_base: ALLOC_SCRATCH_BASE, + // No `EvalModule` in these single-function tests: the reverse-pop scratch + // base sits past the extra-i32 block (none declared here), and the child + // function map is empty. + module_input_scratch_base: module_input_scratch_base(n_inputs, depth, 0), + // No overridable constants in these single-function tests: the constants + // region is unused and the offset set is empty (every `AssignConstCurr` + // emits its immediate literal -- the pre-Task-2 behavior). The Task-2 + // override redirect is exercised end-to-end in `module.rs`'s tests. + const_region_base: 0, + flows_const_offsets: empty_const_offsets(), + module_fn_index: empty_module_fn_index(), + ctx: empty_ctx(), + } +} + +/// Byte offset of the vector-op scratch region for the test harness. Past +/// `TEMP_BASE` (8192) and any small test temp region, with ~6000 f64 slots of +/// headroom before the 64 KiB page end -- ample for the tiny test views. +const VECTOR_SCRATCH_BASE: u32 = 16384; + +/// Byte offset of the allocation scratch region for the test harness. A high +/// band (~40 KiB) past `VECTOR_SCRATCH_BASE`, leaving room for both regions' +/// tiny test stagings within the single 64 KiB page. 
+const ALLOC_SCRATCH_BASE: u32 = 40960;
+
+fn bc(literals: Vec<f64>, code: Vec<Opcode>) -> ByteCode {
+    ByteCode { literals, code }
+}
+
+/// Build a module exporting `mem` and an `eval(module_off: i32)` function
+/// whose body is the lowered `bc`. When `with_result`, `eval` returns the
+/// f64 left on the stack. The function declares one scratch f64 local plus
+/// `cond_depth` i32 condition locals.
+///
+/// Mirrors `module.rs`'s production assembly: the emitted helper functions
+/// ([`build_helpers`]) occupy function indices `0..N` so the `call`s
+/// `emit_bytecode` generates resolve, and `eval` follows at index `N`.
+fn build_module(bc: &ByteCode, ctx: &EmitCtx, with_result: bool, cond_depth: usize) -> Vec<u8> {
+    let mut module = Module::new();
+
+    let helpers = build_helpers();
+    let n_helpers = helpers.functions.len() as u32;
+
+    // Type 0 is `eval`'s signature; each helper's signature follows.
+    let mut types = TypeSection::new();
+    if with_result {
+        types.ty().function([ValType::I32], [ValType::F64]);
+    } else {
+        types.ty().function([ValType::I32], []);
+    }
+    for hf in &helpers.functions {
+        types.ty().function(hf.params.clone(), hf.results.clone());
+    }
+    module.section(&types);
+
+    // Function indices follow declaration order: helpers first (0..N), then
+    // `eval` at N. Helper type indices are 1..=N (eval's type is 0).
+    let mut functions = FunctionSection::new();
+    for (i, _) in helpers.functions.iter().enumerate() {
+        functions.function(1 + i as u32);
+    }
+    functions.function(0);
+    module.section(&functions);
+
+    let mut memories = MemorySection::new();
+    memories.memory(MemoryType {
+        minimum: 1,
+        maximum: None,
+        memory64: false,
+        shared: false,
+        page_size_log2: None,
+    });
+    module.section(&memories);
+
+    let mut exports = ExportSection::new();
+    exports.export("eval", ExportKind::Func, n_helpers);
+    exports.export("mem", ExportKind::Memory, 0);
+    module.section(&exports);
+
+    let mut code = CodeSection::new();
+    for hf in helpers.functions {
+        code.function(&hf.body);
+    }
+    // 1 scratch f64 local, `cond_depth` i32 condition locals, the 3 `Apply`
+    // scratch f64 locals, the program's dynamic-subscript i32 scratch locals,
+    // and the `EvalModule` reverse-pop f64 scratch (none here -- root-only) --
+    // the same layout production uses.
+    let mut func = Function::new(opcode_fn_locals(
+        cond_depth,
+        count_extra_i32_locals(bc),
+        count_module_input_scratch(bc),
+    ));
+    emit_bytecode(bc, ctx, &mut func).expect("lowering should succeed");
+    func.instruction(&Instruction::End);
+    code.function(&func);
+    module.section(&code);
+
+    module.finish()
+}
+
+/// Emit, validate, instantiate, seed `curr`/`next` slots, run `eval(0)`,
+/// and either return its f64 result (`read_addr == None`) or the f64 at
+/// `read_addr`.
+fn run(
+    bc: &ByteCode,
+    ctx: &EmitCtx,
+    with_result: bool,
+    cond_depth: usize,
+    seed: &[(u64, f64)],
+    read_addr: Option<u64>,
+) -> f64 {
+    let bytes = build_module(bc, ctx, with_result, cond_depth);
+    let info = validate(&bytes).expect("emitted module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("emitted module must instantiate")
+        .module_addr;
+
+    if !seed.is_empty() {
+        let mem = store
+            .instance_export(module, "mem")
+            .unwrap()
+            .as_mem()
+            .unwrap();
+        store.mem_access_mut_slice(mem, |bytes| {
+            for &(addr, v) in seed {
+                let a = addr as usize;
+                bytes[a..a + 8].copy_from_slice(&v.to_le_bytes());
+            }
+        });
+    }
+
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+
+    match read_addr {
+        None => store
+            .invoke_simple_typed(eval, (0_i32,))
+            .expect("invocation must succeed"),
+        Some(addr) => {
+            store
+                .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+                .expect("invocation must succeed");
+            let mem = store
+                .instance_export(module, "mem")
+                .unwrap()
+                .as_mem()
+                .unwrap();
+            store.mem_access_mut_slice(mem, |bytes| {
+                let a = addr as usize;
+                f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
+            })
+        }
+    }
+}
+
+/// Evaluate a value program (with a 0-depth condition stack) and return its
+/// result.
+fn value(code: Vec<Opcode>, literals: Vec<f64>, seed: &[(u64, f64)]) -> f64 {
+    run(
+        &bc(literals, code),
+        &ctx_with_cond_depth(0),
+        true,
+        0,
+        seed,
+        None,
+    )
+}
+
+/// Run an assignment program and read back the stored slot.
+fn stored(code: Vec, literals: Vec, seed: &[(u64, f64)], read_addr: u64) -> f64 { + run( + &bc(literals, code), + &ctx_with_cond_depth(0), + false, + 0, + seed, + Some(read_addr), + ) +} + +fn op2(op: Op2) -> Opcode { + Opcode::Op2 { op } +} + +// ── LoadConstant ────────────────────────────────────────────────────── + +#[test] +fn lowers_load_constant() { + assert_eq!( + value(vec![Opcode::LoadConstant { id: 0 }], vec![3.5], &[]), + 3.5 + ); +} + +#[test] +fn lowers_load_constant_selects_right_literal() { + let code = vec![Opcode::LoadConstant { id: 2 }]; + assert_eq!(value(code, vec![1.0, 2.0, 42.0], &[]), 42.0); +} + +// ── LoadVar / LoadGlobalVar ─────────────────────────────────────────── + +#[test] +fn lowers_load_var_from_curr() { + // slot 4 of curr lives at byte 4*8 = 32; module_off is 0. + let code = vec![Opcode::LoadVar { off: 4 }]; + assert_eq!(value(code, vec![], &[(32, 7.0)]), 7.0); +} + +#[test] +fn lowers_load_global_var_absolute() { + // LoadGlobalVar reads slot `off` ignoring module_off; slot 0 (TIME) at + // byte 0. + let code = vec![Opcode::LoadGlobalVar { off: 0 }]; + assert_eq!(value(code, vec![], &[(0, 13.0)]), 13.0); +} + +#[test] +fn load_var_honors_module_off() { + // With a non-zero module_off, LoadVar{off:1} reads curr[module_off+1]; + // LoadGlobalVar{off:1} reads curr[1] regardless. We verify the dynamic + // base path by running eval with module_off=2 directly. + let ctx = ctx_with_cond_depth(0); + let program = bc(vec![], vec![Opcode::LoadVar { off: 1 }]); + let bytes = build_module(&program, &ctx, true, 0); + let info = validate(&bytes).expect("module must validate"); + let mut store = Store::new(()); + let module = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let mem = store + .instance_export(module, "mem") + .unwrap() + .as_mem() + .unwrap(); + // curr[3] at byte 24 (module_off=2 + off=1). 
+ store.mem_access_mut_slice(mem, |bytes| { + bytes[24..32].copy_from_slice(&99.0_f64.to_le_bytes()); + }); + let eval = store + .instance_export(module, "eval") + .unwrap() + .as_func() + .unwrap(); + let result: f64 = store.invoke_simple_typed(eval, (2_i32,)).expect("invoke"); + assert_eq!(result, 99.0); +} + +// ── Op2: arithmetic ─────────────────────────────────────────────────── + +#[test] +fn lowers_arithmetic_ops() { + let lc = |id| Opcode::LoadConstant { id }; + // 2 + 3 = 5 + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Add)], vec![2.0, 3.0], &[]), + 5.0 + ); + // 2 - 3 = -1 (operand order: l=2, r=3) + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Sub)], vec![2.0, 3.0], &[]), + -1.0 + ); + // 2 * 3 = 6 + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Mul)], vec![2.0, 3.0], &[]), + 6.0 + ); + // 3 / 2 = 1.5 (operand order: l=3, r=2) + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Div)], vec![3.0, 2.0], &[]), + 1.5 + ); +} + +#[test] +fn op2_operand_order_matches_vm() { + // The VM computes `l op r` with l pushed first. births = pop * rate: + // pop=slot4 (byte 32), constant rate. 
+ let code = vec![ + Opcode::LoadVar { off: 4 }, + Opcode::LoadConstant { id: 0 }, + op2(Op2::Mul), + ]; + assert_eq!(value(code, vec![0.1], &[(32, 100.0)]), 10.0); +} + +// ── Op2: comparisons yield f64 0.0/1.0 ──────────────────────────────── + +#[test] +fn lowers_comparisons_to_f64_bool() { + let lc = |id| Opcode::LoadConstant { id }; + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Gt)], vec![2.0, 1.0], &[]), + 1.0 + ); + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Gt)], vec![1.0, 2.0], &[]), + 0.0 + ); + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Gte)], vec![1.0, 1.0], &[]), + 1.0 + ); + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Lt)], vec![1.0, 2.0], &[]), + 1.0 + ); + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Lte)], vec![1.0, 1.0], &[]), + 1.0 + ); +} + +// ── Not ─────────────────────────────────────────────────────────────── + +#[test] +fn lowers_not_truthiness() { + let lc = |id| Opcode::LoadConstant { id }; + assert_eq!(value(vec![lc(0), Opcode::Not {}], vec![0.0], &[]), 1.0); + assert_eq!(value(vec![lc(0), Opcode::Not {}], vec![5.0], &[]), 0.0); +} + +// ── SetCond + If ────────────────────────────────────────────────────── + +/// `if cond then t else f`. Mirrors codegen's emission order: push t, push +/// f, push cond, SetCond, If. Run with a depth-1 condition stack. +fn if_program(cond: f64, t: f64, f: f64) -> f64 { + let code = vec![ + Opcode::LoadConstant { id: 1 }, // t + Opcode::LoadConstant { id: 2 }, // f + Opcode::LoadConstant { id: 0 }, // cond + Opcode::SetCond {}, + Opcode::If {}, + ]; + run( + &bc(vec![cond, t, f], code), + &ctx_with_cond_depth(1), + true, + 1, + &[], + None, + ) +} + +#[test] +fn lowers_if_selects_true_arm() { + assert_eq!(if_program(1.0, 10.0, 20.0), 10.0); +} + +#[test] +fn lowers_if_selects_false_arm_for_zero() { + assert_eq!(if_program(0.0, 10.0, 20.0), 20.0); +} + +#[test] +fn lowers_if_truthy_nonzero_is_true() { + // Any non-zero condition is true (matches the VM's is_truthy). 
+ assert_eq!(if_program(0.5, 10.0, 20.0), 10.0); + assert_eq!(if_program(-3.0, 10.0, 20.0), 10.0); +} + +#[test] +fn lowers_if_with_comparison_condition() { + // if pop > 50 then 1 else 0, pop in slot 4 (byte 32). + let code = vec![ + Opcode::LoadConstant { id: 0 }, // t = 1 + Opcode::LoadConstant { id: 1 }, // f = 0 + Opcode::LoadVar { off: 4 }, // pop + Opcode::LoadConstant { id: 2 }, // 50 + op2(Op2::Gt), + Opcode::SetCond {}, + Opcode::If {}, + ]; + let run_with = |seed: &[(u64, f64)]| { + run( + &bc(vec![1.0, 0.0, 50.0], code.clone()), + &ctx_with_cond_depth(1), + true, + 1, + seed, + None, + ) + }; + assert_eq!(run_with(&[(32, 100.0)]), 1.0); + assert_eq!(run_with(&[(32, 10.0)]), 0.0); +} + +#[test] +fn lowers_nested_if() { + // if (if a then b else c) then d else e. + // codegen order: push d, push e, then walk the cond which is the inner + // If (push b, push c, push a, SetCond_inner, If_inner), then + // SetCond_outer, If_outer. literals: a,b,c,d,e at 0..5. + let code = vec![ + Opcode::LoadConstant { id: 3 }, // d + Opcode::LoadConstant { id: 4 }, // e + Opcode::LoadConstant { id: 1 }, // b + Opcode::LoadConstant { id: 2 }, // c + Opcode::LoadConstant { id: 0 }, // a + Opcode::SetCond {}, // inner + Opcode::If {}, // inner -> b or c + Opcode::SetCond {}, // outer (cond = inner result) + Opcode::If {}, // outer -> d or e + ]; + let eval = |a: f64, b: f64, c: f64, d: f64, e: f64| { + run( + &bc(vec![a, b, c, d, e], code.clone()), + &ctx_with_cond_depth(2), + true, + 2, + &[], + None, + ) + }; + // a truthy -> inner = b. b truthy -> outer = d. + assert_eq!(eval(1.0, 1.0, 0.0, 100.0, 200.0), 100.0); + // a falsey -> inner = c. c falsey -> outer = e. + assert_eq!(eval(0.0, 1.0, 0.0, 100.0, 200.0), 200.0); + // a truthy -> inner = b=0 (falsey) -> outer = e. 
+ assert_eq!(eval(1.0, 0.0, 9.0, 100.0, 200.0), 200.0); +} + +// ── AssignCurr / AssignNext ─────────────────────────────────────────── + +#[test] +fn lowers_assign_curr_constant() { + // store 42.0 into curr slot 5 (byte 40), read it back. + let code = vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::AssignCurr { off: 5 }, + ]; + assert_eq!(stored(code, vec![42.0], &[], 40), 42.0); +} + +#[test] +fn lowers_assign_const_curr() { + // AssignConstCurr is emitted by base codegen for a constant-RHS + // assignment (e.g. a constant initial or aux): curr[off] = literals[id]. + // Store 7.0 into curr slot 6 (byte 48), read it back. + let code = vec![Opcode::AssignConstCurr { + off: 6, + literal_id: 0, + }]; + assert_eq!(stored(code, vec![7.0], &[], 48), 7.0); +} + +#[test] +fn assign_const_curr_honors_module_off() { + // With module_off=2, AssignConstCurr{off:1} writes curr[3] (byte 24). + let ctx = ctx_with_cond_depth(0); + let program = bc( + vec![3.5], + vec![Opcode::AssignConstCurr { + off: 1, + literal_id: 0, + }], + ); + let bytes = build_module(&program, &ctx, false, 0); + let info = validate(&bytes).expect("module must validate"); + let mut store = Store::new(()); + let module = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let eval = store + .instance_export(module, "eval") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(i32,), ()>(eval, (2_i32,)) + .expect("invoke"); + let mem = store + .instance_export(module, "mem") + .unwrap() + .as_mem() + .unwrap(); + let v = store.mem_access_mut_slice(mem, |bytes| { + f64::from_le_bytes(bytes[24..32].try_into().unwrap()) + }); + assert_eq!(v, 3.5); +} + +#[test] +fn lowers_bin_op_assign_curr() { + // BinOpAssignCurr is the peephole fusion of `Op2; AssignCurr`: pops + // [l, r], computes l op r, stores to curr[off]. Mirrors vm.rs:1457. + // deaths = pop / 80 -> curr slot 6 (byte 48); pop = slot 4 (byte 32). 
+ let code = vec![ + Opcode::LoadVar { off: 4 }, + Opcode::LoadConstant { id: 0 }, + Opcode::BinOpAssignCurr { + op: Op2::Div, + off: 6, + }, + ]; + assert_eq!(stored(code, vec![80.0], &[(32, 200.0)], 48), 2.5); +} + +#[test] +fn lowers_bin_op_assign_next() { + // BinOpAssignNext is the peephole fusion of `Op2; AssignNext` (stock + // integration): pops [l, r], computes l op r, stores to next[off]. + // next[pop] = pop + delta, with delta in curr slot 5. + // next slot 4 lives at next_base(4096) + 32 = 4128. + let code = vec![ + Opcode::LoadVar { off: 4 }, // pop + Opcode::LoadVar { off: 5 }, // delta + Opcode::BinOpAssignNext { + op: Op2::Add, + off: 4, + }, + ]; + // pop=100, delta=3.75 -> 103.75 + assert_eq!( + stored(code, vec![], &[(32, 100.0), (40, 3.75)], 4128), + 103.75 + ); +} + +#[test] +fn bin_op_assign_curr_operand_order_matches_vm() { + // Non-commutative op: l - r with l pushed first. + // result = a - b -> curr slot 5 (byte 40); a=slot 3 (24), b=slot 4 (32). + let code = vec![ + Opcode::LoadVar { off: 3 }, + Opcode::LoadVar { off: 4 }, + Opcode::BinOpAssignCurr { + op: Op2::Sub, + off: 5, + }, + ]; + assert_eq!(stored(code, vec![], &[(24, 10.0), (32, 3.0)], 40), 7.0); +} + +// Note: every `Op2` variant is supported as of Phase 2 (Mod/Exp landed in +// Task 3), so there is no longer an unsupported operator to drive the +// `BinOpAssign*` error-propagation path. The fused-`Mod` form is exercised +// for correctness by `bin_op_assign_curr_mod_stores_rem_euclid`; the +// clean-error-on-unsupported-*opcode* path is covered by +// `unsupported_lookup_returns_error` / `unsupported_array_opcode_returns_error`. + +#[test] +fn lowers_assign_curr_from_expr() { + // deaths = pop / 80 -> curr slot 6 (byte 48); pop = slot 4 (byte 32). 
+ let code = vec![ + Opcode::LoadVar { off: 4 }, + Opcode::LoadConstant { id: 0 }, + op2(Op2::Div), + Opcode::AssignCurr { off: 6 }, + ]; + assert_eq!(stored(code, vec![80.0], &[(32, 200.0)], 48), 2.5); +} + +#[test] +fn lowers_assign_next_euler_update() { + // next[pop] = pop + (births - deaths) * dt, all read from curr. + // pop=slot4 (32), births=slot5 (40), deaths=slot6 (48); dt=0.5 literal. + // next slot 4 lives at next_base(4096) + 32 = 4128. + let code = vec![ + Opcode::LoadVar { off: 4 }, // pop + Opcode::LoadVar { off: 5 }, // births + Opcode::LoadVar { off: 6 }, // deaths + op2(Op2::Sub), // births - deaths + Opcode::LoadConstant { id: 0 }, // dt + op2(Op2::Mul), // (births - deaths) * dt + op2(Op2::Add), // pop + ... + Opcode::AssignNext { off: 4 }, + ]; + // pop=100, births=10, deaths=2.5 -> 100 + 7.5*0.5 = 103.75 + let seed = &[(32, 100.0), (40, 10.0), (48, 2.5)]; + assert_eq!(stored(code, vec![0.5], seed, 4128), 103.75); +} + +#[test] +fn assign_next_honors_module_off() { + // With module_off=2, AssignNext{off:0} writes next[2]; next_base=4096, + // so byte 4096 + 2*8 = 4112. 
+ let ctx = ctx_with_cond_depth(0); + let program = bc( + vec![7.0], + vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::AssignNext { off: 0 }, + ], + ); + let bytes = build_module(&program, &ctx, false, 0); + let info = validate(&bytes).expect("module must validate"); + let mut store = Store::new(()); + let module = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let eval = store + .instance_export(module, "eval") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(i32,), ()>(eval, (2_i32,)) + .expect("invoke"); + let mem = store + .instance_export(module, "mem") + .unwrap() + .as_mem() + .unwrap(); + let v = store.mem_access_mut_slice(mem, |bytes| { + f64::from_le_bytes(bytes[4112..4120].try_into().unwrap()) + }); + assert_eq!(v, 7.0); +} + +// ── Ret is a no-op ──────────────────────────────────────────────────── + +#[test] +fn ret_emits_nothing() { + // A program that loads a constant then Ret leaves just the constant. + let code = vec![Opcode::LoadConstant { id: 0 }, Opcode::Ret]; + assert_eq!(value(code, vec![5.0], &[]), 5.0); +} + +// ── AC1.5: raw Op2::Div by zero matches IEEE / the VM ───────────────── + +#[test] +fn div_by_zero_matches_vm_ieee() { + let lc = |id| Opcode::LoadConstant { id }; + // x/0 -> +Inf + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Div)], vec![1.0, 0.0], &[]), + f64::INFINITY + ); + // -x/0 -> -Inf + assert_eq!( + value(vec![lc(0), lc(1), op2(Op2::Div)], vec![-1.0, 0.0], &[]), + f64::NEG_INFINITY + ); + // 0/0 -> NaN + let nan = value(vec![lc(0), lc(1), op2(Op2::Div)], vec![0.0, 0.0], &[]); + assert!(nan.is_nan()); +} + +// ── AC1.4: unsupported opcodes return a clean error, never a panic ──── + +#[test] +fn op2_eq_lowers_without_error() { + // Eq is now supported (routed through the approx_eq helper), so lowering + // must succeed where Phase 1 returned Unsupported. Numeric parity is + // covered by the dedicated approx_eq / Op2::Eq tests below. 
+ let mut func = Function::new([]); + let program = bc(vec![1.0, 2.0], vec![op2(Op2::Eq)]); + let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func); + assert!(result.is_ok(), "Op2::Eq should lower without error"); +} + +#[test] +fn op2_mod_lowers_without_error() { + // Mod is now supported (rem_euclid via the mod_euclid helper); lowering + // must succeed where Phase 1 returned Unsupported. + let mut func = Function::new([]); + let program = bc(vec![], vec![op2(Op2::Mod)]); + let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func); + assert!(result.is_ok(), "Op2::Mod should lower without error"); +} + +#[test] +fn op2_exp_lowers_without_error() { + // Exp is now supported (powf via the pow helper). + let mut func = Function::new([]); + let program = bc(vec![], vec![op2(Op2::Exp)]); + let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func); + assert!(result.is_ok(), "Op2::Exp should lower without error"); +} + +// ── Op2::Exp (pow) / Op2::Mod (rem_euclid) numeric parity ───────────── + +/// Evaluate `l Op2::Exp r` (push l, push r, Op2::Exp) -> f64. +fn eval_exp(l: f64, r: f64) -> f64 { + value( + vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::LoadConstant { id: 1 }, + op2(Op2::Exp), + ], + vec![l, r], + &[], + ) +} + +#[test] +fn op2_exp_matches_powf_for_positive_base() { + // The VM's `eval_op2` Exp is `l.powf(r)`. The wasm `pow` helper matches + // `powf` for a positive base across integer/fractional/negative + // exponents; assert within the documented helper tolerance. 
+ let bases: [f64; 6] = [0.5, 1.0, 2.0, 3.7, 10.0, 100.0]; + let exps: [f64; 9] = [-3.0, -1.5, -1.0, 0.0, 0.5, 1.0, 2.0, 2.5, 7.0]; + for &l in &bases { + for &r in &exps { + let want = l.powf(r); + let got = eval_exp(l, r); + let abs = (got - want).abs(); + let rel = if want != 0.0 { abs / want.abs() } else { abs }; + assert!( + abs <= 1e-9 || rel <= 1e-9, + "Exp({l}, {r}): got {got}, want {want} (abs {abs:.3e}, rel {rel:.3e})", + ); + } + } + // x == 1 and y == 0 are the helper's exact short-circuits. + assert_eq!(eval_exp(1.0, 42.0), 1.0); + assert_eq!(eval_exp(7.0, 0.0), 1.0); +} + +/// Evaluate `l Op2::Mod r` (push l, push r, Op2::Mod) -> f64. +fn eval_mod(l: f64, r: f64) -> f64 { + value( + vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::LoadConstant { id: 1 }, + op2(Op2::Mod), + ], + vec![l, r], + &[], + ) +} + +#[test] +fn op2_mod_matches_rem_euclid_all_sign_combos() { + // The VM's `eval_op2` Mod is `l.rem_euclid(r)` (result in [0, |r|)), + // NOT a truncated remainder. Cover all four sign combinations and + // non-integer operands. + let cases: &[(f64, f64)] = &[ + (7.0, 3.0), + (-7.0, 3.0), + (7.0, -3.0), + (-7.0, -3.0), + (7.5, 2.5), + (-7.5, 2.5), + (7.5, -2.5), + (-7.5, -2.5), + (5.3, 2.1), + (-5.3, 2.1), + (5.3, -2.1), + (-5.3, -2.1), + (0.0, 3.0), + (3.0, 3.0), + (-3.0, 3.0), + (2.0, 4.0), + ]; + for &(l, r) in cases { + let want = l.rem_euclid(r); + let got = eval_mod(l, r); + assert!( + (got - want).abs() < 1e-12, + "Mod({l}, {r}): got {got}, want {want}", + ); + // The euclidean remainder is always in [0, |r|). + assert!( + (0.0..r.abs()).contains(&got), + "Mod({l}, {r}) = {got} not in [0, {})", + r.abs(), + ); + } +} + +#[test] +fn bin_op_assign_curr_mod_stores_rem_euclid() { + // The peephole-fused `Op2::Mod; AssignCurr` form must also lower (it was + // an Unsupported case in Phase 1). -7 mod 3 = 2 -> curr slot 5 (byte 40). 
+ let code = vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::LoadConstant { id: 1 }, + Opcode::BinOpAssignCurr { + op: Op2::Mod, + off: 5, + }, + ]; + assert_eq!(stored(code, vec![-7.0, 3.0], &[], 40), 2.0); +} + +#[test] +fn apply_lowers_without_error() { + // Apply is supported as of Phase 2 Task 4; lowering must succeed where + // Phase 1 returned Unsupported. (Numeric parity is covered by the + // dedicated per-builtin tests below.) + let mut func = Function::new([]); + let program = bc( + vec![], + vec![Opcode::Apply { + func: BuiltinId::Abs, + }], + ); + let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func); + assert!(result.is_ok(), "Apply should lower without error"); +} + +#[test] +fn lookup_lowers_without_error() { + // Lookup is supported as of Phase 3; lowering must succeed where Phase 2 + // returned Unsupported. (Numeric parity is covered by the seeded-table + // tests below and the end-to-end GF model tests in module.rs.) + let mut func = Function::new(opcode_fn_locals(0, 0, 0)); + let program = bc( + vec![0.0, 1.0], + vec![ + Opcode::LoadConstant { id: 0 }, // element_offset + Opcode::LoadConstant { id: 1 }, // index + Opcode::Lookup { + base_gf: 0, + table_count: 1, + mode: LookupMode::Interpolate, + }, + ], + ); + let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func); + assert!(result.is_ok(), "Lookup should lower without error"); +} + +#[test] +fn unsupported_array_opcode_returns_error() { + // The reducers, static view ops, and iteration loops are supported as of + // Phase 5 Tasks 1-3, so this drives a still-unsupported module opcode + // (`EvalModule`, Phase 7) to confirm an unhandled opcode still returns a + // clean error rather than a wrong module. 
+ let mut func = Function::new([]); + let program = bc(vec![], vec![Opcode::EvalModule { id: 0, n_inputs: 0 }]); + let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func); + assert!(matches!(result, Err(WasmGenError::Unsupported(_)))); +} + +#[test] +fn begin_iter_on_empty_view_stack_errors() { + // A `BeginIter` with no view pushed first is malformed bytecode: it must + // error cleanly (empty-view-stack), not panic. + let mut func = Function::new([]); + let program = bc( + vec![], + vec![Opcode::BeginIter { + write_temp_id: 0, + has_write_temp: false, + }], + ); + let result = emit_bytecode(&program, &ctx_with_cond_depth(0), &mut func); + assert!(matches!(result, Err(WasmGenError::Unsupported(_)))); +} + +// ── Lookup opcode: seeded-table parity with the VM lookup functions ─── + +// GF region bases for the Lookup opcode tests, placed well past +// `next_base` (4096) so they cannot overlap the curr/next chunks. The +// single test table's directory entry sits at `GF_DIR_BASE`; its data +// follows at `GF_DATA_BASE`. +const GF_DIR_BASE: u32 = 8192; +const GF_DATA_BASE: u32 = 8192 + 8; // one 8-byte directory entry + +/// A ctx whose GF region bases point at the hand-seeded test regions, so a +/// `Lookup` opcode reads the directory at `GF_DIR_BASE`. +fn ctx_with_gf() -> EmitCtx<'static> { + EmitCtx { + gf_directory_base: GF_DIR_BASE, + gf_data_base: GF_DATA_BASE, + ..ctx_with_cond_depth(0) + } +} + +/// Pack a GF directory entry `(data_off, count)` into the f64 whose 8 LE +/// bytes are `data_off` (low i32) then `count` (high i32) -- so seeding it as +/// one f64 writes exactly the two i32 the `Lookup` opcode reads. +/// +/// Assumes a little-endian test host: the low 32 bits land at the lower +/// address, matching production's `to_le_bytes` directory encoding (the +/// opcode reads `data_off` at offset 0 and `count` at offset 4). 
+fn dir_entry_f64(data_off: u32, count: u32) -> f64 {
+    // `count` fills the high 32 bits and `data_off` the low 32 bits of the
+    // f64's raw bit pattern.
+    let high_word = u64::from(count) << 32;
+    let low_word = u64::from(data_off);
+    f64::from_bits(high_word | low_word)
+}
+
+/// Build the memory seed for one GF table (`base_gf == 0`,
+/// `table_count == 1`): the directory entry at `GF_DIR_BASE` comes first,
+/// followed by each knot as an `(x, y)` pair of f64s -- 16 bytes per knot --
+/// starting at `GF_DATA_BASE`.
+fn seed_single_table(knots: &[(f64, f64)]) -> Vec<(u64, f64)> {
+    let data_start = u64::from(GF_DATA_BASE);
+    let directory = (
+        u64::from(GF_DIR_BASE),
+        dir_entry_f64(GF_DATA_BASE, knots.len() as u32),
+    );
+    // x lands at the knot's base address, y eight bytes after it.
+    let knot_cells = knots.iter().enumerate().flat_map(|(i, &(x, y))| {
+        let x_addr = data_start + (i as u64) * 16;
+        [(x_addr, x), (x_addr + 8, y)]
+    });
+    std::iter::once(directory).chain(knot_cells).collect()
+}
+
+/// Evaluate a `Lookup` at `(element_offset, index)` against one seeded table.
+/// `table_count` is a parameter so a test can make `element_offset` fall
+/// outside the valid range.
+fn run_lookup_opcode(
+    mode: LookupMode,
+    knots: &[(f64, f64)],
+    table_count: u16,
+    element_offset: f64,
+    index: f64,
+) -> f64 {
+    // Literal 0 is the element offset (pushed first); literal 1 is the index
+    // (pushed second, so it sits on top of the stack when `Lookup` runs).
+    let ops = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadConstant { id: 1 },
+        Opcode::Lookup {
+            base_gf: 0,
+            table_count,
+            mode,
+        },
+    ];
+    let program = bc(vec![element_offset, index], ops);
+    let ctx = ctx_with_gf();
+    let seed = seed_single_table(knots);
+    run(&program, &ctx, true, 0, &seed, None)
+}
+
+/// The VM oracle for `mode` -- the exact function the opcode dispatches to.
+fn vm_lookup_oracle(mode: LookupMode, knots: &[(f64, f64)], index: f64) -> f64 {
+    // One VM lookup function per mode; nothing else is shared between arms.
+    match mode {
+        LookupMode::Interpolate => crate::vm::lookup(knots, index),
+        LookupMode::Forward => crate::vm::lookup_forward(knots, index),
+        LookupMode::Backward => crate::vm::lookup_backward(knots, index),
+    }
+}
+
+/// Run the `Lookup` opcode over `knots` (single table, element offset 0) and
+/// require the same answer as the VM oracle. NaN is compared with `is_nan`
+/// because NaN never compares equal to itself.
+fn assert_lookup_opcode_matches_vm(mode: LookupMode, knots: &[(f64, f64)], index: f64) {
+    let got = run_lookup_opcode(mode, knots, 1, 0.0, index);
+    let want = vm_lookup_oracle(mode, knots, index);
+    if want.is_nan() {
+        assert!(got.is_nan(), "{mode:?} at {index}: expected NaN, got {got}");
+        return;
+    }
+    assert_eq!(got, want, "{mode:?} at {index}: got {got}, want {want}");
+}
+
+/// Shared four-knot table for the single-table `Lookup` opcode tests.
+const LOOKUP_OPCODE_TABLE: &[(f64, f64)] = &[
+    (0.0, 10.0),
+    (1.0, 20.0),
+    (2.5, 5.0),
+    (4.0, 40.0),
+];
+
+#[test]
+fn lookup_opcode_dispatches_to_each_mode_and_reads_directory() {
+    // Each lookup reads (data_off, count) out of the directory and then
+    // dispatches to the helper for its mode. Every mode is compared with the
+    // VM oracle below the range, above it, exactly on knots, and between
+    // knots.
+    let modes = [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ];
+    let probes = [-1.0, 0.0, 0.5, 1.0, 1.75, 2.5, 3.0, 4.0, 9.0];
+    for mode in modes {
+        for index in probes {
+            assert_lookup_opcode_matches_vm(mode, LOOKUP_OPCODE_TABLE, index);
+        }
+    }
+}
+
+#[test]
+fn lookup_opcode_out_of_range_element_offset_is_nan() {
+    // The VM pushes NaN when element_offset < 0 or >= table_count, BEFORE
+    // touching the table; the opcode must match (the directory is seeded for
+    // table 0 only, so an OOB offset must short-circuit, never read garbage).
+    let modes = [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ];
+    for mode in modes {
+        // Fixed table, table_count = 1 and index 2.0; only the offset varies.
+        let at_offset = |element_offset: f64| {
+            run_lookup_opcode(mode, LOOKUP_OPCODE_TABLE, 1, element_offset, 2.0)
+        };
+        // With table_count = 1, offsets 1 and -1 both fall outside the range.
+        assert!(
+            at_offset(1.0).is_nan(),
+            "{mode:?}: element_offset == table_count must be NaN"
+        );
+        assert!(
+            at_offset(-1.0).is_nan(),
+            "{mode:?}: negative element_offset must be NaN"
+        );
+        // Offset 0 is in range, so an in-range index must yield a real value.
+        assert!(
+            !at_offset(0.0).is_nan(),
+            "{mode:?}: in-range element_offset must not be NaN"
+        );
+    }
+}
+
+#[test]
+fn lookup_opcode_nan_index_is_nan() {
+    // A NaN index with a valid element offset must produce NaN in every mode.
+    let modes = [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ];
+    for mode in modes {
+        let result = run_lookup_opcode(mode, LOOKUP_OPCODE_TABLE, 1, 0.0, f64::NAN);
+        assert!(result.is_nan(), "{mode:?}: a NaN index must be NaN");
+    }
+}
+
+// ── Lookup opcode: runtime table selection across TWO tables ──────────
+//
+// The single-table parity tests above always pass `element_offset == 0`, so
+// the directory-indexing arithmetic in `push_gf_directory_addr`
+// (`gf_directory_base + (base_gf + element_offset) * 8`) is only exercised
+// for offset 0 -- the `* 8` stride and the offset add are never tested with
+// a nonzero offset (the out-of-range tests short-circuit to NaN before the
+// directory read). Phase 5/7 lower an arrayed scalar `Lookup` to a runtime
+// per-element `element_offset` that selects a per-element table, so the
+// table-selection path must be pinned here.
+
+// Two-table layout: a 2-entry directory at `GF2_DIR_BASE`, then each
+// table's knots laid out back-to-back past the directory.
+const GF2_DIR_BASE: u32 = 8192;
+const GF2_TABLE0_DATA: u32 = GF2_DIR_BASE + 2 * 8; // past two 8-byte entries
+// Table 0 has two knots (4 f64 = 32 bytes); table 1's data follows.
+const GF2_TABLE1_DATA: u32 = GF2_TABLE0_DATA + 2 * 16;
+
+/// Seed two GF tables so that directory entry `t` (`t ∈ {0,1}`) points at
+/// `table_t`'s knots. Mirrors the production directory layout the opcode
+/// reads via `push_gf_directory_addr`.
+fn seed_two_tables(table0: &[(f64, f64)], table1: &[(f64, f64)]) -> Vec<(u64, f64)> {
+    let tables = [(GF2_TABLE0_DATA, table0), (GF2_TABLE1_DATA, table1)];
+    let dir_start = u64::from(GF2_DIR_BASE);
+
+    // Directory first: one 8-byte `(data_off, count)` entry per table.
+    let mut seed: Vec<(u64, f64)> = tables
+        .iter()
+        .enumerate()
+        .map(|(t, &(data, knots))| {
+            (
+                dir_start + (t as u64) * 8,
+                dir_entry_f64(data, knots.len() as u32),
+            )
+        })
+        .collect();
+
+    // Then every table's knots, 16 bytes each: x at +0, y at +8.
+    for (data, knots) in tables {
+        let data_start = u64::from(data);
+        seed.extend(knots.iter().enumerate().flat_map(|(k, &(x, y))| {
+            let x_addr = data_start + (k as u64) * 16;
+            [(x_addr, x), (x_addr + 8, y)]
+        }));
+    }
+    seed
+}
+
+/// Evaluate a `Lookup` whose `element_offset` is a literal constant against
+/// the two-table directory (`base_gf == 0`, `table_count == 2`).
+fn run_lookup_two_tables(
+    mode: LookupMode,
+    table0: &[(f64, f64)],
+    table1: &[(f64, f64)],
+    element_offset: f64,
+    index: f64,
+) -> f64 {
+    // Literal 0 is the element offset (pushed first); literal 1 is the index
+    // (pushed second, so it is on top when `Lookup` executes).
+    let ops = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadConstant { id: 1 },
+        Opcode::Lookup {
+            base_gf: 0,
+            table_count: 2,
+            mode,
+        },
+    ];
+    let ctx = EmitCtx {
+        gf_directory_base: GF2_DIR_BASE,
+        // The opcode never consults `gf_data_base` at runtime (each table's
+        // data offset comes out of its directory entry); it points at table
+        // 0's data only so the ctx stays internally consistent.
+        gf_data_base: GF2_TABLE0_DATA,
+        ..ctx_with_cond_depth(0)
+    };
+    let program = bc(vec![element_offset, index], ops);
+    let seed = seed_two_tables(table0, table1);
+    run(&program, &ctx, true, 0, &seed, None)
+}
+
+#[test]
+fn lookup_opcode_selects_table_by_element_offset() {
+    // The two tables disagree at the probe index in ALL three modes, so
+    // reading the wrong table is visible whichever mode is under test:
+    //   table 0: y = 10x        index 5 -> interp 50,  fwd 100, bwd 0
+    //   table 1: y = x/10 + 1   index 5 -> interp 1.5, fwd 2,   bwd 1
+    let table0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)];
+    let table1: &[(f64, f64)] = &[(0.0, 1.0), (10.0, 2.0)];
+    let index = 5.0;
+    let modes = [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ];
+
+    for mode in modes {
+        let select = |element_offset: f64| {
+            run_lookup_two_tables(mode, table0, table1, element_offset, index)
+        };
+
+        // Guard the test itself: if the oracles agreed, a mis-selected table
+        // would pass unnoticed.
+        let want0 = vm_lookup_oracle(mode, table0, index);
+        let want1 = vm_lookup_oracle(mode, table1, index);
+        assert_ne!(
+            want0, want1,
+            "{mode:?}: tables must differ at the probe index to detect mis-selection"
+        );
+
+        // Offset 1 must land on table 1, i.e. match the VM oracle over
+        // table 1 (and hence differ from table 0).
+        let got = select(1.0);
+        assert_eq!(
+            got, want1,
+            "{mode:?}: element_offset==1 must read table 1: got {got}, want {want1}"
+        );
+
+        // Offset 0 must still land on table 0, proving the offset really
+        // selects rather than always mapping to table 1.
+        let got0 = select(0.0);
+        assert_eq!(
+            got0, want0,
+            "{mode:?}: element_offset==0 must read table 0: got {got0}, want {want0}"
+        );
+    }
+}
+
+// ── LoadInitial / LoadPrev opcodes (Task 1: snapshot regions) ─────────
+
+// Snapshot region bases for these tests, placed past `next_base` (4096) so
+// they cannot overlap the curr/next chunks.
+const INITIAL_BASE: u32 = 8192;
+const PREV_BASE: u32 = 8192 + 4096;
+
+/// In the flows/stocks programs `LoadInitial` must read `initial_values[off]`
+/// (the post-initials snapshot) rather than `curr`. The same slot is seeded
+/// with a different value in each region, so reading the wrong one shows up
+/// in the result.
+#[test]
+fn load_initial_in_flows_reads_initial_values_region() {
+    let ctx = EmitCtx {
+        initial_values_base: INITIAL_BASE,
+        step_part: StepPart::Flows,
+        ..ctx_with_cond_depth(0)
+    };
+    // Slot 2 is byte 16: curr[2] = 111, initial_values[2] = 222.
+    let slot_byte = 16u64;
+    let seed = [
+        (slot_byte, 111.0),
+        (u64::from(INITIAL_BASE) + slot_byte, 222.0),
+    ];
+    let program = bc(vec![], vec![Opcode::LoadInitial { off: 2 }]);
+    let got = run(&program, &ctx, true, 0, &seed, None);
+    assert_eq!(got, 222.0, "LoadInitial in Flows must read initial_values");
+}
+
+/// In the initials program `LoadInitial` reads `curr[off]` (the value being
+/// computed), because the snapshot is not yet taken (`vm.rs:1334`).
+#[test]
+fn load_initial_in_initials_reads_curr() {
+    let ctx = EmitCtx {
+        initial_values_base: INITIAL_BASE,
+        step_part: StepPart::Initials,
+        ..ctx_with_cond_depth(0)
+    };
+    // Same seeding as the Flows test: curr[2] = 111, initial_values[2] = 222.
+    let slot_byte = 16u64;
+    let seed = [
+        (slot_byte, 111.0),
+        (u64::from(INITIAL_BASE) + slot_byte, 222.0),
+    ];
+    let program = bc(vec![], vec![Opcode::LoadInitial { off: 2 }]);
+    let got = run(&program, &ctx, true, 0, &seed, None);
+    assert_eq!(got, 111.0, "LoadInitial in Initials must read curr");
+}
+
+/// `LoadInitial` adds `module_off` to its operand: with a non-zero module
+/// base it reads `initial_values[module_off + off]`.
+#[test]
+fn load_initial_honors_module_off() {
+    let ctx = EmitCtx {
+        initial_values_base: INITIAL_BASE,
+        step_part: StepPart::Stocks,
+        ..ctx_with_cond_depth(0)
+    };
+    let program = bc(vec![], vec![Opcode::LoadInitial { off: 1 }]);
+    let wasm = build_module(&program, &ctx, true, 0);
+    let info = validate(&wasm).expect("module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+
+    // Resolve both exports up front, then seed memory, then call `eval`.
+    let mem = store
+        .instance_export(module, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+
+    // module_off = 2 and off = 1 address initial_values[3], which sits at
+    // INITIAL_BASE + 24.
+    store.mem_access_mut_slice(mem, |memory| {
+        let start = (INITIAL_BASE + 24) as usize;
+        let cell = &mut memory[start..start + 8];
+        cell.copy_from_slice(&77.0_f64.to_le_bytes());
+    });
+
+    let result = store
+        .invoke_simple_typed::<(i32,), f64>(eval, (2_i32,))
+        .expect("invoke");
+    assert_eq!(
+        result, 77.0,
+        "LoadInitial must read initial_values[module_off+off]"
+    );
+}
+
+/// Build a module exporting `mem`, a mutable i32 global `use_prev_fallback`
+/// (at index 0, the index the test ctx names), and an `eval(module_off: i32)
+/// -> f64` whose body lowers `LoadConstant(fallback); LoadPrev{off}`. The
+/// helper functions lead the function/code sections so any `call` resolves;
+/// `eval` follows. `fallback_flag` is the global's init value (1 = use the
+/// fallback, 0 = read prev_values).
+fn build_load_prev_module(off: u16, fallback: f64, fallback_flag: i32) -> Vec<u8> {
+    let mut module = Module::new();
+    let helpers = build_helpers();
+    let n_helpers = helpers.functions.len() as u32;
+
+    // Type 0 is `eval`'s signature (i32) -> f64; helper types follow at 1..
+    let mut types = TypeSection::new();
+    types.ty().function([ValType::I32], [ValType::F64]); // eval
+    for hf in &helpers.functions {
+        types.ty().function(hf.params.clone(), hf.results.clone());
+    }
+    module.section(&types);
+
+    // Helper `i` uses type `1 + i`; `eval` comes last and uses type 0, which
+    // puts it at function index `n_helpers`.
+    let mut functions = FunctionSection::new();
+    for helper_idx in 0..n_helpers {
+        functions.function(1 + helper_idx);
+    }
+    functions.function(0); // eval -> type 0
+    module.section(&functions);
+
+    let mut memories = MemorySection::new();
+    memories.memory(MemoryType {
+        minimum: 1,
+        maximum: None,
+        memory64: false,
+        shared: false,
+        page_size_log2: None,
+    });
+    module.section(&memories);
+
+    // The single mutable i32 global the LoadPrev ctx gates on (index 0).
+    let mut globals = wasm_encoder::GlobalSection::new();
+    globals.global(
+        wasm_encoder::GlobalType {
+            val_type: ValType::I32,
+            mutable: true,
+            shared: false,
+        },
+        &wasm_encoder::ConstExpr::i32_const(fallback_flag),
+    );
+    module.section(&globals);
+
+    let mut exports = ExportSection::new();
+    exports.export("eval", ExportKind::Func, n_helpers);
+    exports.export("mem", ExportKind::Memory, 0);
+    module.section(&exports);
+
+    let ctx = EmitCtx {
+        prev_values_base: PREV_BASE,
+        use_prev_fallback_global: 0,
+        ..ctx_with_cond_depth(0)
+    };
+    let program = bc(
+        vec![fallback],
+        vec![Opcode::LoadConstant { id: 0 }, Opcode::LoadPrev { off }],
+    );
+
+    // Code bodies must appear in the same order as the function section:
+    // every helper first, then `eval`.
+    let mut code = CodeSection::new();
+    for hf in helpers.functions {
+        code.function(&hf.body);
+    }
+    let mut func = Function::new(opcode_fn_locals(0, 0, 0));
+    emit_bytecode(&program, &ctx, &mut func).expect("LoadPrev should lower");
+    func.instruction(&Instruction::End);
+    code.function(&func);
+    module.section(&code);
+
+    module.finish()
+}
+
+/// Run `LoadConstant(fallback); LoadPrev{off}` with `prev_values[off]` seeded
+/// to `prev_value` and the gate set to `fallback_flag`.
+fn run_load_prev(off: u16, fallback: f64, prev_value: f64, fallback_flag: i32) -> f64 {
+    let bytes = build_load_prev_module(off, fallback, fallback_flag);
+    let info = validate(&bytes).expect("LoadPrev module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let mem = store
+        .instance_export(module, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    // Write `prev_value` into slot `off` of the prev_values region.
+    store.mem_access_mut_slice(mem, |bytes| {
+        let a = (PREV_BASE + u32::from(off) * SLOT_SIZE) as usize;
+        bytes[a..a + 8].copy_from_slice(&prev_value.to_le_bytes());
+    });
+    let eval = store
+        .instance_export(module, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store.invoke_simple_typed(eval, (0_i32,)).expect("invoke")
+}
+
+/// `LoadPrev` returns the caller-supplied fallback while `use_prev_fallback`
+/// is set (1), exactly as the VM does before the first snapshot
+/// (`vm.rs:1322`). The seeded `prev_values` value must NOT be read.
+#[test]
+fn load_prev_returns_fallback_when_flag_set() {
+    let got = run_load_prev(2, 3.5, 999.0, 1);
+    assert_eq!(got, 3.5, "with the flag set, LoadPrev yields its fallback");
+}
+
+/// `LoadPrev` reads `prev_values[off]` once `use_prev_fallback` is cleared
+/// (0), exactly as the VM does after the first snapshot (`vm.rs:1325`).
+#[test]
+fn load_prev_reads_prev_values_when_flag_clear() {
+    let got = run_load_prev(2, 3.5, 999.0, 0);
+    assert_eq!(
+        got, 999.0,
+        "with the flag clear, LoadPrev reads prev_values"
+    );
+}
+
+// ── approx_eq helper (AC7.2, AC1.5) ───────────────────────────────────
+
+/// Build a module exporting `eq(a: f64, b: f64) -> i32` whose body is just
+/// `local.get a; local.get b; call approx_eq`, directly exercising the
+/// emitted helper in isolation. The helper functions are placed at indices
+/// `0..N` (so the `call` resolves) and `eq` follows at index `N`.
+fn build_approx_eq_module() -> Vec<u8> {
+    let mut module = Module::new();
+
+    let helpers = build_helpers();
+    let n_helpers = helpers.functions.len() as u32;
+
+    // Type 0 is `eq`'s signature (f64, f64) -> i32; helper types follow.
+    let mut types = TypeSection::new();
+    types
+        .ty()
+        .function([ValType::F64, ValType::F64], [ValType::I32]);
+    for hf in &helpers.functions {
+        types.ty().function(hf.params.clone(), hf.results.clone());
+    }
+    module.section(&types);
+
+    // Helper `i` uses type `1 + i`; `eq` is declared last with type 0, which
+    // puts it at function index `n_helpers`.
+    let mut functions = FunctionSection::new();
+    for helper_idx in 0..n_helpers {
+        functions.function(1 + helper_idx);
+    }
+    functions.function(0);
+    module.section(&functions);
+
+    // The GF lookup helpers (`super::lookup`) `f64.load` from memory 0, so
+    // a module that includes every helper body must declare a memory even
+    // though `eq` itself never touches it.
+    let mut memories = MemorySection::new();
+    memories.memory(MemoryType {
+        minimum: 1,
+        maximum: None,
+        memory64: false,
+        shared: false,
+        page_size_log2: None,
+    });
+    module.section(&memories);
+
+    let mut exports = ExportSection::new();
+    exports.export("eq", ExportKind::Func, n_helpers);
+    module.section(&exports);
+
+    // Code bodies follow the function-section order: helpers, then `eq`.
+    let mut code = CodeSection::new();
+    for hf in helpers.functions {
+        code.function(&hf.body);
+    }
+    // `eq` declares no extra locals; locals 0 and 1 are its two parameters.
+    let mut eq = Function::new([]);
+    eq.instruction(&Instruction::LocalGet(0));
+    eq.instruction(&Instruction::LocalGet(1));
+    eq.instruction(&Instruction::Call(helpers.fns.approx_eq));
+    eq.instruction(&Instruction::End);
+    code.function(&eq);
+    module.section(&code);
+
+    module.finish()
+}
+
+/// Run the emitted `approx_eq` helper on `(a, b)` under the interpreter,
+/// returning its i32 result (1 = approximately equal). Built once per call
+/// (cheap; the sample sizes are small).
+fn run_approx_eq(a: f64, b: f64) -> i32 {
+    // A fresh module and store are built on every call; see the note on
+    // cost in this function's doc comment.
+    let bytes = build_approx_eq_module();
+    let info = validate(&bytes).expect("approx_eq module must validate");
+    let mut store = Store::new(());
+    let module = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("approx_eq module must instantiate")
+        .module_addr;
+    let eq = store
+        .instance_export(module, "eq")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(f64, f64), i32>(eq, (a, b))
+        .expect("eq invocation must succeed")
+}
+
+/// Assert the emitted helper agrees with the Rust `crate::float::approx_eq`
+/// oracle for both argument orders (the function is symmetric).
+fn assert_approx_eq_matches_oracle(a: f64, b: f64) {
+    // Forward order: (a, b).
+    let oracle = crate::float::approx_eq(a, b) as i32;
+    assert_eq!(
+        run_approx_eq(a, b),
+        oracle,
+        "approx_eq({a:?}, {b:?}) disagreed with oracle {oracle}"
+    );
+    // Swapped order: (b, a), checked against its own oracle call.
+    let oracle_swapped = crate::float::approx_eq(b, a) as i32;
+    assert_eq!(
+        run_approx_eq(b, a),
+        oracle_swapped,
+        "approx_eq({b:?}, {a:?}) disagreed with oracle {oracle_swapped}"
+    );
+}
+
+/// Move `x` by `k` ULPs in raw-bit order (the increment the float-cmp ordered
+/// map measures within a sign). For small `|k|` and finite `x` this yields a
+/// value the oracle judges 0..|k| ULPs away.
+///
+/// The arithmetic is done on the raw bit pattern reinterpreted as `i64` with
+/// wrapping addition, so it never panics on overflow.
+fn nudge_ulps(x: f64, k: i64) -> f64 {
+    f64::from_bits(((x.to_bits() as i64).wrapping_add(k)) as u64)
+}
+
+#[test]
+fn approx_eq_matches_oracle_curated() {
+    // The exact edge cases the task enumerates.
+    let na = crate::float::NA; // finite -2^109 sentinel, NOT NaN.
+    let cases: &[(f64, f64)] = &[
+        // exact-equal
+        (1.0, 1.0),
+        (0.0, 0.0),
+        (-3.5, -3.5),
+        (1e300, 1e300),
+        // far apart
+        (1.0, 2.0),
+        (0.0, 1e100),
+        (-1e9, 1e9),
+        // 1-4 ULP apart around 1.0
+        (1.0, nudge_ulps(1.0, 1)),
+        (1.0, nudge_ulps(1.0, 2)),
+        (1.0, nudge_ulps(1.0, 3)),
+        (1.0, nudge_ulps(1.0, 4)),
+        // 5 ULPs apart (just past the threshold) around a larger magnitude,
+        // paired with the 4-ULP case at the same magnitude
+        (1000.0, nudge_ulps(1000.0, 5)),
+        (1000.0, nudge_ulps(1000.0, 4)),
+        // f64::EPSILON-apart around 1.0 (the absolute-epsilon check)
+        (1.0, 1.0 + f64::EPSILON),
+        (1.0, 1.0 - f64::EPSILON),
+        // around zero (subnormals and tiny values straddling the epsilon)
+        (0.0, f64::from_bits(1)), // smallest subnormal
+        (0.0, -f64::from_bits(1)), // negative smallest subnormal
+        (0.0, f64::EPSILON), // EPSILON away from zero
+        (0.0, 1e-300), // tiny normal, within epsilon
+        (f64::MIN_POSITIVE, -f64::MIN_POSITIVE), // straddle zero by the smallest positive normal
+        // signed zeros
+        (0.0, -0.0),
+        // NaN cases
+        (f64::NAN, f64::NAN),
+        (f64::NAN, 1.0),
+        (f64::NAN, 0.0),
+        // the finite :NA: sentinel
+        (na, na),
+        (na, 0.0),
+        (na, 1.0),
+        (na, -(2.0_f64).powi(110)),
+        // infinities
+        (f64::INFINITY, f64::INFINITY),
+        (f64::NEG_INFINITY, f64::NEG_INFINITY),
+        (f64::INFINITY, f64::NEG_INFINITY),
+        (f64::INFINITY, f64::MAX),
+        (f64::NEG_INFINITY, f64::MIN),
+    ];
+    for &(a, b) in cases {
+        assert_approx_eq_matches_oracle(a, b);
+    }
+}
+
+#[test]
+fn approx_eq_matches_oracle_randomized() {
+    use rand::prelude::*;
+    // Fixed seed: a sampled-but-reproducible parity sweep against the oracle.
+    // NOTE: the sampled values depend on the order of the `rng` draws below;
+    // reordering them changes which pairs are tested.
+    let mut rng = StdRng::seed_from_u64(0xA222_02EE);
+    for _ in 0..400 {
+        // A diverse magnitude/sign base value.
+        let exp = rng.random_range(-308i32..=308);
+        let mantissa: f64 = rng.random_range(-1.0..1.0);
+        let x = mantissa * 10f64.powi(exp);
+
+        // ULP-adjacent partner (often within the 4-ULP threshold, sometimes
+        // just past it), exercising the ULP path on both sides of the gap.
+        let k = rng.random_range(-8i64..=8);
+        assert_approx_eq_matches_oracle(x, nudge_ulps(x, k));
+
+        // An independent unrelated value (usually far apart -> ULP + epsilon
+        // both fail), exercising the false path.
+        let exp2 = rng.random_range(-308i32..=308);
+        let y: f64 = rng.random_range(-1.0..1.0) * 10f64.powi(exp2);
+        assert_approx_eq_matches_oracle(x, y);
+
+        // Near-zero straddling pairs (the epsilon absolute check region).
+        let tiny_a = rng.random_range(-1.0..1.0) * f64::EPSILON;
+        let tiny_b = rng.random_range(-1.0..1.0) * f64::EPSILON;
+        assert_approx_eq_matches_oracle(tiny_a, tiny_b);
+    }
+}
+
+// ── Op2::Eq / And / Or / Not / SetCond+If route through approx_eq ─────
+
+/// Evaluate `l Op2::Eq r` (push l, push r, Op2::Eq) and return the f64 bool.
+fn eval_eq(l: f64, r: f64) -> f64 {
+    // Literal 0 is `l`, literal 1 is `r`.
+    let lit = vec![l, r];
+    value(
+        vec![
+            Opcode::LoadConstant { id: 0 },
+            Opcode::LoadConstant { id: 1 },
+            op2(Op2::Eq),
+        ],
+        lit,
+        &[],
+    )
+}
+
+#[test]
+fn op2_eq_matches_vm_for_ulp_adjacent_operands() {
+    // Raw `==` would call these unequal, but the VM's `approx_eq` (and so the
+    // wasm) calls them equal: 1 ULP and EPSILON-apart around 1.0.
+    let one_ulp = nudge_ulps(1.0, 1);
+    assert_eq!(eval_eq(1.0, one_ulp), 1.0);
+    assert_eq!(eval_eq(1.0, 1.0 + f64::EPSILON), 1.0);
+    // 5 ULPs apart at a larger magnitude: past the threshold -> not equal.
+    assert_eq!(eval_eq(1000.0, nudge_ulps(1000.0, 5)), 0.0);
+    // Exact and far-apart anchors.
+    assert_eq!(eval_eq(2.5, 2.5), 1.0);
+    assert_eq!(eval_eq(1.0, 2.0), 0.0);
+    // NaN == NaN is true under approx_eq (identical bits -> 0 ULPs).
+    assert_eq!(eval_eq(f64::NAN, f64::NAN), 1.0);
+    assert_eq!(eval_eq(f64::NAN, 1.0), 0.0);
+}
+
+#[test]
+fn op2_eq_matches_vm_oracle_over_sample() {
+    // The whole-expression Eq lowering must agree with the VM's eval_op2 Eq
+    // (= approx_eq as f64) across the curated edge values.
+    let na = crate::float::NA;
+    let cases: &[(f64, f64)] = &[
+        (1.0, nudge_ulps(1.0, 3)),
+        (1.0, nudge_ulps(1.0, 4)),
+        (1.0, nudge_ulps(1.0, 5)),
+        (0.0, -0.0),
+        (0.0, f64::EPSILON),
+        (na, na),
+        (na, 0.0),
+        (f64::INFINITY, f64::INFINITY),
+        (f64::INFINITY, f64::NEG_INFINITY),
+    ];
+    for &(l, r) in cases {
+        // bool -> i8 -> f64 turns the oracle's answer into 1.0 or 0.0.
+        let expected = crate::float::approx_eq(l, r) as i8 as f64;
+        assert_eq!(eval_eq(l, r), expected, "Eq({l:?}, {r:?})");
+    }
+}
+
+/// Evaluate `l Op2::And r` / `l Op2::Or r`.
+///
+/// `op` is passed straight to `op2`, after `l` and then `r` are pushed.
+fn eval_logical(op: Op2, l: f64, r: f64) -> f64 {
+    value(
+        vec![
+            Opcode::LoadConstant { id: 0 },
+            Opcode::LoadConstant { id: 1 },
+            op2(op),
+        ],
+        vec![l, r],
+        &[],
+    )
+}
+
+/// The VM's truthiness: `is_truthy(n) = !approx_eq(n, 0.0)`.
+fn vm_is_truthy(n: f64) -> bool {
+    !crate::float::approx_eq(n, 0.0)
+}
+
+#[test]
+fn op2_and_or_match_vm_truthiness() {
+    // EPSILON is falsy (within epsilon of 0); a small-but-not-epsilon value
+    // is truthy. These are exactly where raw `!= 0.0` would diverge from the
+    // VM.
+    let eps = f64::EPSILON;
+    let small = 0.001;
+    let operands = [
+        0.0,
+        -0.0,
+        eps,
+        -eps,
+        small,
+        -small,
+        1.0,
+        f64::NAN,
+        f64::INFINITY,
+    ];
+    // Full cross product: every operand appears on both the left and right.
+    for &l in &operands {
+        for &r in &operands {
+            let and_expected = (vm_is_truthy(l) && vm_is_truthy(r)) as i8 as f64;
+            let or_expected = (vm_is_truthy(l) || vm_is_truthy(r)) as i8 as f64;
+            assert_eq!(
+                eval_logical(Op2::And, l, r),
+                and_expected,
+                "And({l:?}, {r:?})"
+            );
+            assert_eq!(eval_logical(Op2::Or, l, r), or_expected, "Or({l:?}, {r:?})");
+        }
+    }
+}
+
+#[test]
+fn op2_and_or_operand_order_preserved() {
+    // And/Or stash the right operand in the scratch local; verify a
+    // non-symmetric truthiness pairing still combines correctly in both
+    // operand orders.
+    // (truthy AND falsy) = 0; (truthy OR falsy) = 1.
+    assert_eq!(eval_logical(Op2::And, 5.0, 0.0), 0.0);
+    assert_eq!(eval_logical(Op2::And, 0.0, 5.0), 0.0);
+    assert_eq!(eval_logical(Op2::Or, 5.0, 0.0), 1.0);
+    assert_eq!(eval_logical(Op2::Or, 0.0, 5.0), 1.0);
+}
+
+#[test]
+fn bin_op_assign_and_uses_scratch_safely() {
+    // BinOpAssignCurr{And} fuses the And reduction with a store; the And
+    // lowering reuses the scratch local, which emit_assign then overwrites.
+    // Verify the stored result is correct. (truthy AND truthy) = 1 -> slot 5
+    // (read back at byte 40).
+    let code = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadConstant { id: 1 },
+        Opcode::BinOpAssignCurr {
+            op: Op2::And,
+            off: 5,
+        },
+    ];
+    assert_eq!(stored(code, vec![3.0, 7.0], &[], 40), 1.0);
+    // (truthy AND falsy) = 0.
+    let code0 = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadConstant { id: 1 },
+        Opcode::BinOpAssignCurr {
+            op: Op2::And,
+            off: 5,
+        },
+    ];
+    assert_eq!(stored(code0, vec![3.0, 0.0], &[], 40), 0.0);
+}
+
+#[test]
+fn not_matches_vm_approx_eq_truthiness() {
+    // Not(n) = (!is_truthy(n)) as f64 = approx_eq(n, 0.0) as f64.
+    // EPSILON is "false" so Not(EPSILON) = 1.0; small-but-not-epsilon is
+    // "true" so Not(0.001) = 0.0.
+    let operands = [0.0, -0.0, f64::EPSILON, -f64::EPSILON, 0.001, 1.0, f64::NAN];
+    for &n in &operands {
+        let expected = (!vm_is_truthy(n)) as i8 as f64;
+        let got = value(
+            vec![Opcode::LoadConstant { id: 0 }, Opcode::Not {}],
+            vec![n],
+            &[],
+        );
+        assert_eq!(got, expected, "Not({n:?})");
+    }
+}
+
+#[test]
+fn setcond_if_uses_approx_eq_truthiness() {
+    // `if cond then t else f` with the condition routed through approx_eq.
+    // EPSILON is falsy -> selects the else arm; 0.001 is truthy -> then arm.
+    // Literals are [cond, 10.0, 20.0], so the then arm yields 10.0 and the
+    // else arm 20.0.
+    let if_eval = |cond: f64| {
+        let code = vec![
+            Opcode::LoadConstant { id: 1 }, // t
+            Opcode::LoadConstant { id: 2 }, // f
+            Opcode::LoadConstant { id: 0 }, // cond
+            Opcode::SetCond {},
+            Opcode::If {},
+        ];
+        run(
+            &bc(vec![cond, 10.0, 20.0], code),
+            &ctx_with_cond_depth(1),
+            true,
+            1,
+            &[],
+            None,
+        )
+    };
+    // Falsy conditions (within epsilon of 0) -> else (20.0).
+    assert_eq!(if_eval(0.0), 20.0);
+    assert_eq!(if_eval(-0.0), 20.0);
+    assert_eq!(if_eval(f64::EPSILON), 20.0);
+    assert_eq!(if_eval(-f64::EPSILON), 20.0);
+    // Truthy conditions -> then (10.0).
+    assert_eq!(if_eval(0.001), 10.0);
+    assert_eq!(if_eval(1.0), 10.0);
+    assert_eq!(if_eval(f64::NAN), 10.0); // is_truthy(NaN) is true
+    assert_eq!(if_eval(f64::INFINITY), 10.0);
+}
+
+// ── Apply: per-builtin parity with the VM's apply() ───────────────────
+
+/// Run `Apply{func}` over the three operands `(a, b, c)` with `time`/`dt`
+/// seeded into the reserved global slots (TIME at byte 0, DT at byte 8 of
+/// `curr`). The program pushes a, b, c then `Apply`, so `c` is on top --
+/// matching the VM's pop order.
+fn apply_eval(func: BuiltinId, a: f64, b: f64, c: f64, time: f64, dt: f64) -> f64 {
+    let code = vec![
+        Opcode::LoadConstant { id: 0 },
+        Opcode::LoadConstant { id: 1 },
+        Opcode::LoadConstant { id: 2 },
+        Opcode::Apply { func },
+    ];
+    // Seed TIME (slot 0 -> byte 0) and DT (slot 1 -> byte 8) of curr.
+    value(code, vec![a, b, c], &[(0, time), (8, dt)])
+}
+
+/// `step`/`ramp`/`pulse` reproduced verbatim from `vm.rs` so the per-builtin
+/// tests compare the wasm output to the exact formula the VM's `apply()`
+/// uses, not to libm.
+fn vm_step(time: f64, dt: f64, height: f64, step_time: f64) -> f64 { + if time + dt / 2.0 > step_time { + height + } else { + 0.0 + } +} +fn vm_ramp(time: f64, slope: f64, start: f64, end: f64) -> f64 { + if time > start { + if time >= end { + slope * (end - start) + } else { + slope * (time - start) + } + } else { + 0.0 + } +} +fn vm_pulse(time: f64, dt: f64, volume: f64, first: f64, interval: f64) -> f64 { + if time < first { + return 0.0; + } + let mut next = first; + while time >= next { + if time < next + dt { + return volume / dt; + } else if interval <= 0.0 { + break; + } else { + next += interval; + } + } + 0.0 +} + +/// Assert a wasm `Apply` result equals an exact f64 value (for the +/// non-transcendental builtins, which the wasm reproduces bit-for-bit). +fn assert_apply_exact(func: BuiltinId, a: f64, b: f64, c: f64, time: f64, dt: f64, want: f64) { + let got = apply_eval(func, a, b, c, time, dt); + if want.is_nan() { + assert!(got.is_nan(), "apply result expected NaN, got {got}"); + } else { + assert_eq!(got, want, "apply({a},{b},{c},t={time},dt={dt})"); + } +} + +#[test] +fn apply_abs_sqrt_int() { + assert_apply_exact(BuiltinId::Abs, -3.5, 0.0, 0.0, 0.0, 1.0, 3.5); + assert_apply_exact(BuiltinId::Abs, 3.5, 0.0, 0.0, 0.0, 1.0, 3.5); + assert_apply_exact(BuiltinId::Sqrt, 16.0, 0.0, 0.0, 0.0, 1.0, 4.0); + // Int is floor, NOT trunc: floor(-2.5) = -3 (trunc would give -2). 
+ assert_apply_exact(BuiltinId::Int, -2.5, 0.0, 0.0, 0.0, 1.0, (-2.5f64).floor()); + assert_apply_exact(BuiltinId::Int, 2.9, 0.0, 0.0, 0.0, 1.0, 2.0); + assert_apply_exact(BuiltinId::Int, -2.9, 0.0, 0.0, 0.0, 1.0, -3.0); +} + +#[test] +fn apply_min_max() { + assert_apply_exact(BuiltinId::Max, 3.0, 7.0, 0.0, 0.0, 1.0, 7.0); + assert_apply_exact(BuiltinId::Max, 7.0, 3.0, 0.0, 0.0, 1.0, 7.0); + assert_apply_exact(BuiltinId::Min, 3.0, 7.0, 0.0, 0.0, 1.0, 3.0); + assert_apply_exact(BuiltinId::Min, 7.0, 3.0, 0.0, 0.0, 1.0, 3.0); + assert_apply_exact(BuiltinId::Max, -1.0, -5.0, 0.0, 0.0, 1.0, -1.0); + assert_apply_exact(BuiltinId::Min, -1.0, -5.0, 0.0, 0.0, 1.0, -5.0); +} + +#[test] +fn apply_sign() { + assert_apply_exact(BuiltinId::Sign, 5.0, 0.0, 0.0, 0.0, 1.0, 1.0); + assert_apply_exact(BuiltinId::Sign, -5.0, 0.0, 0.0, 0.0, 1.0, -1.0); + // Sign(0) = 0 exactly (the VM's `else` branch). + assert_apply_exact(BuiltinId::Sign, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0); + assert_apply_exact(BuiltinId::Sign, -0.0, 0.0, 0.0, 0.0, 1.0, 0.0); +} + +#[test] +fn apply_quantum() { + // q == 0 -> x (exact ==, returns a unchanged). + assert_apply_exact(BuiltinId::Quantum, 3.7, 0.0, 0.0, 0.0, 1.0, 3.7); + // q != 0 -> (x/q).trunc() * q. + assert_apply_exact( + BuiltinId::Quantum, + 7.0, + 2.0, + 0.0, + 0.0, + 1.0, + (7.0f64 / 2.0).trunc() * 2.0, + ); + assert_apply_exact( + BuiltinId::Quantum, + -7.0, + 2.0, + 0.0, + 0.0, + 1.0, + (-7.0f64 / 2.0).trunc() * 2.0, + ); + assert_apply_exact( + BuiltinId::Quantum, + 5.5, + 0.5, + 0.0, + 0.0, + 1.0, + (5.5f64 / 0.5).trunc() * 0.5, + ); +} + +#[test] +fn apply_safe_div() { + // b != 0 -> a/b. + assert_apply_exact(BuiltinId::SafeDiv, 6.0, 3.0, 99.0, 0.0, 1.0, 2.0); + // b == 0 -> c (the default), via exact `!= 0.0`. + assert_apply_exact(BuiltinId::SafeDiv, 6.0, 0.0, 99.0, 0.0, 1.0, 99.0); + // A subnormal (non-zero) denominator still divides, NOT falls back. 
+ let sub = f64::from_bits(1); + assert_apply_exact(BuiltinId::SafeDiv, 6.0, sub, 99.0, 0.0, 1.0, 6.0 / sub); + // -0.0 is == 0.0, so it falls back to c (matches the VM's `b != 0.0`). + assert_apply_exact(BuiltinId::SafeDiv, 6.0, -0.0, 99.0, 0.0, 1.0, 99.0); +} + +#[test] +fn apply_sshape() { + // b + (c-b)/(1 + exp(-4*(2a-1))), within the exp helper's tolerance. + for &a in &[0.0f64, 0.25, 0.5, 0.75, 1.0] { + let want = 2.0 + (8.0 - 2.0) / (1.0 + (-4.0 * (2.0 * a - 1.0)).exp()); + let got = apply_eval(BuiltinId::Sshape, a, 2.0, 8.0, 0.0, 1.0); + assert!( + (got - want).abs() < 1e-9, + "Sshape({a}): got {got}, want {want}", + ); + } +} + +#[test] +fn apply_transcendentals_match_libm() { + // Each transcendental Apply arm calls the Task 2 helper on `a`; assert + // it lands within the helpers' documented tolerance of Rust f64. + let close = |func: BuiltinId, a: f64, want: f64| { + let got = apply_eval(func, a, 0.0, 0.0, 0.0, 1.0); + assert!( + (got - want).abs() < 1e-8 || (got - want).abs() / want.abs() < 1e-8, + "{func:?}({a}): got {got}, want {want}", + ); + }; + close(BuiltinId::Exp, 1.5, 1.5f64.exp()); + close(BuiltinId::Ln, 7.0, 7.0f64.ln()); + close(BuiltinId::Log10, 1000.0, 3.0); + close(BuiltinId::Sin, 0.7, 0.7f64.sin()); + close(BuiltinId::Cos, 0.7, 0.7f64.cos()); + close(BuiltinId::Tan, 0.7, 0.7f64.tan()); + close(BuiltinId::Arcsin, 0.5, 0.5f64.asin()); + close(BuiltinId::Arccos, 0.5, 0.5f64.acos()); + close(BuiltinId::Arctan, 2.0, 2.0f64.atan()); +} + +#[test] +fn apply_step_across_breakpoint() { + // step(time, dt, height=a, step_time=b) = if time+dt/2 > b {a} else 0. + let dt = 0.5; + for &t in &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0] { + let want = vm_step(t, dt, 10.0, 3.0); + assert_apply_exact(BuiltinId::Step, 10.0, 3.0, 0.0, t, dt, want); + } +} + +#[test] +fn apply_ramp_across_breakpoints() { + // ramp(time, slope=a, start=b, end=c) over its three regimes. 
+ for &t in &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0] { + let want = vm_ramp(t, 2.0, 2.0, 5.0); + assert_apply_exact(BuiltinId::Ramp, 2.0, 2.0, 5.0, t, 1.0, want); + } +} + +#[test] +fn apply_pulse_across_intervals() { + // pulse(time, dt, volume=a, first=b, interval=c) across several periods, + // including the no-repeat (interval == 0) case. + let dt = 1.0; + for &t in &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0] { + // Repeating pulse: volume 4, first at t=2, every 3. + assert_apply_exact( + BuiltinId::Pulse, + 4.0, + 2.0, + 3.0, + t, + dt, + vm_pulse(t, dt, 4.0, 2.0, 3.0), + ); + // Single pulse: interval 0 -> fires once at t in [first, first+dt). + assert_apply_exact( + BuiltinId::Pulse, + 4.0, + 2.0, + 0.0, + t, + dt, + vm_pulse(t, dt, 4.0, 2.0, 0.0), + ); + } +} + +#[test] +fn apply_inf_pi() { + assert_apply_exact(BuiltinId::Inf, 0.0, 0.0, 0.0, 0.0, 1.0, f64::INFINITY); + assert_apply_exact(BuiltinId::Pi, 0.0, 0.0, 0.0, 0.0, 1.0, std::f64::consts::PI); +} + +#[test] +fn apply_inside_if_does_not_clobber_condition() { + // An `Apply` in an If arm shares the function with the condition local; + // the dedicated apply locals must not collide. Build (codegen-padded + // Apply operands): `if cond then ABS(a) else f`, cond truthy. 
+ let padded = vec![ + Opcode::LoadConstant { id: 1 }, // a = -4 (the `then` operand) + Opcode::LoadConstant { id: 3 }, // pad b = 0 + Opcode::LoadConstant { id: 3 }, // pad c = 0 + Opcode::Apply { + func: BuiltinId::Abs, + }, // ABS(-4) = 4 -> the `then` value + Opcode::LoadConstant { id: 2 }, // f = 99 + Opcode::LoadConstant { id: 0 }, // cond = 1 (truthy) + Opcode::SetCond {}, + Opcode::If {}, + ]; + let got = run( + &bc(vec![1.0, -4.0, 99.0, 0.0], padded), + &ctx_with_cond_depth(1), + true, + 1, + &[], + None, + ); + assert_eq!(got, 4.0, "Apply in an If-then arm should yield ABS(-4)=4"); +} + +// ── max_condition_depth ─────────────────────────────────────────────── + +#[test] +fn max_condition_depth_counts_nesting() { + // Single If: depth 1. + let single = bc(vec![], vec![Opcode::SetCond {}, Opcode::If {}]); + assert_eq!(max_condition_depth(&single), 1); + + // Two sequential Ifs: still depth 1 (LIFO, fully popped between). + let sequential = bc( + vec![], + vec![ + Opcode::SetCond {}, + Opcode::If {}, + Opcode::SetCond {}, + Opcode::If {}, + ], + ); + assert_eq!(max_condition_depth(&sequential), 1); + + // Interleaved: SetCond, SetCond, If, If -> depth 2. Current codegen + // never emits this (it walks a condition to completion before its + // SetCond, so nested IFs come out sequentially); this guards the + // defensive stack-sizing against a future interleaved emission. + let nested = bc( + vec![], + vec![ + Opcode::SetCond {}, + Opcode::SetCond {}, + Opcode::If {}, + Opcode::If {}, + ], + ); + assert_eq!(max_condition_depth(&nested), 2); + + // No conditions: depth 0. 
+ let none = bc(vec![], vec![Opcode::LoadConstant { id: 0 }]); + assert_eq!(max_condition_depth(&none), 0); +} + +// ════════════════════════════════════════════════════════════════════════ +// Phase 5 Task 1: temp-element reads (LoadTempConst / LoadTempDynamic) +// +// The compile-time view-descriptor stack + the static view ops' addressing +// are pinned directly against the VM's `RuntimeView` in `views.rs`'s unit +// tests (no wasm or reducer needed); here the LoadTemp opcodes -- which read +// `temp_storage` and produce a value on the arithmetic stack -- are run under +// DLR-FT to confirm the emitted reads hit the temp region the VM addresses. +// ════════════════════════════════════════════════════════════════════════ + +// Region base for the temp-storage reads: well past `next_base` (4096) so it +// cannot overlap the curr/next chunks. +const TEMP_BASE: u32 = 8192; + +/// Build an `EmitCtx` over a real `ByteCodeContext` (so the temp opcodes can +/// resolve `temp_offsets`), with `temp_storage_base` set. +fn ctx_with_arrays(context: &ByteCodeContext) -> EmitCtx<'_> { + EmitCtx { + temp_storage_base: TEMP_BASE, + ctx: context, + ..ctx_with_cond_depth(0) + } +} + +#[test] +fn load_temp_const_reads_temp_storage() { + // temp_offsets = [0, 4]; LoadTempConst{temp_id:1, index:2} reads + // temp_storage[4 + 2] = temp slot 6 (byte TEMP_BASE + 6*8). + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0, 4], 8); + let ctx = ctx_with_arrays(&context); + let code = vec![Opcode::LoadTempConst { + temp_id: 1, + index: 2, + }]; + let seed = vec![(u64::from(TEMP_BASE) + 6 * 8, 42.0)]; + let got = run(&bc(vec![], code), &ctx, true, 0, &seed, None); + assert_eq!(got, 42.0); +} + +#[test] +fn load_temp_dynamic_reads_temp_storage() { + // LoadTempDynamic{temp_id:0} pops a runtime index (floor) and reads + // temp_storage[temp_offsets[0] + index]. Push index 3 via a constant. 
+ let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 5); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::LoadConstant { id: 0 }, // index = 3.0 + Opcode::LoadTempDynamic { temp_id: 0 }, + ]; + let seed = vec![(u64::from(TEMP_BASE) + 3 * 8, 77.0)]; + let got = run(&bc(vec![3.0], code), &ctx, true, 0, &seed, None); + assert_eq!(got, 77.0); +} + +#[test] +fn load_temp_dynamic_floors_fractional_index() { + // The VM does `stack.pop().floor() as usize`; index 2.9 -> slot 2. + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 4); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::LoadTempDynamic { temp_id: 0 }, + ]; + let seed = vec![(u64::from(TEMP_BASE) + 2 * 8, 13.0)]; + let got = run(&bc(vec![2.9], code), &ctx, true, 0, &seed, None); + assert_eq!(got, 13.0); +} + +// ════════════════════════════════════════════════════════════════════════ +// Phase 5 Task 2: array reducers (Sum/Max/Min/Mean/Stddev/Size) +// +// These run the emitted reducers under DLR-FT and assert the result matches +// the VM's own addressing oracle (`RuntimeView::flat_offset`, via +// `StaticArrayView::to_runtime_view`) folded per the matching VM reducer arm +// (`vm.rs:2216-2309`). The view transform opcodes the production codegen does +// not emit directly (it bakes constant subscripts into one `PushStaticView`) +// are exercised here on a `PushVarView` base so each `apply_*` is reduced +// over and checked against the VM. Reuses `TEMP_BASE` / `ctx_with_arrays` +// from the Task 1 section above. 
+// ════════════════════════════════════════════════════════════════════════ + +use crate::bytecode::{ + DimensionInfo, RuntimeSparseMapping, RuntimeView, StaticArrayView, SubdimensionRelation, +}; +use smallvec::SmallVec; + +fn seed_run(base_byte: u64, values: &[f64]) -> Vec<(u64, f64)> { + values + .iter() + .enumerate() + .map(|(i, &v)| (base_byte + (i as u64) * 8, v)) + .collect() +} + +/// Read element `iter_idx` of `view` from a flat slab `data` indexed by slot, +/// using the VM's own addressing (`to_runtime_view().flat_offset`). The +/// addressing oracle for every reducer parity check. +fn vm_view_element(view: &StaticArrayView, data: &[f64], iter_idx: usize) -> f64 { + let rv = view.to_runtime_view(); + let n = rv.dims.len(); + let mut indices: SmallVec<[u16; 4]> = smallvec::smallvec![0; n]; + let mut remaining = iter_idx; + for d in (0..n).rev() { + let dim = rv.dims[d] as usize; + indices[d] = (remaining % dim) as u16; + remaining /= dim; + } + let flat = rv.flat_offset(&indices); + data[rv.base_off as usize + flat] +} + +/// The VM's expected `ArraySum` over `view`'s elements drawn from `data`. +fn vm_sum(view: &StaticArrayView, data: &[f64]) -> f64 { + (0..view.to_runtime_view().size()) + .map(|i| vm_view_element(view, data, i)) + .sum() +} + +fn dense_view(base_off: u32, dims: &[u16]) -> StaticArrayView { + // Row-major strides for a dense contiguous array. + let mut strides: SmallVec<[i32; 4]> = SmallVec::new(); + let mut s = 1i32; + for &d in dims.iter().rev() { + strides.push(s); + s *= d as i32; + } + strides.reverse(); + StaticArrayView { + base_off, + is_temp: false, + dims: dims.iter().copied().collect(), + strides, + offset: 0, + sparse: SmallVec::new(), + dim_ids: dims.iter().map(|_| 0u16).collect(), + } +} + +/// Compile+run `PushStaticView(view); ; PopView` over a `curr` array +/// seeded from `data` (slot 0 of curr is byte 0). 
+fn run_static_reduce(view: StaticArrayView, reduce: Opcode, data: &[f64]) -> f64 { + let mut context = ByteCodeContext::default(); + let view_id = context.add_static_view(view); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::PushStaticView { view_id }, + reduce, + Opcode::PopView {}, + ]; + run(&bc(vec![], code), &ctx, true, 0, &seed_run(0, data), None) +} +// ── Task 1: PushStaticView addressing across geometries ─────────────── + +#[test] +fn static_view_sum_contiguous_matches_vm() { + // A bare 1-D contiguous view over curr slots 0..4. + let data = [10.0, 20.0, 30.0, 40.0]; + let view = dense_view(0, &[4]); + let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data); + assert_eq!(got, vm_sum(&view, &data)); + assert_eq!(got, 100.0); +} + +#[test] +fn static_view_sum_with_offset_matches_vm() { + // A range slice source[3:5] over a 5-element array bakes into `offset=2` + // (0-based start), dims=[3]. Elements are data[2], data[3], data[4]. + let data = [1.0, 2.0, 3.0, 4.0, 5.0]; + let mut view = dense_view(0, &[3]); + view.offset = 2; + let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data); + assert_eq!(got, vm_sum(&view, &data)); + assert_eq!(got, 3.0 + 4.0 + 5.0); +} + +#[test] +fn static_view_sum_transposed_strides_matches_vm() { + // A 2x3 matrix stored row-major (strides [3,1]) transposed to dims [3,2] + // with strides [1,3] -- non-contiguous, so the strided flat_offset path + // is exercised. Data laid out row-major: m[r,c] = data[r*3 + c]. + let data = [11.0, 12.0, 13.0, 21.0, 22.0, 23.0]; + let view = StaticArrayView { + base_off: 0, + is_temp: false, + dims: SmallVec::from_slice(&[3, 2]), + strides: SmallVec::from_slice(&[1, 3]), + offset: 0, + sparse: SmallVec::new(), + dim_ids: SmallVec::from_slice(&[0, 0]), + }; + assert!(!view.to_runtime_view().is_contiguous()); + let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data); + // Sum is order-independent and covers all six cells regardless. 
+ assert_eq!(got, vm_sum(&view, &data)); + assert_eq!(got, 11.0 + 12.0 + 13.0 + 21.0 + 22.0 + 23.0); +} + +#[test] +fn static_view_max_transposed_picks_right_cells() { + // Max over the transposed view must read the same cells the VM reads. + // Make one cell dominate so a mis-addressed read would change the max. + let data = [11.0, 12.0, 99.0, 21.0, 22.0, 23.0]; + let view = StaticArrayView { + base_off: 0, + is_temp: false, + dims: SmallVec::from_slice(&[3, 2]), + strides: SmallVec::from_slice(&[1, 3]), + offset: 0, + sparse: SmallVec::new(), + dim_ids: SmallVec::from_slice(&[0, 0]), + }; + let got = run_static_reduce(view, Opcode::ArrayMax {}, &data); + assert_eq!(got, 99.0); +} + +#[test] +fn static_view_sum_sparse_matches_vm() { + // A sparse (star-range) view selecting elements at parent offsets [0, 2] + // of a 4-element array: dims=[2], a RuntimeSparseMapping mapping view + // index 0->parent 0, 1->parent 2. Elements are data[0], data[2]. + let data = [5.0, 6.0, 7.0, 8.0]; + let view = StaticArrayView { + base_off: 0, + is_temp: false, + dims: SmallVec::from_slice(&[2]), + strides: SmallVec::from_slice(&[1]), + offset: 0, + sparse: smallvec::smallvec![RuntimeSparseMapping { + dim_index: 0, + parent_offsets: SmallVec::from_slice(&[0, 2]), + }], + dim_ids: SmallVec::from_slice(&[0]), + }; + let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data); + assert_eq!(got, vm_sum(&view, &data)); + assert_eq!(got, 5.0 + 7.0); +} + +#[test] +fn static_temp_view_sum_reads_temp_storage() { + // A contiguous temp view (is_temp) reads temp_storage, not curr. temp_id + // 0 lives at temp_offsets[0]=0, so its slot 0 is byte TEMP_BASE. 
+ let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 3); + let view = StaticArrayView { + base_off: 0, // temp_id 0 + is_temp: true, + dims: SmallVec::from_slice(&[3]), + strides: SmallVec::from_slice(&[1]), + offset: 0, + sparse: SmallVec::new(), + dim_ids: SmallVec::from_slice(&[0]), + }; + let view_id = context.add_static_view(view); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::PushStaticView { view_id }, + Opcode::ArraySum {}, + Opcode::PopView {}, + ]; + // Seed curr slots 0..3 with decoys and temp_storage with the real data; + // a read from the wrong region would pick up the decoys. + let mut seed = seed_run(0, &[100.0, 200.0, 300.0]); + seed.extend(seed_run(u64::from(TEMP_BASE), &[2.0, 3.0, 4.0])); + let got = run(&bc(vec![], code), &ctx, true, 0, &seed, None); + assert_eq!(got, 9.0, "temp view must read temp_storage, not curr"); +} + +#[test] +fn static_temp_view_honors_temp_offset() { + // temp_id 1 lives at temp_offsets[1]=4, so its slot 0 is byte + // TEMP_BASE + 4*8. A reducer over it must skip temp 0's slots. + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0, 4], 6); + let view = StaticArrayView { + base_off: 1, // temp_id 1 + is_temp: true, + dims: SmallVec::from_slice(&[2]), + strides: SmallVec::from_slice(&[1]), + offset: 0, + sparse: SmallVec::new(), + dim_ids: SmallVec::from_slice(&[0]), + }; + let view_id = context.add_static_view(view); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::PushStaticView { view_id }, + Opcode::ArraySum {}, + Opcode::PopView {}, + ]; + // temp_storage: [t0_0, t0_1, t0_2, t0_3, t1_0, t1_1] = [9,9,9,9, 2, 5]. 
+ let seed = seed_run(u64::from(TEMP_BASE), &[9.0, 9.0, 9.0, 9.0, 2.0, 5.0]); + let got = run(&bc(vec![], code), &ctx, true, 0, &seed, None); + assert_eq!(got, 7.0, "temp view must start at temp_offsets[temp_id]"); +} + +// ── Task 1: view transform opcodes (mirror RuntimeView::apply_*) ────── +// +// Build a full var view with PushVarView, apply one transform, reduce, and +// compare to the VM's RuntimeView with the same transform applied. These are +// the opcodes production codegen bakes into a single PushStaticView, so they +// are exercised here directly to pin each `apply_*` against the VM. + +/// A `ByteCodeContext` with a single dimension of `size` (DimId 0) and a +/// dim-list `[DimId 0]` (DimListId 0) for a 1-D `PushVarView`. +fn ctx_one_dim(size: u16) -> ByteCodeContext { + let mut context = ByteCodeContext::default(); + let name_id = context.intern_name("D"); + context.add_dimension(DimensionInfo::indexed(name_id, size)); + context.add_dim_list(1, [0, 0, 0, 0]); + context +} + +/// Run `PushVarView(base 0, dims) ; ; ; PopView` and +/// also build the VM `RuntimeView` the same way for the addressing oracle. +fn run_var_view_reduce( + context: &ByteCodeContext, + transforms: &[Opcode], + reduce: Opcode, + data: &[f64], +) -> f64 { + let ctx = ctx_with_arrays(context); + let mut code = vec![Opcode::PushVarView { + base_off: 0, + dim_list_id: 0, + }]; + code.extend_from_slice(transforms); + code.push(reduce); + code.push(Opcode::PopView {}); + run(&bc(vec![], code), &ctx, true, 0, &seed_run(0, data), None) +} + +#[test] +fn view_subscript_const_drops_dim_matches_vm() { + // A 2x3 matrix; subscript dim 0 to index 1 (0-based) -> row 1: cells + // data[3], data[4], data[5]. Mirror with RuntimeView. 
+ let mut context = ByteCodeContext::default(); + let name_d = context.intern_name("D"); + context.add_dimension(DimensionInfo::indexed(name_d, 2)); + let name_e = context.intern_name("E"); + context.add_dimension(DimensionInfo::indexed(name_e, 3)); + context.add_dim_list(2, [0, 1, 0, 0]); // [DimId 0 (size2), DimId 1 (size3)] + let data = [11.0, 12.0, 13.0, 21.0, 22.0, 23.0]; + + let got = run_var_view_reduce( + &context, + &[Opcode::ViewSubscriptConst { + dim_idx: 0, + index: 1, + }], + Opcode::ArraySum {}, + &data, + ); + // VM oracle: build the same RuntimeView and apply the same subscript. + let mut rv = RuntimeView::for_var( + 0, + SmallVec::from_slice(&[2, 3]), + SmallVec::from_slice(&[0, 1]), + ); + rv.apply_single_subscript(0, 1); + let want: f64 = (0..rv.size()) + .map(|i| { + let n = rv.dims.len(); + let mut idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; n]; + let mut rem = i; + for d in (0..n).rev() { + idx[d] = (rem % rv.dims[d] as usize) as u16; + rem /= rv.dims[d] as usize; + } + data[rv.base_off as usize + rv.flat_offset(&idx)] + }) + .sum(); + assert_eq!(got, want); + assert_eq!(got, 21.0 + 22.0 + 23.0); +} + +#[test] +fn view_range_matches_vm() { + // 1-D dim of 5; ViewRange [1:4) keeps indices 1,2,3 -> data[1..4]. + let context = ctx_one_dim(5); + let data = [1.0, 2.0, 3.0, 4.0, 5.0]; + let got = run_var_view_reduce( + &context, + &[Opcode::ViewRange { + dim_idx: 0, + start: 1, + end: 4, + }], + Opcode::ArraySum {}, + &data, + ); + assert_eq!(got, 2.0 + 3.0 + 4.0); +} + +#[test] +fn view_wildcard_is_noop() { + // ViewWildcard leaves the dimension as-is: the sum is the full array. 
+ let context = ctx_one_dim(4); + let data = [1.0, 2.0, 3.0, 4.0]; + let got = run_var_view_reduce( + &context, + &[Opcode::ViewWildcard { dim_idx: 0 }], + Opcode::ArraySum {}, + &data, + ); + assert_eq!(got, 10.0); +} + +#[test] +fn view_transpose_then_reduce_matches_vm() { + // 2x3 matrix; transpose to 3x2 then sum (order-independent but exercises + // the stride/dim reversal addressing). + let mut context = ByteCodeContext::default(); + let name_d = context.intern_name("D"); + context.add_dimension(DimensionInfo::indexed(name_d, 2)); + let name_e = context.intern_name("E"); + context.add_dimension(DimensionInfo::indexed(name_e, 3)); + context.add_dim_list(2, [0, 1, 0, 0]); + let data = [11.0, 12.0, 13.0, 21.0, 22.0, 23.0]; + let got = run_var_view_reduce( + &context, + &[Opcode::ViewTranspose {}], + Opcode::ArraySum {}, + &data, + ); + assert_eq!(got, 11.0 + 12.0 + 13.0 + 21.0 + 22.0 + 23.0); +} + +#[test] +fn view_star_range_sparse_matches_vm() { + // A 1-D parent dim of 4; a star-range via a subdim relation selecting + // parent offsets [1, 3] -> sum of data[1] + data[3]. + let mut context = ByteCodeContext::default(); + let name_p = context.intern_name("P"); + context.add_dimension(DimensionInfo::indexed(name_p, 4)); + let name_s = context.intern_name("S"); + context.add_dimension(DimensionInfo::indexed(name_s, 2)); // child dim + context.add_dim_list(1, [0, 0, 0, 0]); // parent dim list + context.add_subdim_relation(SubdimensionRelation::sparse( + 0, + 1, + SmallVec::from_slice(&[1, 3]), + )); + let data = [5.0, 6.0, 7.0, 8.0]; + let got = run_var_view_reduce( + &context, + &[Opcode::ViewStarRange { + dim_idx: 0, + subdim_relation_id: 0, + }], + Opcode::ArraySum {}, + &data, + ); + assert_eq!(got, 6.0 + 8.0); +} + +#[test] +fn dup_view_then_reduce_matches_single() { + // DupView duplicates the top descriptor; reducing the dup gives the same + // result as reducing the original (and the original stays on the stack). 
+ let context = ctx_one_dim(3); + let data = [2.0, 3.0, 5.0]; + let got = run_var_view_reduce(&context, &[Opcode::DupView {}], Opcode::ArraySum {}, &data); + assert_eq!(got, 10.0); + // The duplicate must leave the stack balanced for the trailing PopView; + // a second PopView would underflow, so add one more here to drain the + // dup and confirm both pops succeed. + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::PushVarView { + base_off: 0, + dim_list_id: 0, + }, + Opcode::DupView {}, + Opcode::ArraySum {}, + Opcode::PopView {}, // pop dup + Opcode::PopView {}, // pop original + ]; + let got2 = run(&bc(vec![], code), &ctx, true, 0, &seed_run(0, &data), None); + assert_eq!(got2, 10.0); +} + +// ── Task 2: each reducer vs an explicit VM-mirrored oracle ──────────── + +/// Sum/Max/Min/Mean/Stddev/Size oracle over a contiguous element slice, +/// mirroring the VM's per-reducer arms (`vm.rs:2216-2309`) exactly. +fn reducer_oracle(op: &Opcode, elems: &[f64]) -> f64 { + let size = elems.len(); + match op { + Opcode::ArraySum {} => elems.iter().sum(), + Opcode::ArraySize {} => size as f64, + _ if size == 0 => f64::NAN, + Opcode::ArrayMax {} => elems + .iter() + .copied() + .fold(f64::NEG_INFINITY, |a, v| if v > a { v } else { a }), + Opcode::ArrayMin {} => elems + .iter() + .copied() + .fold(f64::INFINITY, |a, v| if v < a { v } else { a }), + Opcode::ArrayMean {} => elems.iter().sum::() / size as f64, + Opcode::ArrayStddev {} => { + let mean = elems.iter().sum::() / size as f64; + let var = elems.iter().map(|v| (v - mean).powf(2.0)).sum::() / size as f64; + var.sqrt() + } + _ => unreachable!(), + } +} + +fn assert_reducer_matches(op: Opcode, elems: &[f64]) { + // A bare contiguous 1-D static view over the data. 
+ let data: Vec = elems.to_vec(); + let view = dense_view(0, &[elems.len() as u16]); + let got = run_static_reduce(view, op, &data); + let want = reducer_oracle(&op, elems); + if want.is_nan() { + assert!(got.is_nan(), "{}: expected NaN, got {got}", op.name()); + } else { + assert!( + (got - want).abs() < 1e-12, + "{}: got {got}, want {want}", + op.name() + ); + } +} + +#[test] +fn reducer_sum_matches_vm() { + assert_reducer_matches(Opcode::ArraySum {}, &[1.0, 2.0, 3.0, 4.5]); +} + +#[test] +fn reducer_max_matches_vm() { + assert_reducer_matches(Opcode::ArrayMax {}, &[3.0, -1.0, 7.5, 2.0]); + // Negative-only set: max stays negative (init NEG_INFINITY never wins). + assert_reducer_matches(Opcode::ArrayMax {}, &[-5.0, -2.0, -9.0]); +} + +#[test] +fn reducer_min_matches_vm() { + assert_reducer_matches(Opcode::ArrayMin {}, &[3.0, -1.0, 7.5, 2.0]); + assert_reducer_matches(Opcode::ArrayMin {}, &[5.0, 2.0, 9.0]); +} + +#[test] +fn reducer_mean_matches_vm() { + assert_reducer_matches(Opcode::ArrayMean {}, &[2.0, 4.0, 6.0]); + assert_reducer_matches(Opcode::ArrayMean {}, &[1.0, 2.0]); +} + +#[test] +fn reducer_stddev_matches_vm_population_variance() { + // Population variance (divisor N): for [2,4,4,4,5,5,7,9] the population + // stddev is exactly 2.0 -- a value check, not just parity, pinning the + // divisor-N (not N-1) choice that matches `vm.rs::ArrayStddev`. + let elems = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]; + assert_reducer_matches(Opcode::ArrayStddev {}, &elems); + let view = dense_view(0, &[elems.len() as u16]); + let got = run_static_reduce(view, Opcode::ArrayStddev {}, &elems); + assert!( + (got - 2.0).abs() < 1e-12, + "population stddev should be 2.0, got {got}" + ); +} + +#[test] +fn reducer_size_matches_vm() { + assert_reducer_matches(Opcode::ArraySize {}, &[1.0, 2.0, 3.0]); +} + +#[test] +fn reducer_size_multidim_is_product() { + // SIZE over a 2x3 view is 6, regardless of the data. 
+ let data = [0.0; 6]; + let view = dense_view(0, &[2, 3]); + let got = run_static_reduce(view, Opcode::ArraySize {}, &data); + assert_eq!(got, 6.0); +} + +// ── Task 2: empty-but-valid view asymmetry (AC1.5) ──────────────────── + +/// An empty-but-valid view: a `[start:start)` range collapses dim 0 to size +/// 0 (`apply_range_checked`), valid with zero elements. Built as a static +/// view with a zero-size dimension. +fn empty_static_view() -> StaticArrayView { + StaticArrayView { + base_off: 0, + is_temp: false, + dims: SmallVec::from_slice(&[0]), + strides: SmallVec::from_slice(&[1]), + offset: 0, + sparse: SmallVec::new(), + dim_ids: SmallVec::from_slice(&[0]), + } +} + +#[test] +fn empty_valid_view_sum_is_zero() { + // ArraySum over an empty-but-valid view is the additive identity 0.0 + // (`vm.rs:2216`), NOT NaN. + let got = run_static_reduce(empty_static_view(), Opcode::ArraySum {}, &[1.0]); + assert_eq!(got, 0.0); +} + +#[test] +fn empty_valid_view_max_min_mean_stddev_are_nan() { + for op in [ + Opcode::ArrayMax {}, + Opcode::ArrayMin {}, + Opcode::ArrayMean {}, + Opcode::ArrayStddev {}, + ] { + let got = run_static_reduce(empty_static_view(), op, &[1.0]); + assert!( + got.is_nan(), + "{}: empty-but-valid view must be NaN", + op.name() + ); + } +} + +#[test] +fn empty_valid_view_size_is_zero() { + let got = run_static_reduce(empty_static_view(), Opcode::ArraySize {}, &[1.0]); + assert_eq!(got, 0.0); +} + +// ── Task 2: invalid view -> NaN for ALL reducers (AC1.5) ────────────── +// +// A static view is always valid (`valid_local` is None), so an invalid view +// is modeled by directly setting `valid_local` to a wasm i32 local seeded to +// 0 -- mirroring what Task 4's out-of-bounds dynamic subscript will produce. +// Every reducer (including ArraySum) must yield NaN, matching `reduce_view`'s +// leading `if !is_valid { return NaN }`. 
+ +/// Run a reducer over a contiguous static view whose `valid_local` is forced +/// to an i32 local pre-set to 0 (invalid). The harness function reserves the +/// three Apply f64 scratch locals; we add one i32 local after them for the +/// validity flag and initialize it to 0 in the emitted prologue. +fn run_invalid_view_reduce(reduce: Opcode) -> f64 { + let mut context = ByteCodeContext::default(); + // Contiguous 1-D view over 3 curr slots; geometry is valid, but the + // view is flagged invalid. + let view = dense_view(0, &[3]); + let view_id = context.add_static_view(view); + + // Build a custom module: the opcode function declares an extra i32 local + // (index after the standard opcode-fn locals) for the validity flag, + // seeded to 0. We mark the descriptor invalid by post-processing is out + // of reach here, so instead emit the program through a small shim that + // sets `valid_local` on the pushed descriptor. + // + // Simpler: emit PushStaticView, then a hand-rolled reduce over a desc + // with valid_local set, by calling emit_array_reduce directly. + let ctx = EmitCtx { + temp_storage_base: TEMP_BASE, + ctx: &context, + ..ctx_with_cond_depth(0) + }; + + // The validity i32 local index: it is the first index past every standard + // opcode-fn local (the scratch f64, the cond i32s, the Apply f64s, and the + // Phase-6 vector-op f64/i32 scratch blocks), i.e. exactly where the + // dynamic-subscript "extra i32" locals begin. The shim below pushes a single + // i32 local at that index for the validity flag. 
+ let valid_local = extra_i32_local_base(0, 0); + + let mut module = Module::new(); + let helpers = build_helpers(); + let n_helpers = helpers.functions.len() as u32; + let mut types = TypeSection::new(); + types.ty().function([ValType::I32], [ValType::F64]); // eval -> f64 + for hf in &helpers.functions { + types.ty().function(hf.params.clone(), hf.results.clone()); + } + module.section(&types); + let mut functions = FunctionSection::new(); + for (i, _) in helpers.functions.iter().enumerate() { + functions.function(1 + i as u32); + } + functions.function(0); + module.section(&functions); + let mut memories = MemorySection::new(); + memories.memory(MemoryType { + minimum: 1, + maximum: None, + memory64: false, + shared: false, + page_size_log2: None, + }); + module.section(&memories); + let mut exports = ExportSection::new(); + exports.export("eval", ExportKind::Func, n_helpers); + exports.export("mem", ExportKind::Memory, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + for hf in helpers.functions { + code.function(&hf.body); + } + // opcode-fn locals plus one extra i32 for the validity flag. + let mut locals = opcode_fn_locals(0, 0, 0); + locals.push((1, ValType::I32)); + let mut func = Function::new(locals); + // valid_local = 0 (invalid). + func.instruction(&Instruction::I32Const(0)); + func.instruction(&Instruction::LocalSet(valid_local)); + // Reduce over a desc built from the registered static view, but with its + // `valid_local` forced to the (zero-seeded) validity flag -- exactly the + // shape Task 4's out-of-bounds dynamic subscript will produce. 
+ let mut desc = ViewDesc::from_static(ctx.ctx.get_static_view(view_id).unwrap()); + desc.valid_local = Some(valid_local); + emit_array_reduce(&reduce, &desc, &ctx, &mut func).expect("reduce lowers"); + func.instruction(&Instruction::End); + code.function(&func); + module.section(&code); + + let bytes = module.finish(); + let info = validate(&bytes).expect("invalid-view module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + // Seed the curr slots so a (wrongly) valid read would produce a finite + // value -- making the NaN assertion meaningful. + let mem = store + .instance_export(inst, "mem") + .unwrap() + .as_mem() + .unwrap(); + store.mem_access_mut_slice(mem, |b| { + for (i, v) in [1.0f64, 2.0, 3.0].iter().enumerate() { + let a = i * 8; + b[a..a + 8].copy_from_slice(&v.to_le_bytes()); + } + }); + let eval = store + .instance_export(inst, "eval") + .unwrap() + .as_func() + .unwrap(); + store.invoke_simple_typed(eval, (0_i32,)).expect("invoke") +} + +#[test] +fn invalid_view_all_reducers_are_nan() { + // Every reducer over an invalid view is NaN -- including ArraySum, whose + // empty-but-valid result is 0.0 but whose invalid-view result is NaN. + for op in [ + Opcode::ArraySum {}, + Opcode::ArrayMax {}, + Opcode::ArrayMin {}, + Opcode::ArrayMean {}, + Opcode::ArrayStddev {}, + ] { + let got = run_invalid_view_reduce(op); + assert!( + got.is_nan(), + "{}: an invalid view must reduce to NaN, got {got}", + op.name() + ); + } +} + +#[test] +fn invalid_view_size_is_still_the_size() { + // ArraySize is defined regardless of validity (`vm.rs:2306` reads + // `view.size()` with no validity gate), so an invalid 3-element view + // still reports size 3. 
+ let got = run_invalid_view_reduce(Opcode::ArraySize {}); + assert_eq!(got, 3.0); +} + +// ════════════════════════════════════════════════════════════════════════ +// Phase 5 Task 3: iteration loops (BeginIter..EndIter) + broadcast +// +// The body span between `BeginIter` and `NextIterOrJump` is fully unrolled +// over the compile-time `size()`, so each iteration's reads/writes are +// emitted at constant addresses (mirroring the array reducer's unrolled fold +// and the VM element-for-element). These hand-build the canonical codegen +// shape (`PushStaticView(out); BeginIter; PushStaticView(src); <body>; +// NextIterOrJump; EndIter; PopView; ...`) and run it under DLR-FT, reading +// the written temp slots back and comparing to a VM-mirrored oracle. +// ════════════════════════════════════════════════════════════════════════ + +/// A contiguous temp `StaticArrayView` over `dims` at `temp_id`. +fn temp_view(temp_id: u32, dims: &[u16]) -> StaticArrayView { + let mut v = dense_view(temp_id, dims); + v.is_temp = true; + v +} + +/// A contiguous `StaticArrayView` at `base_off` carrying explicit `dim_ids` +/// (for the broadcast-matching tests). Unlike `temp_view`, it does not set +/// `is_temp`. +fn dense_view_ids(base_off: u32, dims: &[u16], dim_ids: &[u16]) -> StaticArrayView { + let mut v = dense_view(base_off, dims); + v.dim_ids = dim_ids.iter().copied().collect(); + v +} + +/// Read `count` temp slots (starting at temp slot 0) back after running a +/// temp-writing program. The temp region base is `TEMP_BASE`.
+fn run_and_read_temps( + context: &ByteCodeContext, + code: Vec, + literals: Vec, + seed: &[(u64, f64)], + count: usize, +) -> Vec { + let ctx = ctx_with_arrays(context); + let bytes = build_module(&bc(literals, code), &ctx, false, 0); + let info = validate(&bytes).expect("emitted module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + if !seed.is_empty() { + let mem = store + .instance_export(inst, "mem") + .unwrap() + .as_mem() + .unwrap(); + store.mem_access_mut_slice(mem, |b| { + for &(addr, v) in seed { + let a = addr as usize; + b[a..a + 8].copy_from_slice(&v.to_le_bytes()); + } + }); + } + let eval = store + .instance_export(inst, "eval") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,)) + .expect("invoke"); + let mem = store + .instance_export(inst, "mem") + .unwrap() + .as_mem() + .unwrap(); + store.mem_access_mut_slice(mem, |b| { + (0..count) + .map(|i| { + let a = TEMP_BASE as usize + i * 8; + f64::from_le_bytes(b[a..a + 8].try_into().unwrap()) + }) + .collect() + }) +} + +#[test] +fn iter_loop_elementwise_writes_temp_like_vm() { + // out_temp[i] = source[i] * 2 over a 4-element source in curr, written to + // temp 0. Mirrors the codegen shape: output temp view drives iteration, + // the source view is pushed inside, read via LoadIterViewAt{1}. 
+ let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 4); // temp 0 spans 4 slots + let out_view = context.add_static_view(temp_view(0, &[4])); + let src_view = context.add_static_view(dense_view(0, &[4])); + let code = vec![ + Opcode::PushStaticView { view_id: out_view }, + Opcode::BeginIter { + write_temp_id: 0, + has_write_temp: true, + }, + Opcode::PushStaticView { view_id: src_view }, + Opcode::LoadIterViewAt { offset: 1 }, + Opcode::LoadConstant { id: 0 }, + Opcode::Op2 { op: Op2::Mul }, + Opcode::StoreIterElement {}, + Opcode::NextIterOrJump { jump_back: -4 }, + Opcode::EndIter {}, + Opcode::PopView {}, + Opcode::PopView {}, + ]; + // source = [10, 20, 30, 40] in curr slots 0..4. + let seed = seed_run(0, &[10.0, 20.0, 30.0, 40.0]); + let temps = run_and_read_temps(&context, code, vec![2.0], &seed, 4); + assert_eq!(temps, vec![20.0, 40.0, 60.0, 80.0]); +} + +#[test] +fn iter_loop_load_iter_element_reads_captured_view() { + // temp[i] = iter_view[i] + 5, in place (the captured view *is* the + // iteration view). The pushed view is the OUTPUT temp itself: the temp is + // seeded, BeginIter captures the pushed temp view, LoadIterElement reads + // the current element, and StoreIterElement writes the *same* temp's + // slots (write_temp == source) -- a degenerate but faithful + // LoadIterElement check. + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 3); + let src = context.add_static_view(temp_view(0, &[3])); + let code = vec![ + Opcode::PushStaticView { view_id: src }, + Opcode::BeginIter { + write_temp_id: 0, + has_write_temp: true, + }, + Opcode::LoadIterElement {}, + Opcode::LoadConstant { id: 0 }, + Opcode::Op2 { op: Op2::Add }, + Opcode::StoreIterElement {}, + Opcode::NextIterOrJump { jump_back: -4 }, + Opcode::EndIter {}, + Opcode::PopView {}, + ]; + // temp 0 = [1, 2, 3]; each += 5 in place -> [6, 7, 8].
+ let seed = seed_run(u64::from(TEMP_BASE), &[1.0, 2.0, 3.0]); + let temps = run_and_read_temps(&context, code, vec![5.0], &seed, 3); + assert_eq!(temps, vec![6.0, 7.0, 8.0]); +} + +#[test] +fn iter_loop_load_iter_temp_element_reads_temp() { + // out_temp1[i] = temp0[i] + 100, reading temp0 via LoadIterTempElement and + // writing temp1. temp_offsets = [0, 3]: temp0 in slots 0..3, temp1 in 3..6. + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0, 3], 6); + let out_view = context.add_static_view(temp_view(1, &[3])); // temp 1 + let code = vec![ + Opcode::PushStaticView { view_id: out_view }, + Opcode::BeginIter { + write_temp_id: 1, + has_write_temp: true, + }, + Opcode::LoadIterTempElement { temp_id: 0 }, + Opcode::LoadConstant { id: 0 }, + Opcode::Op2 { op: Op2::Add }, + Opcode::StoreIterElement {}, + Opcode::NextIterOrJump { jump_back: -4 }, + Opcode::EndIter {}, + Opcode::PopView {}, + ]; + // temp0 = [7, 8, 9] in slots 0..3. + let seed = seed_run(u64::from(TEMP_BASE), &[7.0, 8.0, 9.0]); + // Read 6 temp slots: temp1 is slots 3..6. + let temps = run_and_read_temps(&context, code, vec![100.0], &seed, 6); + assert_eq!(&temps[3..6], &[107.0, 108.0, 109.0]); +} + +#[test] +fn iter_loop_broadcast_smaller_source_matches_vm() { + // out_temp[A,B] = mat[A,B] + vec[A]: the iteration view is 2-D [A(2),B(3)] + // (dim_ids [0,1]); `vec` is 1-D [A(2)] (dim_id 0), broadcast along B. This + // exercises the `LoadIterViewAt` broadcast path (source dims != iter + // dims), which production codegen does not currently emit but the VM + // supports. Cross-checked element-for-element against the VM's broadcast. + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 6); // out temp [2,3] + // Two indexed dims so match_dimensions_two_pass can resolve is_indexed. 
+ let na = context.intern_name("A"); + context.add_dimension(DimensionInfo::indexed(na, 2)); // id 0 + let nb = context.intern_name("B"); + context.add_dimension(DimensionInfo::indexed(nb, 3)); // id 1 + + let out_view = context.add_static_view({ + let mut v = temp_view(0, &[2, 3]); + v.dim_ids = SmallVec::from_slice(&[0, 1]); + v + }); + // mat in curr slots 0..6 (dims [2,3], dim_ids [0,1]). + let mat = context.add_static_view(dense_view_ids(0, &[2, 3], &[0, 1])); + // vec in curr slots 6..8 (dims [2], dim_id 0). + let vec_v = context.add_static_view(dense_view_ids(6, &[2], &[0])); + let code = vec![ + Opcode::PushStaticView { view_id: out_view }, + Opcode::BeginIter { + write_temp_id: 0, + has_write_temp: true, + }, + Opcode::PushStaticView { view_id: mat }, // offset 2 after vec is pushed + Opcode::PushStaticView { view_id: vec_v }, // offset 1 + Opcode::LoadIterViewAt { offset: 2 }, // mat[A,B] + Opcode::LoadIterViewAt { offset: 1 }, // vec[A] broadcast over B + Opcode::Op2 { op: Op2::Add }, + Opcode::StoreIterElement {}, + Opcode::NextIterOrJump { jump_back: -5 }, + Opcode::EndIter {}, + Opcode::PopView {}, + Opcode::PopView {}, + Opcode::PopView {}, + ]; + // mat[a,b] = a*10 + b -> [0,1,2, 10,11,12]; vec[a] = a -> [0, 1]. + let mut seed = seed_run(0, &[0.0, 1.0, 2.0, 10.0, 11.0, 12.0]); + seed.extend(seed_run(6 * 8, &[0.0, 1.0])); + let temps = run_and_read_temps(&context, code, vec![], &seed, 6); + // out[a,b] = mat[a,b] + vec[a]. + let expected = [ + 0.0 + 0.0, + 1.0 + 0.0, + 2.0 + 0.0, + 10.0 + 1.0, + 11.0 + 1.0, + 12.0 + 1.0, + ]; + assert_eq!(temps, expected); +} + +#[test] +fn iter_loop_smaller_source_same_shape_writes_nan() { + // The iteration is over 4 elements but the source view (same dim_ids) has + // only 3: the VM's `LoadIterViewTop`/`LoadIterViewAt` fast path returns + // NaN past the source size (`vm.rs:1972`). Element 3 must be NaN. 
+ let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 4); + let out_view = context.add_static_view(temp_view(0, &[4])); + let src = context.add_static_view(dense_view(0, &[3])); // shorter + let code = vec![ + Opcode::PushStaticView { view_id: out_view }, + Opcode::BeginIter { + write_temp_id: 0, + has_write_temp: true, + }, + Opcode::PushStaticView { view_id: src }, + Opcode::LoadIterViewAt { offset: 1 }, + Opcode::StoreIterElement {}, + Opcode::NextIterOrJump { jump_back: -3 }, + Opcode::EndIter {}, + Opcode::PopView {}, + Opcode::PopView {}, + ]; + let seed = seed_run(0, &[5.0, 6.0, 7.0]); + let temps = run_and_read_temps(&context, code, vec![], &seed, 4); + assert_eq!(&temps[0..3], &[5.0, 6.0, 7.0]); + assert!( + temps[3].is_nan(), + "element past the source size must be NaN" + ); +} + +#[test] +fn iter_loop_then_reduce_dotprod_matches_vm() { + // The full SUM(a[*]*b[*]) shape: hoist a[i]*b[i] into a temp via BeginIter, + // then ArraySum the temp. a in curr 0..4, b in curr 4..8, temp 0. 
+ let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 4); + let out_view = context.add_static_view(temp_view(0, &[4])); + let a = context.add_static_view(dense_view(0, &[4])); + let b = context.add_static_view(dense_view(4, &[4])); + let temp_read = context.add_static_view(temp_view(0, &[4])); + let code = vec![ + Opcode::PushStaticView { view_id: out_view }, + Opcode::BeginIter { + write_temp_id: 0, + has_write_temp: true, + }, + Opcode::PushStaticView { view_id: a }, // offset 2 after b + Opcode::PushStaticView { view_id: b }, // offset 1 + Opcode::LoadIterViewAt { offset: 2 }, + Opcode::LoadIterViewAt { offset: 1 }, + Opcode::Op2 { op: Op2::Mul }, + Opcode::StoreIterElement {}, + Opcode::NextIterOrJump { jump_back: -5 }, + Opcode::EndIter {}, + Opcode::PopView {}, + Opcode::PopView {}, + Opcode::PopView {}, + Opcode::PushStaticView { view_id: temp_read }, + Opcode::ArraySum {}, + Opcode::PopView {}, + ]; + // a = [1,2,3,4], b = [10,20,30,40] -> dot = 10+40+90+160 = 300. + let mut seed = seed_run(0, &[1.0, 2.0, 3.0, 4.0]); + seed.extend(seed_run(4 * 8, &[10.0, 20.0, 30.0, 40.0])); + let ctx = ctx_with_arrays(&context); + let got = run(&bc(vec![], code), &ctx, true, 0, &seed, None); + assert_eq!(got, 300.0); +} + +#[test] +fn iter_loop_zero_size_writes_nothing() { + // An empty iteration view (size 0): the unroller emits zero body copies, + // so the temp keeps its seeded value (no write). A trailing reducer over + // the empty output is 0 for SUM. 
+ let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 1); + let out_view = context.add_static_view({ + let mut v = temp_view(0, &[0]); // zero-size dim + v.dims = SmallVec::from_slice(&[0]); + v + }); + let code = vec![ + Opcode::PushStaticView { view_id: out_view }, + Opcode::BeginIter { + write_temp_id: 0, + has_write_temp: true, + }, + Opcode::LoadIterElement {}, + Opcode::StoreIterElement {}, + Opcode::NextIterOrJump { jump_back: -2 }, + Opcode::EndIter {}, + Opcode::PopView {}, + ]; + // Seed temp slot 0 with a sentinel; the empty loop must not touch it. + let seed = seed_run(u64::from(TEMP_BASE), &[42.0]); + let temps = run_and_read_temps(&context, code, vec![], &seed, 1); + assert_eq!(temps, vec![42.0], "an empty iteration writes nothing"); +} + +// ── Broadcast iteration family (BeginBroadcastIter..EndBroadcastIter) ── +// +// Not emitted by current codegen, but lowered for completeness and pinned +// against the VM's `BeginBroadcastIter`/`LoadBroadcastElement` arms +// (`vm.rs:2314-2421`) here. The result geometry is the union of the source +// dim_ids; a 2-D and a 1-D source broadcast into the 2-D result. + +#[test] +fn broadcast_iter_unions_dims_like_vm() { + // dest[A,B] = mat[A,B] * vec[A]: BeginBroadcastIter with two sources + // (mat 2-D dim_ids [0,1], vec 1-D dim_id 0). The result unions to + // dim_ids [0,1] (dims [2,3]); vec broadcasts along B. + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 6); + let na = context.intern_name("A"); + context.add_dimension(DimensionInfo::indexed(na, 2)); // id 0 + let nb = context.intern_name("B"); + context.add_dimension(DimensionInfo::indexed(nb, 3)); // id 1 + let mat = context.add_static_view(dense_view_ids(0, &[2, 3], &[0, 1])); + let vec_v = context.add_static_view(dense_view_ids(6, &[2], &[0])); + let code = vec![ + // Push the two sources (deepest-first): mat then vec. 
+ Opcode::PushStaticView { view_id: mat }, + Opcode::PushStaticView { view_id: vec_v }, + Opcode::BeginBroadcastIter { + n_sources: 2, + dest_temp_id: 0, + }, + Opcode::LoadBroadcastElement { source_idx: 0 }, // mat + Opcode::LoadBroadcastElement { source_idx: 1 }, // vec + Opcode::Op2 { op: Op2::Mul }, + Opcode::StoreBroadcastElement {}, + Opcode::NextBroadcastOrJump { jump_back: -4 }, + Opcode::EndBroadcastIter {}, + Opcode::PopView {}, + Opcode::PopView {}, + ]; + // mat[a,b] = a*10 + b -> [0,1,2, 10,11,12]; vec[a] = a+1 -> [1, 2]. + let mut seed = seed_run(0, &[0.0, 1.0, 2.0, 10.0, 11.0, 12.0]); + seed.extend(seed_run(6 * 8, &[1.0, 2.0])); + let temps = run_and_read_temps(&context, code, vec![], &seed, 6); + // dest[a,b] = mat[a,b] * vec[a]. + let expected = [ + 0.0 * 1.0, + 1.0 * 1.0, + 2.0 * 1.0, + 10.0 * 2.0, + 11.0 * 2.0, + 12.0 * 2.0, + ]; + assert_eq!(temps, expected); +} + +// ════════════════════════════════════════════════════════════════════════ +// Phase 5 Task 4: dynamic subscripts + OOB->NaN +// +// The legacy scalar subscript (`PushSubscriptIndex` / `LoadSubscript`) and +// the view-stack dynamic subscript (`ViewSubscriptDynamic`) both carry a +// runtime offset + validity flag in fresh i32 wasm locals (reserved by +// `count_extra_i32_locals`). An out-of-bounds index clears the validity +// flag, so the read yields NaN -- matching the VM (`vm.rs:1341-1366` for the +// legacy path; `reduce_view`'s `if !is_valid { NaN }` for the view path). +// ════════════════════════════════════════════════════════════════════════ + +/// Run `code` (with `count_extra_i32_locals` reserved) returning the f64 +/// result, with `curr` seeded from `data` (slot 0 = byte 0). The literal pool +/// holds the runtime index value(s). 
+fn run_dyn(code: Vec, literals: Vec, data: &[f64]) -> f64 { + let context = ByteCodeContext::default(); + let ctx = ctx_with_arrays(&context); + run(&bc(literals, code), &ctx, true, 0, &seed_run(0, data), None) +} + +#[test] +fn legacy_subscript_1d_in_range_matches_vm() { + // arr[idx] (idx 1-based) over a 4-element array in curr slots 0..4. + // idx = 3 (1-based) -> 0-based 2 -> data[2]. + let data = [10.0, 20.0, 30.0, 40.0]; + let code = vec![ + Opcode::LoadConstant { id: 0 }, // idx = 3.0 + Opcode::PushSubscriptIndex { bounds: 4 }, + Opcode::LoadSubscript { off: 0 }, + ]; + assert_eq!(run_dyn(code, vec![3.0], &data), 30.0); +} + +#[test] +fn legacy_subscript_oob_is_nan() { + let data = [10.0, 20.0, 30.0, 40.0]; + // idx = 5 > bounds 4 -> invalid -> NaN. + let high = vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::PushSubscriptIndex { bounds: 4 }, + Opcode::LoadSubscript { off: 0 }, + ]; + assert!( + run_dyn(high, vec![5.0], &data).is_nan(), + "idx > bounds -> NaN" + ); + // idx = 0 is invalid in 1-based indexing -> NaN. + let zero = vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::PushSubscriptIndex { bounds: 4 }, + Opcode::LoadSubscript { off: 0 }, + ]; + assert!(run_dyn(zero, vec![0.0], &data).is_nan(), "idx 0 -> NaN"); +} + +#[test] +fn legacy_subscript_off_shifts_base_like_vm() { + // LoadSubscript reads curr[module_off + off + flat]; with off=2 the base + // shifts by 2 slots. arr starts at slot 2; idx=2 (1-based) -> slot 3. + let data = [99.0, 99.0, 100.0, 200.0, 300.0]; + let code = vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::PushSubscriptIndex { bounds: 3 }, + Opcode::LoadSubscript { off: 2 }, + ]; + assert_eq!(run_dyn(code, vec![2.0], &data), 200.0); +} + +#[test] +fn legacy_subscript_2d_fold_matches_vm() { + // arr[i, j] over a [2,3] row-major array in curr slots 0..6. The VM folds + // index = i0*bounds1 + i1 (the running index times the current bound plus + // the current index). 
i=2 (1-based -> 0-based 1), j=3 (1-based -> 0-based + // 2): flat = 1*3 + 2 = 5 -> data[5]. + let data = [0.0, 1.0, 2.0, 10.0, 11.0, 12.0]; + let code = vec![ + Opcode::LoadConstant { id: 0 }, // i = 2.0 + Opcode::PushSubscriptIndex { bounds: 2 }, + Opcode::LoadConstant { id: 1 }, // j = 3.0 + Opcode::PushSubscriptIndex { bounds: 3 }, + Opcode::LoadSubscript { off: 0 }, + ]; + assert_eq!(run_dyn(code, vec![2.0, 3.0], &data), 12.0); +} + +#[test] +fn legacy_subscript_2d_oob_in_either_index_is_nan() { + let data = [0.0, 1.0, 2.0, 10.0, 11.0, 12.0]; + // Second index out of bounds (j=4 > 3) -> NaN even though i is valid. + let code = vec![ + Opcode::LoadConstant { id: 0 }, // i = 1 + Opcode::PushSubscriptIndex { bounds: 2 }, + Opcode::LoadConstant { id: 1 }, // j = 4 (oob) + Opcode::PushSubscriptIndex { bounds: 3 }, + Opcode::LoadSubscript { off: 0 }, + ]; + assert!(run_dyn(code, vec![1.0, 4.0], &data).is_nan()); +} + +#[test] +fn legacy_subscript_floors_fractional_index() { + // The VM does `stack.pop().floor() as u16`; idx 2.9 -> 1-based 2 -> slot 1. + let data = [10.0, 20.0, 30.0]; + let code = vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::PushSubscriptIndex { bounds: 3 }, + Opcode::LoadSubscript { off: 0 }, + ]; + assert_eq!(run_dyn(code, vec![2.9], &data), 20.0); +} + +/// Build a 1-D `PushVarViewDirect` over `dim` slots, apply a dynamic subscript +/// at dim 0 from a constant index, and `ArraySum` the resulting (scalar) view +/// -- the `ViewSubscriptDynamic` end-to-end shape, runnable in isolation. +fn run_view_dyn_subscript(dim: u16, index: f64, data: &[f64]) -> f64 { + let mut context = ByteCodeContext::default(); + // PushVarViewDirect resolves dims from a dim-list of raw sizes. 
+ context.add_dim_list(1, [dim, 0, 0, 0]); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::PushVarViewDirect { + base_off: 0, + dim_list_id: 0, + }, + Opcode::LoadConstant { id: 0 }, // dynamic index + Opcode::ViewSubscriptDynamic { dim_idx: 0 }, + Opcode::ArraySum {}, // sum of the 1-element view (or NaN if invalid) + Opcode::PopView {}, + ]; + run( + &bc(vec![index], code), + &ctx, + true, + 0, + &seed_run(0, data), + None, + ) +} + +#[test] +fn view_subscript_dynamic_in_range_reads_element() { + // arr[idx] reduced: idx = 3 (1-based) -> data[2]; SUM of the 1-element + // view is that element. + let data = [10.0, 20.0, 30.0, 40.0]; + assert_eq!(run_view_dyn_subscript(4, 3.0, &data), 30.0); +} + +#[test] +fn view_subscript_dynamic_oob_is_nan() { + let data = [10.0, 20.0, 30.0, 40.0]; + // idx = 5 > dim 4 -> view invalid -> reducer (even SUM) yields NaN. + assert!( + run_view_dyn_subscript(4, 5.0, &data).is_nan(), + "idx > dim -> invalid view -> NaN" + ); + // idx = 0 invalid (1-based) -> NaN. + assert!( + run_view_dyn_subscript(4, 0.0, &data).is_nan(), + "idx 0 -> invalid view -> NaN" + ); +} + +#[test] +fn view_subscript_dynamic_offset_picks_right_element() { + // Sweep the in-range indices: each picks the matching element. + let data = [5.0, 6.0, 7.0, 8.0, 9.0]; + for (idx_1based, expected) in [(1, 5.0), (2, 6.0), (3, 7.0), (4, 8.0), (5, 9.0)] { + assert_eq!( + run_view_dyn_subscript(5, idx_1based as f64, &data), + expected, + "arr[{idx_1based}] (1-based)" + ); + } +} + +// ── End-to-end: a runtime-OOB dynamic subscript feeding a real reducer ──── +// +// The white-box `run_invalid_view_reduce` above hand-forces `valid_local`; +// this composes the genuine codegen shape -- `mat[oob_row, *]` where `row` is +// a runtime out-of-range index -- so the invalid-view NaN flows from a real +// `ViewSubscriptDynamic` through `emit_array_reduce`'s validity gate, over a +// multi-element (non-degenerate) row, exactly as a model would produce it. 
+ +/// Build a 2-D `mat[rows][cols]` view via `PushVarViewDirect`, dynamically +/// subscript dim 0 with a runtime `row_1based` index (leaving a `cols`-element +/// row view), and reduce that row. The row is invalid iff `row_1based` is out +/// of `1..=rows`. `data` seeds the row-major curr slab (rows*cols slots). +fn run_view_dyn_row_reduce( + rows: u16, + cols: u16, + row_1based: f64, + reduce: Opcode, + data: &[f64], +) -> f64 { + let mut context = ByteCodeContext::default(); + context.add_dim_list(2, [rows, cols, 0, 0]); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::PushVarViewDirect { + base_off: 0, + dim_list_id: 0, + }, + Opcode::LoadConstant { id: 0 }, // runtime row index (1-based) + Opcode::ViewSubscriptDynamic { dim_idx: 0 }, + reduce, + Opcode::PopView {}, + ]; + run( + &bc(vec![row_1based], code), + &ctx, + true, + 0, + &seed_run(0, data), + None, + ) +} + +#[test] +fn view_dyn_oob_row_makes_every_reducer_nan() { + // A 3x4 matrix; row index 5 is out of range (rows = 3). The subscripted + // view spans a real 4-element row, but its validity flag is 0, so EVERY + // reducer -- including ArraySum, whose empty-but-valid result is 0.0 -- + // must yield NaN, matching `reduce_view`'s leading `if !is_valid`. + let data: Vec = (0..12).map(|i| i as f64).collect(); + for op in [ + Opcode::ArraySum {}, + Opcode::ArrayMax {}, + Opcode::ArrayMin {}, + Opcode::ArrayMean {}, + Opcode::ArrayStddev {}, + ] { + let got = run_view_dyn_row_reduce(3, 4, 5.0, op, &data); + assert!( + got.is_nan(), + "{}: an out-of-range dynamic row subscript must reduce to NaN, got {got}", + op.name() + ); + } + // ArraySize is defined regardless of validity: a 4-wide row reports 4. 
+ assert_eq!( + run_view_dyn_row_reduce(3, 4, 5.0, Opcode::ArraySize {}, &data), + 4.0 + ); +} + +#[test] +fn view_dyn_in_range_row_reduces_like_vm() { + // The same shape with an in-range row index reduces the real row, so the + // NaN above is genuinely the validity gate, not a broken reducer. Row 2 + // (1-based) of a 3x4 row-major matrix is slots 4..8 -> [4,5,6,7]. + let data: Vec = (0..12).map(|i| i as f64).collect(); + let row = [4.0f64, 5.0, 6.0, 7.0]; + let sum: f64 = row.iter().sum(); + let mean = sum / row.len() as f64; + let var = row.iter().map(|v| (v - mean) * (v - mean)).sum::() / row.len() as f64; + assert_eq!( + run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArraySum {}, &data), + sum + ); + assert_eq!( + run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArrayMax {}, &data), + 7.0 + ); + assert_eq!( + run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArrayMin {}, &data), + 4.0 + ); + assert_eq!( + run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArrayMean {}, &data), + mean + ); + assert!( + (run_view_dyn_row_reduce(3, 4, 2.0, Opcode::ArrayStddev {}, &data) - var.sqrt()).abs() + < 1e-12 + ); +} + +// ════════════════════════════════════════════════════════════════════════ +// IMPORTANT (review feedback): full-unrolling has a documented size cap. +// +// Reducers, `BeginIter`, and `BeginBroadcastIter` all unroll fully at compile +// time. `EmitState::charge_unroll` bounds the cumulative element count per +// function at `MAX_UNROLL_UNITS`, returning `Unsupported` (so the model falls +// back to the VM) before any oversized body is emitted. These check the cap +// directly via `emit_bytecode`, asserting an over-budget program is rejected +// WITHOUT materializing a giant function, and an under-budget one still emits. +// ════════════════════════════════════════════════════════════════════════ + +/// Lower `bc` into a throwaway function, returning the lowering result. 
Used +/// to assert that an over-budget program is rejected at emit time without +/// running (or even finishing building) the module. +fn lower_only(bc: &ByteCode, ctx: &EmitCtx) -> Result { + let mut func = Function::new(opcode_fn_locals( + 0, + count_extra_i32_locals(bc), + count_module_input_scratch(bc), + )); + emit_bytecode(bc, ctx, &mut func)?; + func.instruction(&Instruction::End); + Ok(func) +} + +#[test] +fn reducer_over_view_exceeding_cap_is_unsupported() { + // A single static view whose element count exceeds MAX_UNROLL_UNITS. Two + // u16 dims (300 x 300 = 90_000 > 65_536) overflow the budget; the cap is + // checked before the fold, so lowering returns Unsupported with no + // emitted body. The fixture itself is tiny -- proving we reject rather + // than emit a multi-megabyte function. + let mut context = ByteCodeContext::default(); + let view_id = context.add_static_view(dense_view(0, &[300, 300])); + assert!(dense_view(0, &[300, 300]).to_runtime_view().size() > MAX_UNROLL_UNITS); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::PushStaticView { view_id }, + Opcode::ArraySum {}, + Opcode::PopView {}, + ]; + match lower_only(&bc(vec![], code), &ctx) { + Err(WasmGenError::Unsupported(msg)) => assert!( + msg.contains("unrolling exceeds"), + "expected the unroll-budget message, got: {msg}" + ), + Ok(_) => panic!("a reducer over a view larger than the cap must be Unsupported"), + } +} + +#[test] +fn iteration_over_view_exceeding_cap_is_unsupported() { + // A `BeginIter` whose iteration count exceeds the cap is rejected before + // the body is re-emitted even once past the budget. Geometry: a 300x300 + // temp written elementwise from a same-shaped source. 
+ let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], 300 * 300); + let out = context.add_static_view(temp_view(0, &[300, 300])); + let src = context.add_static_view(dense_view(0, &[300, 300])); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::PushStaticView { view_id: out }, + Opcode::BeginIter { + write_temp_id: 0, + has_write_temp: true, + }, + Opcode::PushStaticView { view_id: src }, + Opcode::LoadIterViewAt { offset: 1 }, + Opcode::StoreIterElement {}, + Opcode::NextIterOrJump { jump_back: -3 }, + Opcode::EndIter {}, + Opcode::PopView {}, + Opcode::PopView {}, + ]; + match lower_only(&bc(vec![], code), &ctx) { + Err(WasmGenError::Unsupported(msg)) => assert!( + msg.contains("unrolling exceeds"), + "expected the unroll-budget message, got: {msg}" + ), + Ok(_) => panic!("an iteration larger than the cap must be Unsupported"), + } +} + +#[test] +fn array_size_over_huge_view_is_free() { + // ArraySize emits no element reads (`size() as f64`), so it must NOT be + // charged against the unroll budget: a view far larger than the cap still + // reports its size and lowers fine. + let mut context = ByteCodeContext::default(); + let view_id = context.add_static_view(dense_view(0, &[300, 300])); + let ctx = ctx_with_arrays(&context); + let code = vec![ + Opcode::PushStaticView { view_id }, + Opcode::ArraySize {}, + Opcode::PopView {}, + ]; + assert!( + lower_only(&bc(vec![], code), &ctx).is_ok(), + "ArraySize does no element reads and must not be capped" + ); +} + +#[test] +fn reducer_just_under_cap_compiles_and_matches_vm() { + // A view sized just under the cap still lowers and runs to VM parity. We + // keep the fixture small/fast (a 64-element view) but assert the budget + // accounting admits it: 64 << MAX_UNROLL_UNITS. (The full corpus of small + // arrayed reducer tests above is the broad just-under-cap parity check; + // this pins the boundary intent.) 
+ let data: Vec = (0..64).map(|i| (i as f64) * 0.5).collect(); + let view = dense_view(0, &[64]); + assert!(view.to_runtime_view().size() <= MAX_UNROLL_UNITS); + let got = run_static_reduce(view.clone(), Opcode::ArraySum {}, &data); + assert_eq!(got, vm_sum(&view, &data)); +} + +#[test] +fn unroll_cap_has_headroom_over_realistic_arrays() { + // The cap must be generous enough for real SD models. The test corpus's + // largest single dimension is 9; even a region x sector x cohort nest is + // ~10^3 elements. A compile-time assert pins that the cap clears a + // deliberately roomy 10^4 with margin, documenting that legitimate models + // never trip it. + const _: () = assert!( + MAX_UNROLL_UNITS >= 10_000, + "the unroll cap must leave ample headroom for realistic arrayed models" + ); +} + +// ════════════════════════════════════════════════════════════════════════ +// Phase 6 Task 1: VECTOR SELECT + VECTOR ELM MAP +// +// `VectorSelect` reduces two views (a selector mask + an expression array) to +// ONE scalar pushed on the stack. `VectorElmMap` maps a source array through a +// per-element offset array into a `write_temp_id` temp region. Both are run +// under DLR-FT and cross-checked against the VM: VectorSelect against a faithful +// oracle of the `vm.rs:2444-2502` arm, VectorElmMap against the sibling +// `crate::vm_vector_elm_map::vector_elm_map` function directly. +// ════════════════════════════════════════════════════════════════════════ + +/// The VM `VectorSelect` oracle (mirroring `vm.rs:2444-2502`): zip the two views +/// to the shorter size, collect `expr` where `is_truthy(sel)`, then dispatch the +/// action (1=min, 2=mean, 3=max, 4=product, else sum) with the empty-selection +/// fallback to `max_value`. 
+fn vm_vector_select_oracle( + sel_view: &StaticArrayView, + expr_view: &StaticArrayView, + sel_data: &[f64], + expr_data: &[f64], + max_value: f64, + action: i32, +) -> f64 { + let sel_rv = sel_view.to_runtime_view(); + let expr_rv = expr_view.to_runtime_view(); + let size = sel_rv.size().min(expr_rv.size()); + let mut selected: Vec = Vec::new(); + let mut sel_idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; sel_rv.dims.len()]; + let mut expr_idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; expr_rv.dims.len()]; + for _ in 0..size { + let sel_off = sel_rv.flat_offset(&sel_idx); + let sel_val = sel_data[sel_rv.base_off as usize + sel_off]; + if crate::vm::is_truthy(sel_val) { + let expr_off = expr_rv.flat_offset(&expr_idx); + selected.push(expr_data[expr_rv.base_off as usize + expr_off]); + } + crate::vm::increment_indices(&mut sel_idx, &sel_rv.dims); + crate::vm::increment_indices(&mut expr_idx, &expr_rv.dims); + } + if selected.is_empty() { + max_value + } else { + match action { + 1 => selected.iter().cloned().fold(f64::INFINITY, f64::min), + 2 => selected.iter().sum::() / selected.len() as f64, + 3 => selected.iter().cloned().fold(f64::NEG_INFINITY, f64::max), + 4 => selected.iter().product(), + _ => selected.iter().sum(), + } + } +} + +/// Run `PushStaticView(sel); PushStaticView(expr); VectorSelect` over a `curr` +/// slab. The two views are pushed sel-then-expr so `expr_view = top`, +/// `sel_view = top-1` (matching the VM). `max_value`/`action` are pushed as the +/// two operands beneath `VectorSelect` (the VM pops `action` then `max_value`). 
+#[allow(clippy::too_many_arguments)] +fn run_vector_select( + sel_view: StaticArrayView, + expr_view: StaticArrayView, + sel_base: u32, + expr_base: u32, + data: &[f64], + max_value: f64, + action: f64, +) -> f64 { + let mut context = ByteCodeContext::default(); + let sel_id = context.add_static_view(sel_view); + let expr_id = context.add_static_view(expr_view); + let ctx = ctx_with_arrays(&context); + let _ = (sel_base, expr_base); + let code = vec![ + Opcode::LoadConstant { id: 0 }, // max_value (pushed first) + Opcode::LoadConstant { id: 1 }, // action (pushed second, on top) + Opcode::PushStaticView { view_id: sel_id }, + Opcode::PushStaticView { view_id: expr_id }, + Opcode::VectorSelect {}, + Opcode::PopView {}, + Opcode::PopView {}, + ]; + run( + &bc(vec![max_value, action], code), + &ctx, + true, + 0, + &seed_run(0, data), + None, + ) +} + +/// Assert the emitted `VectorSelect` matches the VM oracle for `action`, on the +/// shared `sel`/`expr` fixture seeded from `data` (sel slots 0..4, expr 4..8). +fn assert_vector_select_matches(action: f64, max_value: f64) { + let sel = dense_view(0, &[4]); + let expr = dense_view(4, &[4]); + let data = [1.0, 0.0, 1.0, 1.0, 10.0, 20.0, 30.0, 40.0]; + let got = run_vector_select(sel.clone(), expr.clone(), 0, 4, &data, max_value, action); + let want = vm_vector_select_oracle(&sel, &expr, &data, &data, max_value, action.round() as i32); + if want.is_nan() { + assert!(got.is_nan(), "action {action}: expected NaN, got {got}"); + } else { + assert_eq!(got, want, "action {action}: got {got}, want {want}"); + } +} + +#[test] +fn vector_select_sum_matches_vm() { + // sel = [1, 0, 1, 1] (mask), expr = [10, 20, 30, 40], action 5 (sum). + // Selected = [10, 30, 40] -> 80. 
+ assert_vector_select_matches(5.0, -1.0); + let sel = dense_view(0, &[4]); + let expr = dense_view(4, &[4]); + let data = [1.0, 0.0, 1.0, 1.0, 10.0, 20.0, 30.0, 40.0]; + let got = run_vector_select(sel, expr, 0, 4, &data, -1.0, 5.0); + assert_eq!(got, 80.0); +} + +#[test] +fn vector_select_each_action_matches_vm() { + // 1=min, 2=mean, 3=max, 4=product, and a few "else -> sum" actions. The + // selected set is [10, 30, 40]: min 10, mean 80/3, max 40, product 12000, + // sum 80. + for action in [1.0, 2.0, 3.0, 4.0, 0.0, 5.0, 7.0] { + assert_vector_select_matches(action, -1.0); + } +} + +#[test] +fn vector_select_empty_selection_returns_max_value() { + // An all-false mask selects nothing, so the result is `max_value` for every + // action (the VM's `if selected.is_empty() { max_value }`). + let sel = dense_view(0, &[4]); + let expr = dense_view(4, &[4]); + // Mask all zero. + let data = [0.0, 0.0, 0.0, 0.0, 10.0, 20.0, 30.0, 40.0]; + for action in [1.0, 2.0, 3.0, 4.0, 5.0] { + let got = run_vector_select(sel.clone(), expr.clone(), 0, 4, &data, 123.5, action); + let want = vm_vector_select_oracle(&sel, &expr, &data, &data, 123.5, action.round() as i32); + assert_eq!( + got, want, + "action {action}: empty selection must be max_value" + ); + assert_eq!(got, 123.5); + } +} + +#[test] +fn vector_select_nan_in_mask_is_truthy_like_vm() { + // is_truthy(NaN) is true (approx_eq(NaN, 0) is false), so a NaN mask entry + // SELECTS its expr value, exactly as the VM does. Mask = [NaN, 0, 1]: + // selects expr[0] and expr[2]. 
+ let sel = dense_view(0, &[3]); + let expr = dense_view(3, &[3]); + let data = [f64::NAN, 0.0, 1.0, 100.0, 200.0, 300.0]; + for action in [1.0, 3.0, 5.0] { + let got = run_vector_select(sel.clone(), expr.clone(), 3, 3, &data, -1.0, action); + let want = vm_vector_select_oracle(&sel, &expr, &data, &data, -1.0, action.round() as i32); + assert_eq!( + got, want, + "action {action}: NaN mask entry must select its expr" + ); + } +} + +#[test] +fn vector_select_zip_stops_at_shorter_view() { + // sel has 4 elements, expr has 2: the VM zips to min(4, 2) = 2, so only the + // first two (sel, expr) pairs are considered. Mask [1, 1, ...] selects + // expr[0], expr[1]; the trailing sel entries never read a (nonexistent) + // expr element. + let sel = dense_view(0, &[4]); + let expr = dense_view(4, &[2]); + let data = [1.0, 1.0, 1.0, 1.0, 7.0, 11.0]; + let got = run_vector_select(sel.clone(), expr.clone(), 0, 4, &data, -1.0, 5.0); + let want = vm_vector_select_oracle(&sel, &expr, &data, &data, -1.0, 5); + assert_eq!(got, want); + assert_eq!(got, 18.0, "sum of the first two expr values"); +} + +#[test] +fn vector_select_nan_expr_value_ignored_by_minmax_like_vm() { + // A selected expr value of NaN is ignored by min/max (the VM folds with + // `f64::min`/`f64::max`, which return the non-NaN operand), so wasm `f64.min`/ + // `f64.max` (NaN-propagating) would diverge -- this pins the faithful + // NaN-ignoring fold. Selected = [10, NaN, 40]: min 10, max 40 (NOT NaN); + // sum/mean/product DO see the NaN (VM uses `+`/`*`, which propagate). + let sel = dense_view(0, &[3]); + let expr = dense_view(3, &[3]); + let data = [1.0, 1.0, 1.0, 10.0, f64::NAN, 40.0]; + // min and max must be exactly 10 and 40 (NaN ignored). 
+ assert_eq!( + run_vector_select(sel.clone(), expr.clone(), 3, 3, &data, -1.0, 1.0), + 10.0 + ); + assert_eq!( + run_vector_select(sel.clone(), expr.clone(), 3, 3, &data, -1.0, 3.0), + 40.0 + ); + // sum/product propagate the NaN, matching the VM (cross-checked vs oracle). + for action in [2.0, 4.0, 5.0] { + assert_vector_select_nan_expr(&sel, &expr, &data, action); + } +} + +fn assert_vector_select_nan_expr( + sel: &StaticArrayView, + expr: &StaticArrayView, + data: &[f64], + action: f64, +) { + let got = run_vector_select(sel.clone(), expr.clone(), 3, 3, data, -1.0, action); + let want = vm_vector_select_oracle(sel, expr, data, data, -1.0, action.round() as i32); + if want.is_nan() { + assert!(got.is_nan(), "action {action}: expected NaN, got {got}"); + } else { + assert_eq!(got, want, "action {action}"); + } +} + +// ── VectorElmMap parity vs the sibling VM function ──────────────────────── + +/// Run `PushStaticView(source); PushStaticView(offset); VectorElmMap` over a +/// `curr` slab seeded from `data`, writing temp 0, and read back `count` temp +/// slots. The source view is pushed first (`top-1`), the offset view second +/// (`top`), matching the VM (`offset_view = top, source_view = top-1`). 
+fn run_vector_elm_map( + source: StaticArrayView, + offset: StaticArrayView, + full_source_len: u32, + data: &[f64], + temp_count: usize, + temp_slots: usize, +) -> Vec { + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], temp_slots); + let src_id = context.add_static_view(source); + let off_id = context.add_static_view(offset); + let code = vec![ + Opcode::PushStaticView { view_id: src_id }, + Opcode::PushStaticView { view_id: off_id }, + Opcode::VectorElmMap { + write_temp_id: 0, + full_source_len, + }, + Opcode::PopView {}, + Opcode::PopView {}, + ]; + run_and_read_temps(&context, code, vec![], &seed_run(0, data), temp_count) +} + +/// The VM oracle for `VectorElmMap`: run the sibling +/// `crate::vm_vector_elm_map::vector_elm_map` over `RuntimeView`s built from the +/// same static views, reading `curr` from `data`. Returns the written temp 0 +/// slots (`temp_slots` wide). +fn vm_elm_map_oracle( + source: &StaticArrayView, + offset: &StaticArrayView, + full_source_len: u32, + data: &[f64], + temp_slots: usize, +) -> Vec { + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], temp_slots); + let mut temp_storage = vec![0.0f64; temp_slots]; + crate::vm_vector_elm_map::vector_elm_map( + &source.to_runtime_view(), + &offset.to_runtime_view(), + 0, + full_source_len, + data, + &mut temp_storage, + &context, + ); + temp_storage +} + +/// Assert the emitted `VectorElmMap` matches the sibling VM function element-for- +/// element over the `offset_view` size (NaN compares as NaN). 
+fn assert_elm_map_matches( + source: &StaticArrayView, + offset: &StaticArrayView, + full_source_len: u32, + data: &[f64], + temp_slots: usize, +) { + let got = run_vector_elm_map( + source.clone(), + offset.clone(), + full_source_len, + data, + temp_slots, + temp_slots, + ); + let want = vm_elm_map_oracle(source, offset, full_source_len, data, temp_slots); + assert_eq!(got.len(), want.len()); + for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() { + if w.is_nan() { + assert!(g.is_nan(), "elm_map slot {i}: expected NaN, got {g}"); + } else { + assert_eq!(g, w, "elm_map slot {i}: got {g}, want {w}"); + } + } +} + +#[test] +fn vector_elm_map_full_array_in_range_matches_vm() { + // Full contiguous source [a,b,c,d] in curr slots 0..4; offset [1,3,0,2] in + // curr slots 4..8 -> result = source[round(offset[i])] = [b, d, a, c]. + let source = dense_view(0, &[4]); + let offset = dense_view(4, &[4]); + let data = [10.0, 20.0, 30.0, 40.0, 1.0, 3.0, 0.0, 2.0]; + assert_elm_map_matches(&source, &offset, 4, &data, 4); + let got = run_vector_elm_map(source, offset, 4, &data, 4, 4); + assert_eq!(got, vec![20.0, 40.0, 10.0, 30.0]); +} + +#[test] +fn vector_elm_map_out_of_range_offset_is_nan() { + // An offset that lands outside [0, full_source_len) yields NaN (no modulo). + // Source len 3; offsets [0, 5, -1] -> [source[0], NaN, NaN]. + let source = dense_view(0, &[3]); + let offset = dense_view(3, &[3]); + let data = [7.0, 8.0, 9.0, 0.0, 5.0, -1.0]; + assert_elm_map_matches(&source, &offset, 3, &data, 3); + let got = run_vector_elm_map(source, offset, 3, &data, 3, 3); + assert_eq!(got[0], 7.0); + assert!(got[1].is_nan() && got[2].is_nan()); +} + +#[test] +fn vector_elm_map_nan_offset_is_nan() { + // A NaN offset yields NaN, regardless of the (would-be) index. 
+ let source = dense_view(0, &[3]); + let offset = dense_view(3, &[3]); + let data = [7.0, 8.0, 9.0, 1.0, f64::NAN, 2.0]; + assert_elm_map_matches(&source, &offset, 3, &data, 3); + let got = run_vector_elm_map(source, offset, 3, &data, 3, 3); + assert_eq!(got[0], 8.0); + assert!(got[1].is_nan()); + assert_eq!(got[2], 9.0); +} + +#[test] +fn vector_elm_map_offset_rounds_half_away_like_vm() { + // The VM rounds the offset with `f64::round` (half away from zero), NOT wasm + // `f64.nearest` (half to even). Offsets [0.5, 1.5, 2.5] round to [1, 2, 3] + // (away from zero), not [0, 2, 2] (to even). Cross-checked vs the sibling. + let source = dense_view(0, &[4]); + let offset = dense_view(4, &[3]); + let data = [10.0, 20.0, 30.0, 40.0, 0.5, 1.5, 2.5]; + assert_elm_map_matches(&source, &offset, 4, &data, 3); + let got = run_vector_elm_map(source, offset, 4, &data, 3, 3); + // round(0.5)=1 -> source[1]=20; round(1.5)=2 -> 30; round(2.5)=3 -> 40. + assert_eq!(got, vec![20.0, 30.0, 40.0]); +} + +// ── emit_round_half_away parity vs f64::round (the VM's rounding oracle) ─── + +/// Build a module exporting `mem` and `eval(module_off: i32)` whose body loads +/// the f64 at memory slot 0 (byte 0), runs [`super::vector::emit_round_half_away`] +/// directly, and stores the rounded result back to slot 0. Mirrors +/// [`build_module`]'s helper-prefix assembly so the function declarations match +/// production; the body is a focused probe of just the round helper. 
+fn build_round_probe_module() -> Vec { + let mut module = Module::new(); + + let helpers = build_helpers(); + let n_helpers = helpers.functions.len() as u32; + + let mut types = TypeSection::new(); + types.ty().function([ValType::I32], []); // eval(module_off) -> () + for hf in &helpers.functions { + types.ty().function(hf.params.clone(), hf.results.clone()); + } + module.section(&types); + + let mut functions = FunctionSection::new(); + for (i, _) in helpers.functions.iter().enumerate() { + functions.function(1 + i as u32); + } + functions.function(0); + module.section(&functions); + + let mut memories = MemorySection::new(); + memories.memory(MemoryType { + minimum: 1, + maximum: None, + memory64: false, + shared: false, + page_size_log2: None, + }); + module.section(&memories); + + let mut exports = ExportSection::new(); + exports.export("eval", ExportKind::Func, n_helpers); + exports.export("mem", ExportKind::Memory, 0); + module.section(&exports); + + let mut code = CodeSection::new(); + for hf in helpers.functions { + code.function(&hf.body); + } + // Same local layout production uses; the round helper draws its two f64 + // temps from `scratch_local` (index 1) and `apply_locals[0]` (index 2). + let ctx = ctx_with_cond_depth(0); + let mut func = Function::new(opcode_fn_locals(0, 0, 0)); + // result_addr (i32) for the trailing store, then x = mem[0]. + func.instruction(&Instruction::I32Const(0)); + func.instruction(&Instruction::I32Const(0)); + func.instruction(&Instruction::F64Load(memarg(0))); + crate::wasmgen::vector::emit_round_half_away(&mut func, ctx.scratch_local, ctx.apply_locals[0]); + func.instruction(&Instruction::F64Store(memarg(0))); + func.instruction(&Instruction::End); + code.function(&func); + module.section(&code); + + module.finish() +} + +/// Run the round probe over input `x` and return the f64 the helper produced. 
+fn run_round_half_away(x: f64) -> f64 { + let bytes = build_round_probe_module(); + let info = validate(&bytes).expect("round-probe module must validate"); + let mut store = Store::new(()); + let module = store + .module_instantiate(&info, Vec::new(), None) + .expect("round-probe module must instantiate") + .module_addr; + + let mem = store + .instance_export(module, "mem") + .unwrap() + .as_mem() + .unwrap(); + store.mem_access_mut_slice(mem, |bytes| { + bytes[0..8].copy_from_slice(&x.to_le_bytes()); + }); + + let eval = store + .instance_export(module, "eval") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,)) + .expect("round-probe invocation must succeed"); + + store.mem_access_mut_slice(mem, |bytes| { + f64::from_le_bytes(bytes[0..8].try_into().unwrap()) + }) +} + +/// Assert the emitted round helper reproduces `f64::round` (the VM's rounding +/// oracle) bit-for-bit, including the sign of a zero result. Cross-checking +/// against the standard-library oracle is the whole point: the prior +/// `trunc(x + copysign(0.5, x))` form diverged from it for two reachable input +/// classes (see `emit_round_half_away`'s rustdoc). +fn assert_round_matches_f64_round(x: f64) { + let got = run_round_half_away(x); + let want = x.round(); + if want.is_nan() { + assert!(got.is_nan(), "round({x}): expected NaN, got {got}"); + } else { + assert_eq!( + got.to_bits(), + want.to_bits(), + "round({x}): got {got} (bits {:#x}), want {want} (bits {:#x})", + got.to_bits(), + want.to_bits() + ); + } +} + +#[test] +fn round_half_away_matches_f64_round_boundary_classes() { + // Class (a): the largest f64 strictly below 0.5. `trunc(x + 0.5)` rounds the + // sum up to exactly 1.0 and yields 1; `f64::round` yields 0. The sign of the + // zero must be preserved (`-0.0` for the negative input). 
+ let just_below_half = 0.499_999_999_999_999_94_f64; // == 0.5_f64.next_down() + assert_eq!(just_below_half, f64::from_bits(0x3fdf_ffff_ffff_ffff)); + assert_round_matches_f64_round(just_below_half); + assert_round_matches_f64_round(-just_below_half); + assert_eq!(run_round_half_away(just_below_half), 0.0); + assert!(run_round_half_away(-just_below_half).is_sign_negative()); + + // Class (b): an already-integer magnitude in [2^52, 2^53). `x + 0.5` rounds + // up to `x + 1`; `f64::round` returns `x` unchanged. + let big_odd_int = 4_503_599_627_370_497.0_f64; // 2^52 + 1 + assert_round_matches_f64_round(big_odd_int); + assert_round_matches_f64_round(-big_odd_int); + assert_eq!(run_round_half_away(big_odd_int), big_odd_int); + + // Exact-half inputs: round AWAY from zero (the VM's `f64::round`), not the + // half-to-even of wasm `f64.nearest`. + for &x in &[0.5_f64, -0.5, 1.5, 2.5, -2.5, -0.0, 0.0] { + assert_round_matches_f64_round(x); + } +} + +#[test] +fn round_half_away_matches_f64_round_sampled() { + // A deterministic sweep of magnitudes/signs/fractions cross-checked against + // the `f64::round` oracle, so a future change to the helper that drifts from + // the VM's rounding is caught here, not only in the two boundary classes. + let mut state = 0x2545_f491_4f6c_dd1d_u64; // xorshift64* seed + let mut next = || { + state ^= state >> 12; + state ^= state << 25; + state ^= state >> 27; + state.wrapping_mul(0x2545_f491_4f6c_dd1d) + }; + for _ in 0..2000 { + let bits = next(); + // Span small fractions through large integer-grid magnitudes. 
+ let scale = match bits % 5 { + 0 => 1.0, + 1 => 16.0, + 2 => 1024.0, + 3 => 4_503_599_627_370_496.0, // 2^52 + _ => 9_007_199_254_740_992.0, // 2^53 + }; + let frac = (bits >> 8) as f64 / (u64::MAX >> 8) as f64; // [0, 1) + let mag = frac * scale * 2.0; + let x = if bits & 1 == 0 { mag } else { -mag }; + assert_round_matches_f64_round(x); + } +} + +#[test] +fn vector_elm_map_sliced_source_base_i_matches_vm() { + // A strict-slice source: a 2-D source [DimA(2), DimB(3)] (full storage 6 + // elements in curr 0..6), sliced... here we exercise the carried-axis base_i + // projection via a source whose remaining dim shares its dim_id with the + // offset view. Source = matrix[A,B] row-major; offset view is 2-D [A,B] with + // matching dim_ids, so element (a,b) reads source[base_i(a) + round(off)]. + // + // Build source as [A(2), B(3)] dim_ids [0,1] over storage [0..6], and offset + // as [A(2)] dim_id [0] -- but VECTOR ELM MAP needs offset.size() result + // slots, so use a 2-D offset matching the result. We model the genuine + // shape: source full storage len 6, source view is the full [2,3], offset + // [2,3] with the same dim_ids; base_i is 0 (full array) and offset indexes + // the whole storage. To exercise a NON-zero base_i we instead slice the + // source to a single row and give the offset that row's dim. + // + // Simpler faithful base_i case: source view = row 1 of a [2,3] matrix + // (offset folds in 3), dim_ids [1] (DimB); offset view [3] dim_id [1]. Then + // base_i = source.flat_offset([b]) projects DimB, and the result reads + // storage[3 + round(off)]. full_source_len = 6. + let mut source = dense_view(0, &[3]); // the sliced row: dims [3] + source.offset = 3; // row 1 of a [2,3] matrix starts at flat 3 + source.dim_ids = SmallVec::from_slice(&[1]); // DimB + let mut offset = dense_view(6, &[3]); + offset.dim_ids = SmallVec::from_slice(&[1]); // DimB, matching the source + // Storage: matrix rows [r0: 100,101,102][r1: 200,201,202]; offsets [0,1,2]. 
+ let data = [100.0, 101.0, 102.0, 200.0, 201.0, 202.0, 0.0, 1.0, 2.0]; + assert_elm_map_matches(&source, &offset, 6, &data, 3); + let got = run_vector_elm_map(source, offset, 6, &data, 3, 3); + // base_i for element b is source.flat_offset([b]) = 3 + b; + round(off[b]): + // b=0: 3 + 0 -> storage[3]=200; b=1: 4 + 1 -> storage[5]=202; + // b=2: 5 + 2 = 7 -> OOB (>=6) -> NaN. + assert_eq!(got[0], 200.0); + assert_eq!(got[1], 202.0); + assert!(got[2].is_nan()); +} + +// ── VectorSortOrder / Rank parity vs the VM (stable sort) ───────────────── + +/// Run `PushStaticView(input); Vector{SortOrder|Rank}` over a `curr` slab seeded +/// from `data`, writing temp 0, and read back `temp_count` temp slots. The +/// `direction` operand is pushed beneath the op. +fn run_sort_op( + input: StaticArrayView, + op: Opcode, + direction: f64, + data: &[f64], + temp_count: usize, + temp_slots: usize, +) -> Vec { + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], temp_slots); + let in_id = context.add_static_view(input); + let code = vec![ + Opcode::LoadConstant { id: 0 }, // direction + Opcode::PushStaticView { view_id: in_id }, + op, + Opcode::PopView {}, + ]; + run_and_read_temps( + &context, + code, + vec![direction], + &seed_run(0, data), + temp_count, + ) +} + +/// The VM oracle for `VectorSortOrder`: run the sibling +/// `crate::vm_vector_sort_order::vector_sort_order` over a `RuntimeView`. 
+fn vm_sort_order_oracle( + input: &StaticArrayView, + direction: i32, + data: &[f64], + temp_slots: usize, +) -> Vec { + let mut context = ByteCodeContext::default(); + context.set_temp_info(vec![0], temp_slots); + let mut temp_storage = vec![0.0f64; temp_slots]; + crate::vm_vector_sort_order::vector_sort_order( + &input.to_runtime_view(), + direction, + 0, + data, + &mut temp_storage, + &context, + ); + temp_storage +} + +/// A faithful local oracle for `Rank` (mirroring `vm.rs:2540-2584`): over the +/// whole view, collect `(value, orig_idx)`, stable sort (asc if direction==1 +/// else desc, NaN-as-Equal), write `temp[orig_idx] = rank_0based + 1`. +fn vm_rank_oracle( + input: &StaticArrayView, + direction: i32, + data: &[f64], + temp_slots: usize, +) -> Vec { + let rv = input.to_runtime_view(); + let size = rv.size(); + let mut indexed: Vec<(f64, usize)> = Vec::with_capacity(size); + let mut idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; rv.dims.len()]; + for i in 0..size { + let flat = rv.flat_offset(&idx); + indexed.push((data[rv.base_off as usize + flat], i)); + crate::vm::increment_indices(&mut idx, &rv.dims); + } + if direction == 1 { + indexed.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)); + } else { + indexed.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)); + } + let mut temp = vec![0.0f64; temp_slots]; + for (rank_0based, &(_, orig_idx)) in indexed.iter().enumerate() { + temp[orig_idx] = (rank_0based + 1) as f64; + } + temp +} + +fn assert_sort_order_matches(input: &StaticArrayView, direction: f64, data: &[f64], slots: usize) { + let got = run_sort_op( + input.clone(), + Opcode::VectorSortOrder { write_temp_id: 0 }, + direction, + data, + slots, + slots, + ); + let want = vm_sort_order_oracle(input, direction.round() as i32, data, slots); + assert_eq!(got, want, "sort_order direction {direction}"); +} + +fn assert_rank_matches(input: &StaticArrayView, direction: f64, data: &[f64], slots: usize) { 
+ let got = run_sort_op( + input.clone(), + Opcode::Rank { write_temp_id: 0 }, + direction, + data, + slots, + slots, + ); + let want = vm_rank_oracle(input, direction.round() as i32, data, slots); + assert_eq!(got, want, "rank direction {direction}"); +} + +#[test] +fn vector_sort_order_1d_ascending_matches_vm() { + // input [30, 10, 20, 40]; ascending -> the sorted in-row source indices are + // [1 (10), 2 (20), 0 (30), 3 (40)]. + let input = dense_view(0, &[4]); + let data = [30.0, 10.0, 20.0, 40.0]; + assert_sort_order_matches(&input, 1.0, &data, 4); + let got = run_sort_op( + input, + Opcode::VectorSortOrder { write_temp_id: 0 }, + 1.0, + &data, + 4, + 4, + ); + assert_eq!(got, vec![1.0, 2.0, 0.0, 3.0]); +} + +#[test] +fn vector_sort_order_1d_descending_matches_vm() { + // direction != 1 sorts descending: [30,10,20,40] -> indices of [40,30,20,10] + // = [3, 0, 2, 1]. + let input = dense_view(0, &[4]); + let data = [30.0, 10.0, 20.0, 40.0]; + assert_sort_order_matches(&input, 0.0, &data, 4); + let got = run_sort_op( + input, + Opcode::VectorSortOrder { write_temp_id: 0 }, + 0.0, + &data, + 4, + 4, + ); + assert_eq!(got, vec![3.0, 0.0, 2.0, 1.0]); +} + +#[test] +fn vector_sort_order_tie_stability_matches_vm() { + // Equal values keep input order (stable). [5, 5, 1, 5]: ascending sorts the + // single 1 (index 2) first, then the three 5s in input order [0, 1, 3]. + let input = dense_view(0, &[4]); + let data = [5.0, 5.0, 1.0, 5.0]; + assert_sort_order_matches(&input, 1.0, &data, 4); + let got = run_sort_op( + input, + Opcode::VectorSortOrder { write_temp_id: 0 }, + 1.0, + &data, + 4, + 4, + ); + assert_eq!(got, vec![2.0, 0.0, 1.0, 3.0]); +} + +#[test] +fn vector_sort_order_multi_row_matches_vm() { + // A 2x3 source: each ROW is sorted independently (the innermost dim is the + // sorted axis), and result indices are 0-based WITHIN the row. Row 0 + // [30,10,20] asc -> [1,2,0]; row 1 [5,9,7] asc -> [0,2,1]. The output is + // row-major, so temp = [1,2,0, 0,2,1]. 
+ let input = dense_view(0, &[2, 3]); + let data = [30.0, 10.0, 20.0, 5.0, 9.0, 7.0]; + assert_sort_order_matches(&input, 1.0, &data, 6); + let got = run_sort_op( + input, + Opcode::VectorSortOrder { write_temp_id: 0 }, + 1.0, + &data, + 6, + 6, + ); + assert_eq!(got, vec![1.0, 2.0, 0.0, 0.0, 2.0, 1.0]); +} + +#[test] +fn vector_sort_order_nan_element_is_stable_like_vm() { + // A NaN element compares Equal to everything (the VM's + // partial_cmp.unwrap_or(Equal) under a stable sort), so it neither displaces + // a non-NaN nor reorders -- it stays in input order. Cross-checked + // element-for-element vs the sibling VM function. + let input = dense_view(0, &[4]); + let data = [3.0, f64::NAN, 1.0, 2.0]; + assert_sort_order_matches(&input, 1.0, &data, 4); + assert_sort_order_matches(&input, 0.0, &data, 4); +} + +#[test] +fn vector_sort_order_transposed_view_matches_vm() { + // A non-contiguous (transposed) view exercises the strided element reads in + // the gather. Cross-checked vs the sibling over every element. + let view = StaticArrayView { + base_off: 0, + is_temp: false, + dims: SmallVec::from_slice(&[3, 2]), + strides: SmallVec::from_slice(&[1, 3]), + offset: 0, + sparse: SmallVec::new(), + dim_ids: SmallVec::from_slice(&[0, 0]), + }; + assert!(!view.to_runtime_view().is_contiguous()); + let data = [11.0, 12.0, 13.0, 21.0, 22.0, 23.0]; + assert_sort_order_matches(&view, 1.0, &data, 6); + assert_sort_order_matches(&view, 0.0, &data, 6); +} + +#[test] +fn rank_whole_view_ascending_matches_vm() { + // Rank over the WHOLE view, 1-based, indexed by ORIGINAL position. [30,10,20, + // 40] ascending: 10 is rank 1, 20 rank 2, 30 rank 3, 40 rank 4, so the result + // at the original positions is [3, 1, 2, 4]. 
+ let input = dense_view(0, &[4]); + let data = [30.0, 10.0, 20.0, 40.0]; + assert_rank_matches(&input, 1.0, &data, 4); + let got = run_sort_op(input, Opcode::Rank { write_temp_id: 0 }, 1.0, &data, 4, 4); + assert_eq!(got, vec![3.0, 1.0, 2.0, 4.0]); +} + +#[test] +fn rank_whole_view_descending_matches_vm() { + // Descending: 40 rank 1, 30 rank 2, 20 rank 3, 10 rank 4 -> [2, 4, 3, 1]. + let input = dense_view(0, &[4]); + let data = [30.0, 10.0, 20.0, 40.0]; + assert_rank_matches(&input, 0.0, &data, 4); + let got = run_sort_op(input, Opcode::Rank { write_temp_id: 0 }, 0.0, &data, 4, 4); + assert_eq!(got, vec![2.0, 4.0, 3.0, 1.0]); +} + +#[test] +fn rank_multi_dim_ranks_whole_view_not_per_row() { + // Unlike VectorSortOrder, Rank ranks the WHOLE view (not per-row). A 2x3 + // view ranks all 6 cells together. Cross-checked vs the faithful oracle. + let input = dense_view(0, &[2, 3]); + let data = [30.0, 10.0, 20.0, 5.0, 9.0, 7.0]; + assert_rank_matches(&input, 1.0, &data, 6); + // Sorted ascending: 5(idx3),9(idx4),7(idx5)... actually [5,7,9,10,20,30] + // -> ranks at original positions: 30->6, 10->4, 20->5, 5->1, 9->3, 7->2. + let got = run_sort_op(input, Opcode::Rank { write_temp_id: 0 }, 1.0, &data, 6, 6); + assert_eq!(got, vec![6.0, 4.0, 5.0, 1.0, 3.0, 2.0]); +} + +#[test] +fn rank_tie_stability_matches_vm() { + // Equal values keep input order: [5, 5, 1, 5] ascending. The 1 (idx 2) is + // rank 1; the three 5s get ranks 2, 3, 4 in input order (idx 0, 1, 3). + let input = dense_view(0, &[4]); + let data = [5.0, 5.0, 1.0, 5.0]; + assert_rank_matches(&input, 1.0, &data, 4); + let got = run_sort_op(input, Opcode::Rank { write_temp_id: 0 }, 1.0, &data, 4, 4); + assert_eq!(got, vec![2.0, 3.0, 1.0, 4.0]); +} + +#[test] +fn rank_nan_element_matches_vm() { + // A NaN element compares Equal (stable). Cross-checked vs the faithful oracle + // (the NaN keeps its input position in the stable sort, so its rank is its + // sorted slot among the Equal-treated elements). 
+ let input = dense_view(0, &[4]); + let data = [3.0, f64::NAN, 1.0, 2.0]; + assert_rank_matches(&input, 1.0, &data, 4); + assert_rank_matches(&input, 0.0, &data, 4); +} + +/// Build `mat[rows][cols]` via `PushVarViewDirect`, dynamically subscript dim 0 +/// with an out-of-range `row_1based` (so the resulting `cols`-element row view's +/// validity flag is 0), run `op` writing temp 0, and read back the `cols` temp +/// slots. An invalid input view must fill the whole temp region with NaN. +fn run_dyn_sort_op(rows: u16, cols: u16, row_1based: f64, op: Opcode, data: &[f64]) -> Vec { + let mut context = ByteCodeContext::default(); + context.add_dim_list(2, [rows, cols, 0, 0]); + context.set_temp_info(vec![0], cols as usize); + let code = vec![ + Opcode::PushVarViewDirect { + base_off: 0, + dim_list_id: 0, + }, + Opcode::LoadConstant { id: 0 }, // direction + Opcode::LoadConstant { id: 1 }, // runtime row index (1-based) + Opcode::ViewSubscriptDynamic { dim_idx: 0 }, + op, + Opcode::PopView {}, + ]; + run_and_read_temps( + &context, + code, + vec![1.0, row_1based], + &seed_run(0, data), + cols as usize, + ) +} + +#[test] +fn vector_sort_order_invalid_view_fills_temp_with_nan() { + // A 3x4 matrix; row 5 is out of range, so the dynamically-subscripted row + // view is invalid and VectorSortOrder must fill the whole temp with NaN + // (the VM's `!is_valid -> fill_temp_nan`). + let data: Vec = (0..12).map(|i| i as f64).collect(); + let got = run_dyn_sort_op( + 3, + 4, + 5.0, + Opcode::VectorSortOrder { write_temp_id: 0 }, + &data, + ); + assert!( + got.iter().all(|v| v.is_nan()), + "invalid view must fill the temp with NaN, got {got:?}" + ); + // A valid row (row 2) writes real 0-based in-row ranks (no NaN). 
+ let ok = run_dyn_sort_op( + 3, + 4, + 2.0, + Opcode::VectorSortOrder { write_temp_id: 0 }, + &data, + ); + assert!(ok.iter().all(|v| !v.is_nan()), "valid row must not be NaN"); +} + +#[test] +fn rank_invalid_view_fills_temp_with_nan() { + let data: Vec = (0..12).map(|i| i as f64).collect(); + let got = run_dyn_sort_op(3, 4, 5.0, Opcode::Rank { write_temp_id: 0 }, &data); + assert!( + got.iter().all(|v| v.is_nan()), + "invalid view must fill the temp with NaN, got {got:?}" + ); + let ok = run_dyn_sort_op(3, 4, 2.0, Opcode::Rank { write_temp_id: 0 }, &data); + assert!(ok.iter().all(|v| !v.is_nan()), "valid row must not be NaN"); +} + +// ── LookupArray parity vs the VM (per-element arrayed GF) ───────────────── + +// GF region base for the LookupArray tests: past the curr/next chunks +// (4096..8192), TEMP_BASE (8192), and VECTOR_SCRATCH_BASE (16384), within the +// harness's single 64 KiB page. +const LA_GF_BASE: u32 = 24576; + +/// Seed `tables` into the GF directory + data regions at `LA_GF_BASE` (the +/// directory's N 8-byte entries, then each table's knots), matching the +/// production layout the `LookupArray`/`Lookup` opcodes read. +fn seed_gf_tables(tables: &[&[(f64, f64)]]) -> Vec<(u64, f64)> { + let n = tables.len() as u32; + let data_base = LA_GF_BASE + n * 8; // past the N directory entries + let mut seed = Vec::new(); + let mut data_rel = 0u32; + for (t, knots) in tables.iter().enumerate() { + let abs = data_base + data_rel; + seed.push(( + u64::from(LA_GF_BASE) + (t as u64) * 8, + dir_entry_f64(abs, knots.len() as u32), + )); + for (k, &(x, y)) in knots.iter().enumerate() { + let knot = u64::from(abs) + (k as u64) * 16; + seed.push((knot, x)); + seed.push((knot + 8, y)); + } + data_rel += knots.len() as u32 * 16; + } + seed +} + +/// Run `PushStaticView(input); LookupArray{base_gf, table_count, mode}; PopView` +/// over the seeded GF tables, writing temp 0, and read back `temp_count` slots. 
+/// `index` (the shared scalar lookup index) is pushed beneath the opcode.
+#[allow(clippy::too_many_arguments)]
+fn run_lookup_array(
+    input: StaticArrayView,
+    base_gf: GraphicalFunctionId,
+    table_count: u16,
+    mode: LookupMode,
+    index: f64,
+    tables: &[&[(f64, f64)]],
+    temp_count: usize,
+    temp_slots: usize,
+    input_data: &[f64],
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], temp_slots);
+    let view_id = context.add_static_view(input);
+    let ctx = EmitCtx {
+        gf_directory_base: LA_GF_BASE,
+        gf_data_base: LA_GF_BASE,
+        temp_storage_base: TEMP_BASE,
+        ctx: &context,
+        ..ctx_with_cond_depth(0)
+    };
+    let program = vec![
+        Opcode::LoadConstant { id: 0 }, // index
+        Opcode::PushStaticView { view_id },
+        Opcode::LookupArray {
+            base_gf,
+            table_count,
+            mode,
+            write_temp_id: 0,
+        },
+        Opcode::PopView {},
+    ];
+    // Memory image: the `seed_run` writes for the input data, then the GF
+    // directory and knots.
+    let mut seed = seed_run(0, input_data);
+    seed.extend(seed_gf_tables(tables));
+
+    let bytes = build_module(&bc(vec![index], program), &ctx, false, 0);
+    let info = validate(&bytes).expect("emitted module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let mem = store
+        .instance_export(inst, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    // Poke every seeded f64 into linear memory (little-endian) before `eval`.
+    store.mem_access_mut_slice(mem, |memory| {
+        for &(addr, value) in &seed {
+            let at = addr as usize;
+            memory[at..at + 8].copy_from_slice(&value.to_le_bytes());
+        }
+    });
+    let eval = store
+        .instance_export(inst, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+        .expect("invoke");
+    // Read the first `temp_count` f64 slots of the temp region back out.
+    store.mem_access_mut_slice(mem, |memory| {
+        (0..temp_count)
+            .map(|slot| {
+                let at = TEMP_BASE as usize + slot * 8;
+                f64::from_le_bytes(memory[at..at + 8].try_into().unwrap())
+            })
+            .collect()
+    })
+}
+
+/// Faithful oracle for `LookupArray` (mirroring `vm.rs:2586-2629`): for each
+/// element `i`, `elem_off = flat_offset(indices)`; NaN if `elem_off >=
+/// table_count`, else the VM lookup over `tables[base_gf + elem_off]` at `index`.
+fn vm_lookup_array_oracle(
+    input: &StaticArrayView,
+    base_gf: GraphicalFunctionId,
+    table_count: u16,
+    mode: LookupMode,
+    index: f64,
+    tables: &[&[(f64, f64)]],
+    temp_slots: usize,
+) -> Vec<f64> {
+    let rv = input.to_runtime_view();
+    let element_count = rv.size();
+    let mut indices: SmallVec<[u16; 4]> = smallvec::smallvec![0; rv.dims.len()];
+    // Slots past the view's element count keep their initial 0.0.
+    let mut temp = vec![0.0f64; temp_slots];
+    for slot in temp.iter_mut().take(element_count) {
+        let elem_off = rv.flat_offset(&indices);
+        // The bound is on `elem_off` alone, not on `base_gf + elem_off`.
+        *slot = if elem_off >= table_count as usize {
+            f64::NAN
+        } else {
+            let gf = tables[base_gf as usize + elem_off];
+            vm_lookup_oracle(mode, gf, index)
+        };
+        crate::vm::increment_indices(&mut indices, &rv.dims);
+    }
+    temp
+}
+
+/// Run the emitted `LookupArray` through `run_lookup_array` and compare every
+/// temp slot with `vm_lookup_array_oracle`: a NaN in the oracle must be a NaN
+/// in the wasm result, and every other slot must be exactly equal. `slots` is
+/// used both as the temp size and as the number of slots read back.
+#[allow(clippy::too_many_arguments)]
+fn assert_lookup_array_matches(
+    input: &StaticArrayView,
+    base_gf: GraphicalFunctionId,
+    table_count: u16,
+    mode: LookupMode,
+    index: f64,
+    tables: &[&[(f64, f64)]],
+    slots: usize,
+    input_data: &[f64],
+) {
+    let got = run_lookup_array(
+        input.clone(),
+        base_gf,
+        table_count,
+        mode,
+        index,
+        tables,
+        slots,
+        slots,
+        input_data,
+    );
+    let want = vm_lookup_array_oracle(input, base_gf, table_count, mode, index, tables, slots);
+    assert_eq!(got.len(), want.len());
+    for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() {
+        if w.is_nan() {
+            assert!(g.is_nan(), "lookup_array slot {i}: expected NaN, got {g}");
+        } else {
+            assert_eq!(g, w, "lookup_array slot {i}: got {g}, want {w}");
+        }
+    }
+}
+
+#[test]
+fn lookup_array_interp_matches_vm() {
+    // Three per-element tables; a contiguous 3-element input view -> elem_off
+    // [0, 1, 2]. Each element looks up its own table at the shared index.
+    let t0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)]; // y = 10x
+    let t1: &[(f64, f64)] = &[(0.0, 1.0), (10.0, 2.0)]; // y = x/10 + 1
+    let t2: &[(f64, f64)] = &[(0.0, 5.0), (10.0, 5.0)]; // constant 5
+    let tables = [t0, t1, t2];
+    let input = dense_view(0, &[3]);
+    let input_data = [0.0, 0.0, 0.0];
+    assert_lookup_array_matches(
+        &input,
+        0,
+        3,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        3,
+        &input_data,
+    );
+    let got = run_lookup_array(
+        input,
+        0,
+        3,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        3,
+        3,
+        &input_data,
+    );
+    // index 5: t0 interp 50, t1 interp 1.5, t2 constant 5.
+    assert_eq!(got, vec![50.0, 1.5, 5.0]);
+}
+
+/// A monotonic-x table fixture (reused across modes/indices).
+const LA_TABLE_A: &[(f64, f64)] = &[(0.0, 10.0), (1.0, 20.0), (2.5, 5.0), (4.0, 40.0)];
+/// Companion fixture with a repeated x knot: `x = 2.0` appears twice, with y
+/// values 8.0 and 12.0.
+const LA_TABLE_B: &[(f64, f64)] = &[(0.0, 0.0), (2.0, 8.0), (2.0, 12.0), (5.0, 50.0)];
+
+#[test]
+fn lookup_array_all_modes_over_domain_match_vm() {
+    // Two per-element tables, a 2-element input view (elem_off [0, 1]). For each
+    // mode, probe several indices spanning below/at/between/above the knots; each
+    // element's result must match the corresponding VM lookup over its table.
+    let tables = [LA_TABLE_A, LA_TABLE_B];
+    let input = dense_view(0, &[2]);
+    let input_data = [0.0, 0.0];
+    for mode in [
+        LookupMode::Interpolate,
+        LookupMode::Forward,
+        LookupMode::Backward,
+    ] {
+        for &index in &[-1.0, 0.0, 0.5, 1.0, 2.0, 2.001, 3.25, 4.0, 100.0] {
+            assert_lookup_array_matches(&input, 0, 2, mode, index, &tables, 2, &input_data);
+        }
+    }
+}
+
+#[test]
+fn lookup_array_out_of_range_element_offset_is_nan() {
+    // table_count = 2, but the input view has 3 elements -> elem_off [0, 1, 2].
+    // Element 2's offset (2) is >= table_count (2), so its result is NaN
+    // (matching the scalar Lookup bound), while elements 0 and 1 look up tables
+    // 0 and 1.
+    let tables = [LA_TABLE_A, LA_TABLE_B];
+    let input = dense_view(0, &[3]);
+    let input_data = [0.0, 0.0, 0.0];
+    assert_lookup_array_matches(
+        &input,
+        0,
+        2,
+        LookupMode::Interpolate,
+        1.0,
+        &tables,
+        3,
+        &input_data,
+    );
+    let got = run_lookup_array(
+        input,
+        0,
+        2,
+        LookupMode::Interpolate,
+        1.0,
+        &tables,
+        3,
+        3,
+        &input_data,
+    );
+    assert_eq!(got[0], 20.0); // t0 at index 1 (exact knot)
+    assert!(got[2].is_nan(), "element offset 2 >= table_count 2 -> NaN");
+}
+
+#[test]
+fn lookup_array_base_gf_offsets_into_directory() {
+    // base_gf selects a starting table; a 2-element view with base_gf=1 reads
+    // tables 1 and 2 (NOT 0 and 1). Three tables, table_count covers all three.
+    let t0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)];
+    let t1: &[(f64, f64)] = &[(0.0, 1.0), (10.0, 2.0)];
+    let t2: &[(f64, f64)] = &[(0.0, 7.0), (10.0, 7.0)];
+    let tables = [t0, t1, t2];
+    let input = dense_view(0, &[2]);
+    let input_data = [0.0, 0.0];
+    // base_gf=1, table_count=3 (the bound is on elem_off, not base_gf+elem_off,
+    // matching the VM): elem_off [0,1], tables base_gf+elem_off = [1, 2].
+    assert_lookup_array_matches(
+        &input,
+        1,
+        3,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        2,
+        &input_data,
+    );
+    let got = run_lookup_array(
+        input,
+        1,
+        3,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        2,
+        2,
+        &input_data,
+    );
+    // t1 interp at 5 -> 1.5; t2 constant 7.
+    assert_eq!(got, vec![1.5, 7.0]);
+}
+
+#[test]
+fn lookup_array_strided_view_offsets_match_vm() {
+    // A transposed (non-contiguous) input view exercises the per-element
+    // flat_offset projection for elem_off. dim_ids/strides differ from row-major,
+    // so a mis-addressed elem_off would pick the wrong table. Cross-checked vs the
+    // faithful oracle, which uses the same `flat_offset`.
+    let t0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)];
+    let t1: &[(f64, f64)] = &[(0.0, 1.0), (10.0, 2.0)];
+    let t2: &[(f64, f64)] = &[(0.0, 20.0), (10.0, 30.0)];
+    let t3: &[(f64, f64)] = &[(0.0, 5.0), (10.0, 5.0)];
+    let tables = [t0, t1, t2, t3];
+    // 2x2 transposed: dims [2,2], strides [1,2] -> elem_offs visited row-major
+    // are [0, 2, 1, 3].
+    let input = StaticArrayView {
+        base_off: 0,
+        is_temp: false,
+        dims: SmallVec::from_slice(&[2, 2]),
+        strides: SmallVec::from_slice(&[1, 2]),
+        offset: 0,
+        sparse: SmallVec::new(),
+        dim_ids: SmallVec::from_slice(&[0, 0]),
+    };
+    let input_data = [0.0, 0.0, 0.0, 0.0];
+    assert_lookup_array_matches(
+        &input,
+        0,
+        4,
+        LookupMode::Interpolate,
+        5.0,
+        &tables,
+        4,
+        &input_data,
+    );
+}
+
+#[test]
+fn lookup_array_invalid_view_fills_temp_with_nan() {
+    // A dynamically-subscripted-out-of-range input view -> the whole temp region
+    // is filled with NaN (the VM's `!is_valid -> fill_temp_nan`).
+    let t0: &[(f64, f64)] = &[(0.0, 0.0), (10.0, 100.0)];
+    let tables = [t0, t0, t0, t0];
+    let mut context = ByteCodeContext::default();
+    context.add_dim_list(2, [3, 4, 0, 0]); // mat[3][4]
+    context.set_temp_info(vec![0], 4);
+    let ctx = EmitCtx {
+        gf_directory_base: LA_GF_BASE,
+        gf_data_base: LA_GF_BASE,
+        temp_storage_base: TEMP_BASE,
+        ctx: &context,
+        ..ctx_with_cond_depth(0)
+    };
+    // mat[5, *]: row 5 out of range -> invalid 4-element row view.
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // index
+        Opcode::PushVarViewDirect {
+            base_off: 0,
+            dim_list_id: 0,
+        },
+        Opcode::LoadConstant { id: 1 }, // runtime row index (1-based)
+        Opcode::ViewSubscriptDynamic { dim_idx: 0 },
+        Opcode::LookupArray {
+            base_gf: 0,
+            table_count: 4,
+            mode: LookupMode::Interpolate,
+            write_temp_id: 0,
+        },
+        Opcode::PopView {},
+    ];
+    let slab: Vec<f64> = (0..12).map(|i| i as f64).collect();
+    let mut seed = seed_run(0, &slab);
+    seed.extend(seed_gf_tables(&tables));
+    let bytes = build_module(&bc(vec![5.0, 5.0], code), &ctx, false, 0);
+    let info = validate(&bytes).expect("module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    let mem = store
+        .instance_export(inst, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    store.mem_access_mut_slice(mem, |b| {
+        for &(addr, v) in &seed {
+            let a = addr as usize;
+            b[a..a + 8].copy_from_slice(&v.to_le_bytes());
+        }
+    });
+    let eval = store
+        .instance_export(inst, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+        .expect("invoke");
+    // The temp region is never seeded here, so an all-NaN readback can only
+    // come from the opcode's own writes.
+    let temps: Vec<f64> = store.mem_access_mut_slice(mem, |b| {
+        (0..4)
+            .map(|i| {
+                let a = TEMP_BASE as usize + i * 8;
+                f64::from_le_bytes(b[a..a + 8].try_into().unwrap())
+            })
+            .collect()
+    });
+    assert!(
+        temps.iter().all(|v| v.is_nan()),
+        "invalid input view must fill the LookupArray temp with NaN, got {temps:?}"
+    );
+}
+
+// ════════════════════════════════════════════════════════════════════════
+// Phase 6 Task 4: AllocateAvailable + AllocateByPriority (opcode lowering)
+//
+// These run the emitted opcode programs under DLR-FT and cross-check the
+// written temp region against the VM's own arm logic (`vm.rs:2631-2794`),
+// which gathers requests/profiles from the views and calls
+// `crate::alloc::allocate_available`. The oracle below reproduces that gather
+// (the `pp_cols`/defaults for AllocateAvailable, the rectangular-profile
+// synthesis for AllocateByPriority) and calls the same `allocate_available`,
+// so a passing test proves the wasm opcode == the VM opcode element-for-element.
+// The full `Vm::new(sim).run_to_end()` parity on a real model lives in
+// `module.rs`'s `compile_simulation_allocate_available_matches_vm`.
+// ════════════════════════════════════════════════════════════════════════
+
+/// Run `PushStaticView(requests); PushStaticView(profile); AllocateAvailable;
+/// PopView; PopView` over a `curr` slab seeded from `data`, writing temp 0, and
+/// read back `n` temp slots. The views are pushed requests-then-profile so
+/// `profile_view = top`, `requests_view = top-1` (matching the VM); `avail` is
+/// the single operand pushed beneath the opcode.
+fn run_allocate_available(
+    requests: StaticArrayView,
+    profile: StaticArrayView,
+    avail: f64,
+    data: &[f64],
+    n: usize,
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], n);
+    let requests_id = context.add_static_view(requests);
+    let profile_id = context.add_static_view(profile);
+    let program = vec![
+        Opcode::LoadConstant { id: 0 }, // avail
+        Opcode::PushStaticView {
+            view_id: requests_id,
+        },
+        Opcode::PushStaticView {
+            view_id: profile_id,
+        },
+        Opcode::AllocateAvailable { write_temp_id: 0 },
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    let constants = vec![avail];
+    run_and_read_temps(&context, program, constants, &seed_run(0, data), n)
+}
+
+/// Run `PushStaticView(requests); PushStaticView(priority); AllocateByPriority;
+/// PopView; PopView`. The operands are `width` (pushed first) then `supply`
+/// (pushed last, on top) -- matching the VM's `supply = pop`, `width = pop`.
+fn run_allocate_by_priority(
+    requests: StaticArrayView,
+    priority: StaticArrayView,
+    width: f64,
+    supply: f64,
+    data: &[f64],
+    n: usize,
+) -> Vec<f64> {
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], n);
+    let requests_id = context.add_static_view(requests);
+    let priority_id = context.add_static_view(priority);
+    // Constant 0 is `width`, constant 1 is `supply`; loading them in that order
+    // leaves `supply` on top of the operand stack.
+    let constants = vec![width, supply];
+    let program = vec![
+        Opcode::LoadConstant { id: 0 }, // width (pushed first)
+        Opcode::LoadConstant { id: 1 }, // supply (pushed second, on top)
+        Opcode::PushStaticView {
+            view_id: requests_id,
+        },
+        Opcode::PushStaticView {
+            view_id: priority_id,
+        },
+        Opcode::AllocateByPriority { write_temp_id: 0 },
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    run_and_read_temps(&context, program, constants, &seed_run(0, data), n)
+}
+
+/// The VM `AllocateAvailable` oracle (`vm.rs:2631-2721`): gather requests +
+/// the flattened profile array from the views, build the per-requester
+/// `(ptype, ppriority, pwidth, pextra)` tuples via the `pp_cols`/defaults logic,
+/// then call `crate::alloc::allocate_available`.
+fn vm_allocate_available_oracle(
+    requests_view: &StaticArrayView,
+    profile_view: &StaticArrayView,
+    avail: f64,
+    data: &[f64],
+) -> Vec<f64> {
+    let request_count = requests_view.to_runtime_view().size();
+    let requests: Vec<f64> = (0..request_count)
+        .map(|i| vm_view_element(requests_view, data, i))
+        .collect();
+    let n = requests.len();
+
+    let pp_size = profile_view.to_runtime_view().size();
+    let pp_values: Vec<f64> = (0..pp_size)
+        .map(|i| vm_view_element(profile_view, data, i))
+        .collect();
+    // Columns per requester: the exact quotient when the flattened profile
+    // divides evenly among the requesters, otherwise the 4-field default.
+    let divides_evenly = !pp_values.is_empty() && n > 0 && pp_size.is_multiple_of(n);
+    let pp_cols = if divides_evenly { pp_size / n } else { 4 };
+
+    // A field that falls past the end of `pp_values` takes its default.
+    let field = |row: usize, col: usize, default: f64| {
+        pp_values
+            .get(row * pp_cols + col)
+            .copied()
+            .unwrap_or(default)
+    };
+    let profiles: Vec<(f64, f64, f64, f64)> = (0..n)
+        .map(|row| {
+            (
+                field(row, 0, 0.0),
+                field(row, 1, 0.0),
+                field(row, 2, 1.0),
+                field(row, 3, 0.0),
+            )
+        })
+        .collect();
+    crate::alloc::allocate_available(&requests, &profiles, avail)
+}
+
+/// The VM `AllocateByPriority` oracle (`vm.rs:2723-2794`): gather requests +
+/// priorities, synthesize rectangular profiles `(1, priorities[i] or 0, width,
+/// 0)`, then call `crate::alloc::allocate_available` with `supply`.
+fn vm_allocate_by_priority_oracle(
+    requests_view: &StaticArrayView,
+    priority_view: &StaticArrayView,
+    width: f64,
+    supply: f64,
+    data: &[f64],
+) -> Vec<f64> {
+    let request_count = requests_view.to_runtime_view().size();
+    let requests: Vec<f64> = (0..request_count)
+        .map(|i| vm_view_element(requests_view, data, i))
+        .collect();
+    let priority_count = priority_view.to_runtime_view().size();
+    let priorities: Vec<f64> = (0..priority_count)
+        .map(|i| vm_view_element(priority_view, data, i))
+        .collect();
+    // One profile per request; a request with no matching priority gets 0.
+    let profiles: Vec<(f64, f64, f64, f64)> = (0..requests.len())
+        .map(|i| {
+            let priority = priorities.get(i).copied().unwrap_or(0.0);
+            (1.0, priority, width, 0.0)
+        })
+        .collect();
+    crate::alloc::allocate_available(&requests, &profiles, supply)
+}
+
+/// Assert the emitted `AllocateAvailable` matches the VM oracle: NaN must match
+/// NaN, and every other slot must agree within a tight tolerance (absolute or
+/// relative `1e-9`). The wasm helpers are bit-faithful ports, so the only drift
+/// is the leaf `exp`/`pow` approximations.
+fn assert_allocate_available_matches(
+    requests_view: &StaticArrayView,
+    profile_view: &StaticArrayView,
+    avail: f64,
+    data: &[f64],
+    n: usize,
+) {
+    let got = run_allocate_available(requests_view.clone(), profile_view.clone(), avail, data, n);
+    let want = vm_allocate_available_oracle(requests_view, profile_view, avail, data);
+    assert_eq!(got.len(), want.len());
+    for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() {
+        if w.is_nan() {
+            assert!(
+                g.is_nan(),
+                "allocate_available slot {i}: expected NaN, got {g}"
+            );
+        } else {
+            let diff = (g - w).abs();
+            let rel = if w != 0.0 { diff / w.abs() } else { diff };
+            // `g == w` first: two equal infinities give a NaN `diff`, which
+            // would otherwise fail both tolerance comparisons.
+            assert!(
+                g == w || diff <= 1e-9 || rel <= 1e-9,
+                "allocate_available slot {i}: got {g}, want {w} (diff {diff:.3e})"
+            );
+        }
+    }
+}
+
+/// Shared three-requester fixture: requests `[3, 2, 4]` in curr slots 0..3 and
+/// the flat `[3 requesters x 4 fields]` profile in slots 3..15, every row
+/// rectangular (ptype 1) with priorities 1, 2, 3. Returns `(requests view,
+/// profile view, slab data)`.
+fn rect_alloc_fixture() -> (StaticArrayView, StaticArrayView, Vec<f64>) {
+    let mut data = vec![3.0, 2.0, 4.0];
+    // Profile rows (region-major): (ptype, ppriority, pwidth, pextra).
+    data.extend_from_slice(&[1.0, 1.0, 1.0, 0.0]);
+    data.extend_from_slice(&[1.0, 2.0, 1.0, 0.0]);
+    data.extend_from_slice(&[1.0, 3.0, 1.0, 0.0]);
+    (dense_view(0, &[3]), dense_view(3, &[3, 4]), data)
+}
+
+#[test]
+fn allocate_available_full_grant_matches_vm() {
+    // avail >= total_demand: each requester gets request.max(0).
+    // total_demand = 3+2+4 = 9 < avail 100.
+    let (requests, profile, data) = rect_alloc_fixture();
+    assert_allocate_available_matches(&requests, &profile, 100.0, &data, 3);
+    // Full grant returns the requests verbatim.
+    let got = run_allocate_available(requests, profile, 100.0, &data, 3);
+    assert_eq!(got, vec![3.0, 2.0, 4.0]);
+}
+
+#[test]
+fn allocate_available_zeros_when_supply_nonpositive_matches_vm() {
+    // Zero supply is cross-checked against the oracle; negative supply is
+    // pinned to an explicit all-zero grant.
+    let (requests, profile, data) = rect_alloc_fixture();
+    assert_allocate_available_matches(&requests, &profile, 0.0, &data, 3);
+    let got = run_allocate_available(requests, profile, -5.0, &data, 3);
+    assert_eq!(got, vec![0.0, 0.0, 0.0]);
+}
+
+#[test]
+fn allocate_available_partial_bisection_rectangular_matches_vm() {
+    // 0 < avail < total_demand forces the bisection. Rectangular profiles.
+    let (requests, profile, data) = rect_alloc_fixture();
+    for avail in [1.0, 3.0, 5.0, 7.0, 8.5] {
+        assert_allocate_available_matches(&requests, &profile, avail, &data, 3);
+    }
+}
+
+#[test]
+fn allocate_available_partial_bisection_across_profile_types_matches_vm() {
+    // A mix of profile types (fixed/triangular/normal/exponential/CES), so the
+    // search-range `spread` per type and each curve at the converged price are
+    // exercised. 5 requesters x 4 profile fields.
+    let requests = dense_view(0, &[5]);
+    let profile = dense_view(5, &[5, 4]);
+    let mut data = vec![4.0, 3.0, 5.0, 2.0, 6.0];
+    data.extend_from_slice(&[0.0, 2.0, 1.0, 0.0]); // fixed
+    data.extend_from_slice(&[2.0, 3.0, 1.5, 0.0]); // triangular
+    data.extend_from_slice(&[3.0, 2.5, 1.0, 0.0]); // normal
+    data.extend_from_slice(&[4.0, 2.0, 1.2, 0.0]); // exponential
+    data.extend_from_slice(&[5.0, 3.0, 1.0, 2.0]); // CES
+    for avail in [2.0, 6.0, 10.0, 15.0, 19.0] {
+        assert_allocate_available_matches(&requests, &profile, avail, &data, 5);
+    }
+}
+
+#[test]
+fn allocate_available_pp_cols_defaults_when_not_divisible_matches_vm() {
+    // When pp_size is not a multiple of n, pp_cols falls back to 4 and any
+    // profile field past the end of the flattened profile takes its default
+    // (0,0,1,0). Here n=3 but the profile view is 1-D of size 5 (not a multiple
+    // of 3), so pp_cols=4: per `vm_allocate_available_oracle`, requester 0 reads
+    // fields 0..4 (all in range), requester 1 reads one in-range field (index
+    // 4) and defaults for the rest, and requester 2 (base 8) is entirely past
+    // the end -> an all-default profile.
+    let requests = dense_view(0, &[3]);
+    let profile = dense_view(3, &[5]);
+    let data = vec![3.0, 2.0, 4.0, 9.0, 9.0, 9.0, 9.0, 9.0];
+    for avail in [0.5, 4.0, 100.0] {
+        assert_allocate_available_matches(&requests, &profile, avail, &data, 3);
+    }
+}
+
+#[test]
+fn allocate_available_invalid_view_fills_temp_with_nan() {
+    // A dynamically-subscripted requests view made invalid at runtime (row index
+    // out of bounds) takes the VM's `fill_temp_nan` short-circuit. Build the
+    // requests view via PushVarViewDirect + an out-of-bounds ViewSubscriptDynamic.
+    let mut context = ByteCodeContext::default();
+    context.set_temp_info(vec![0], 3);
+    context.add_dim_list(2, [3, 3, 0, 0]); // a [3,3] base for the dynamic subscript
+    let prof_id = context.add_static_view(dense_view(20, &[3, 4]));
+    let ctx = ctx_with_arrays(&context);
+    let code = vec![
+        Opcode::LoadConstant { id: 0 }, // avail
+        // requests view: PushVarViewDirect over a [3,3] base, then subscript row
+        // 9 (out of bounds) -> invalid view.
+        Opcode::PushVarViewDirect {
+            base_off: 0,
+            dim_list_id: 0,
+        },
+        Opcode::LoadConstant { id: 1 }, // runtime row index (1-based, OOB)
+        Opcode::ViewSubscriptDynamic { dim_idx: 0 },
+        Opcode::PushStaticView { view_id: prof_id },
+        Opcode::AllocateAvailable { write_temp_id: 0 },
+        Opcode::PopView {},
+        Opcode::PopView {},
+    ];
+    let bytes = build_module(&bc(vec![5.0, 9.0], code), &ctx, false, 0);
+    let info = validate(&bytes).expect("module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate")
+        .module_addr;
+    // No memory is seeded before `eval`: the program only has to reach the
+    // invalid-view path, and the temp region is read back afterwards.
+    let eval = store
+        .instance_export(inst, "eval")
+        .unwrap()
+        .as_func()
+        .unwrap();
+    store
+        .invoke_simple_typed::<(i32,), ()>(eval, (0_i32,))
+        .expect("invoke");
+    let mem = store
+        .instance_export(inst, "mem")
+        .unwrap()
+        .as_mem()
+        .unwrap();
+    let temps: Vec<f64> = store.mem_access_mut_slice(mem, |b| {
+        (0..3)
+            .map(|i| {
+                let a = TEMP_BASE as usize + i * 8;
+                f64::from_le_bytes(b[a..a + 8].try_into().unwrap())
+            })
+            .collect()
+    });
+    assert!(
+        temps.iter().all(|v| v.is_nan()),
+        "invalid input view must fill the AllocateAvailable temp with NaN, got {temps:?}"
+    );
+}
+
+#[test]
+fn allocate_by_priority_full_grant_matches_vm() {
+    // avail >= total_demand: full grant. requests in slots 0..3, priorities in
+    // slots 3..6. width=1, supply=100, total_demand=9.
+    let requests = dense_view(0, &[3]);
+    let priority = dense_view(3, &[3]);
+    let data = vec![3.0, 2.0, 4.0, 1.0, 2.0, 3.0];
+    assert_allocate_by_priority_matches(&requests, &priority, 1.0, 100.0, &data, 3);
+    let got = run_allocate_by_priority(requests, priority, 1.0, 100.0, &data, 3);
+    assert_eq!(got, vec![3.0, 2.0, 4.0]);
+}
+
+#[test]
+fn allocate_by_priority_zeros_when_supply_nonpositive_matches_vm() {
+    // supply = 0 is cross-checked against the oracle only.
+    let requests = dense_view(0, &[3]);
+    let priority = dense_view(3, &[3]);
+    let data = vec![3.0, 2.0, 4.0, 1.0, 2.0, 3.0];
+    assert_allocate_by_priority_matches(&requests, &priority, 1.0, 0.0, &data, 3);
+}
+
+#[test]
+fn allocate_by_priority_partial_bisection_matches_vm() {
+    // 0 < supply < total_demand forces the bisection over the synthesized
+    // rectangular (ptype 1) profiles. Sweep several partial supplies and widths.
+    let requests = dense_view(0, &[3]);
+    let priority = dense_view(3, &[3]);
+    let data = vec![3.0, 2.0, 4.0, 1.0, 2.0, 3.0];
+    for &(width, supply) in &[(1.0, 1.0), (1.0, 5.0), (2.0, 4.0), (0.5, 7.0), (3.0, 8.5)] {
+        assert_allocate_by_priority_matches(&requests, &priority, width, supply, &data, 3);
+    }
+}
+
+/// Assert the emitted `AllocateByPriority` matches the VM oracle.
+fn assert_allocate_by_priority_matches(
+    requests_view: &StaticArrayView,
+    priority_view: &StaticArrayView,
+    width: f64,
+    supply: f64,
+    data: &[f64],
+    n: usize,
+) {
+    let got = run_allocate_by_priority(
+        requests_view.clone(),
+        priority_view.clone(),
+        width,
+        supply,
+        data,
+        n,
+    );
+    let want = vm_allocate_by_priority_oracle(requests_view, priority_view, width, supply, data);
+    assert_eq!(got.len(), want.len());
+    for (i, (&g, &w)) in got.iter().zip(want.iter()).enumerate() {
+        if w.is_nan() {
+            // NaN never compares equal, so it is matched by class.
+            assert!(
+                g.is_nan(),
+                "allocate_by_priority slot {i}: expected NaN, got {g}"
+            );
+        } else {
+            let diff = (g - w).abs();
+            let rel = if w != 0.0 { diff / w.abs() } else { diff };
+            // `g == w` first: two equal infinities give a NaN `diff`, which
+            // would otherwise fail both tolerance comparisons.
+            assert!(
+                g == w || diff <= 1e-9 || rel <= 1e-9,
+                "allocate_by_priority slot {i}: got {g}, want {w} (diff {diff:.3e})"
+            );
+        }
+    }
+}
diff --git a/src/simlin-engine/src/wasmgen/math.rs b/src/simlin-engine/src/wasmgen/math.rs
new file mode 100644
index 000000000..ff42f8403
--- /dev/null
+++ b/src/simlin-engine/src/wasmgen/math.rs
@@ -0,0 +1,1319 @@
+// Copyright 2026 The Simlin Authors. All rights reserved.
+// Use of this source code is governed by the Apache License,
+// Version 2.0, that can be found in the LICENSE file.
+
+// pattern: Functional Core
+// Pure transformation: each public function emits a self-contained wasm helper
+// `Function` (instruction sequence) for one transcendental. No I/O; the only
+// side effect is in `#[cfg(test)]`, which executes the emitted helpers under the
+// DLR-FT interpreter and compares against Rust `f64`.
+
+//! Open-coded transcendental helpers for the wasm simulation backend.
+//!
+//! WebAssembly's MVP numeric instruction set provides `f64.sqrt`/`abs`/`floor`/
+//! `ceil`/`trunc`/`nearest`/`min`/`max` and the arithmetic/compare ops, but *no*
+//! transcendental instructions (`sin`/`cos`/`exp`/`ln`/...). The bytecode VM
+//! reaches those through libm (`f64::sin` etc., `vm.rs::apply`). To stay a
+//! self-contained module that imports no host math, this backend emits one wasm
+//! helper function per transcendental, each built from range reduction plus a
+//! polynomial/rational kernel over only the natively-available ops (plus
+//! `i64.reinterpret_f64`/`f64.reinterpret_i64` for the exponent/mantissa bit
+//! tricks `exp`/`ln` need).
+//!
+//! ## Accuracy bar
+//!
+//! These need not be bit-identical to libm. The bar is the `simulate.rs`
+//! corpus tolerances (abs `2e-3` / rel `5e-6`, VDF `1%`): a model run through
+//! this backend must clear the same comparison the VM clears. The kernels here
+//! are chosen so each helper's worst-case error over its domain sits *far*
+//! inside that bar (each emitter's rustdoc records the measured worst-case error
+//! and the test that pins it); the slack absorbs any DLR-FT-vs-native rounding
+//! drift. The per-helper unit tests assert against Rust `f64` with a documented
+//! tolerance comfortably tighter than the corpus bar.
+//!
+//! ## Composition
+//!
+//! `tan = sin/cos`, `log10 = ln * (1/ln10)`, `asin = atan(x/sqrt(1-x^2))`,
+//! `acos = pi/2 - asin`, and `pow(x, y) = exp(y * ln x)`. `pow` therefore
+//! matches `f64::powf` only for a positive base; a negative base diverges
+//! (`ln` of a negative is NaN). That is a documented limitation -- no corpus
+//! model raises a negative base to a power -- so it is not chased here.
+//!
+//! ## Wiring
+//!
+//! Each emitter is pushed once by [`super::lower::build_helpers`], which records
+//! the resulting function index in [`super::lower::HelperFns`]; the `Apply`
+//! lowering (`lower.rs`, Phase 2 Task 4) and `Op2::Exp` (Task 3) reference a
+//! helper by that index via `call`. No index is hard-coded.
+
+use wasm_encoder::{Function, Instruction as Ins, ValType};
+
+use super::lower::f64_const;
+
+// ── Shared numeric constants (the kernels' magic numbers) ──────────────────
+
+/// `ln(2)` (the exp/ln exponent <-> natural-log conversion).
+const LN2: f64 = std::f64::consts::LN_2;
+/// `1/ln(2) = log2(e)` (scales `x` to a base-2 exponent count in `exp`).
+const LOG2E: f64 = std::f64::consts::LOG2_E;
+/// `2/pi` (scales `x` to a count of `pi/2` quadrants in `sin`/`cos`).
+const FRAC_2_PI: f64 = std::f64::consts::FRAC_2_PI;
+/// `1/ln(10)` (converts a natural log to a base-10 log).
+const INV_LN10: f64 = 1.0 / std::f64::consts::LN_10;
+
+// IEEE-754 binary64 field geometry, used by the exp/ln bit tricks.
+const EXP_BIAS: i64 = 1023;
+const EXP_MASK: i64 = 0x7ff; // 11 exponent bits
+const MANTISSA_BITS: i64 = 52;
+const MANTISSA_MASK: i64 = 0x000f_ffff_ffff_ffff;
+/// The exponent field of `1.0` (bias), pre-shifted into place: makes a raw
+/// mantissa into a value in `[1, 2)`.
+const ONE_EXP_FIELD: i64 = EXP_BIAS << MANTISSA_BITS;
+
+// `exp` overflow/underflow thresholds (matching `f64::exp`): just past these,
+// `exp(x)` rounds to `+inf` / `0`. Guarding here keeps the `2^k` exponent
+// assembly inside the representable exponent range.
+const EXP_OVERFLOW: f64 = 709.782_712_893_384;
+const EXP_UNDERFLOW: f64 = -745.133_219_101_941_2;
+
+// Cody-Waite three-part split of `pi/2` (the canonical fdlibm constants, each
+// exactly representable in f64; `PIO2_1`'s low mantissa bits are zero so
+// `x - k*PIO2_1` is exact). This keeps `r = x - k*(pi/2)` full-precision for
+// `|k|` up to ~2^20 (sin/cos argument up to ~1e6).
+// NOTE(review): `pi/2 - PIO2_1` is about `7.5e-8` (that is `PIO2_2`), so the
+// head below carries roughly 24 significant bits, not "~33"; the values look
+// like twice the Cephes `DP1`/`DP2` split of `pi/4` rather than fdlibm's
+// `pio2_1` -- confirm the attribution and the bit count.
+const PIO2_1: f64 = 1.570_796_251_296_997; // pi/2, high ~33 bits
+const PIO2_2: f64 = 7.549_789_415_861_596e-8; // next chunk
+const PIO2_3: f64 = 5.390_302_529_957_765e-15; // remaining chunk
+
+// atan reduction constants.
+const SQRT3: f64 = 1.732_050_807_568_877_2;
+const TAN_PI_12: f64 = 0.267_949_192_431_122_7; // 2 - sqrt(3) = tan(pi/12)
+
+// ── Horner polynomial evaluation ────────────────────────────────────────────
+
+/// Emit a Horner evaluation of `sum(coeffs[i] * v^i)` where `v` is the f64 in
+/// `var_local`. Coefficients are given low-order-first; the emitter folds them
+/// high-order-first (`acc = acc*v + c`), leaving the result on the stack.
+///
+/// `v` must already be materialized in `var_local` (a plain f64 local) because
+/// Horner reads it once per term and the wasm operand stack is strict LIFO.
+///
+/// Shared with `super::alloc` (the `erfc_approx` Abramowitz-Stegun polynomial
+/// folds with the identical `acc = acc*v + c` order, so reusing this keeps the
+/// emitted op sequence bit-faithful to the Rust reference).
+///
+/// # Panics
+///
+/// Panics if `coeffs` is empty.
+pub(crate) fn emit_horner(f: &mut Function, var_local: u32, coeffs: &[f64]) {
+    // Peel off the highest-order coefficient: it seeds the accumulator.
+    let (&leading, lower_order) = coeffs
+        .split_last()
+        .expect("polynomial needs at least one coefficient");
+    f.instruction(&f64_const(leading));
+    // Fold the remaining coefficients from high order down to the constant term.
+    for &coeff in lower_order.iter().rev() {
+        // acc = acc * v + coeff
+        f.instruction(&Ins::LocalGet(var_local))
+            .instruction(&Ins::F64Mul)
+            .instruction(&f64_const(coeff))
+            .instruction(&Ins::F64Add);
+    }
+}
+
+// ── exp ─────────────────────────────────────────────────────────────────────
+
+// `exp` local layout. Param 0 is `x`; the rest are scratch.
+const EXP_X: u32 = 0;
+const EXP_K: u32 = 1; // f64 k = round(x * log2e)
+const EXP_R: u32 = 2; // f64 reduced argument r = x - k*ln2
+const EXP_KI: u32 = 3; // i64 k as integer (the power of two to apply)
+
+/// Taylor coefficients of `exp(r)` (`1/n!`, n = 0..=11). On `|r| <= ln2/2 ~=
+/// 0.347` the degree-11 truncation is ~5e-15 relative -- far inside the bar.
+const EXP_COEFFS: [f64; 12] = [
+    1.0,
+    1.0,
+    1.0 / 2.0,
+    1.0 / 6.0,
+    1.0 / 24.0,
+    1.0 / 120.0,
+    1.0 / 720.0,
+    1.0 / 5040.0,
+    1.0 / 40320.0,
+    1.0 / 362880.0,
+    1.0 / 3628800.0,
+    1.0 / 39916800.0,
+];
+
+/// Emit `exp(x: f64) -> f64`.
+///
+/// Range reduction `x = k*ln2 + r`, `|r| <= ln2/2`, then `exp(x) = 2^k *
+/// exp(r)`: `exp(r)` is the Taylor poly ([`EXP_COEFFS`]), and `2^k` is applied
+/// by adding `k` to the result's IEEE exponent field (`f64.reinterpret_i64`).
+/// Guards: `NaN -> NaN`, `x > EXP_OVERFLOW -> +inf`, `x < EXP_UNDERFLOW -> 0`. +/// Because the post-guard `exp(r)` is always a normal number in `[0.70, 1.42]` +/// (exponent field `EXP_BIAS-1` or `EXP_BIAS`) and `k` is bounded by the +/// guards, the exponent-assembly path needs no subnormal special-case; an +/// out-of-range assembled exponent still saturates to `+inf`/`0` to be safe. +/// +/// Worst-case error vs `f64::exp` over `[-700, 700]`: rel `~8e-14`. Pinned by +/// `exp_matches_f64`. +pub(crate) fn emit_exp() -> Function { + // Locals (param 0 = x): f64 EXP_K(1)/EXP_R(2), i64 EXP_KI(3), then the + // `emit_ldexp_exp_field` scratch f64 LDEXP_VAL(4) + i64 LDEXP_BITS(5)/ + // LDEXP_NEWEXP(6). Declaration order fixes these indices. + let mut f = Function::new([ + (2, ValType::F64), + (1, ValType::I64), + (1, ValType::F64), + (2, ValType::I64), + ]); + + // NaN guard: x != x. If NaN, return x (which is NaN). + f.instruction(&Ins::LocalGet(EXP_X)); + f.instruction(&Ins::LocalGet(EXP_X)); + f.instruction(&Ins::F64Ne); + f.instruction(&Ins::If(wasm_encoder::BlockType::Empty)); + f.instruction(&Ins::LocalGet(EXP_X)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // Overflow guard: x > EXP_OVERFLOW -> +inf. + f.instruction(&Ins::LocalGet(EXP_X)); + f.instruction(&f64_const(EXP_OVERFLOW)); + f.instruction(&Ins::F64Gt); + f.instruction(&Ins::If(wasm_encoder::BlockType::Empty)); + f.instruction(&f64_const(f64::INFINITY)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // Underflow guard: x < EXP_UNDERFLOW -> 0. 
+ f.instruction(&Ins::LocalGet(EXP_X)); + f.instruction(&f64_const(EXP_UNDERFLOW)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::If(wasm_encoder::BlockType::Empty)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // k = nearest(x * log2e) + f.instruction(&Ins::LocalGet(EXP_X)); + f.instruction(&f64_const(LOG2E)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Nearest); + f.instruction(&Ins::LocalTee(EXP_K)); + // ki = trunc(k) as i64 (k is integer-valued; saturating is safe). + f.instruction(&Ins::I64TruncSatF64S); + f.instruction(&Ins::LocalSet(EXP_KI)); + + // r = x - k*ln2 + f.instruction(&Ins::LocalGet(EXP_X)); + f.instruction(&Ins::LocalGet(EXP_K)); + f.instruction(&f64_const(LN2)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::LocalSet(EXP_R)); + + // poly = exp(r) via Horner; leaves exp(r) on the stack. + emit_horner(&mut f, EXP_R, &EXP_COEFFS); + + // Apply 2^k by adding ki to exp(r)'s exponent field. + // bits = reinterpret(exp(r)); exp_field = (bits >> 52) & 0x7ff; + // new_exp = exp_field + ki. + emit_ldexp_exp_field(&mut f, EXP_KI); + + f.instruction(&Ins::End); + f +} + +// `emit_ldexp_exp_field` scratch (declared at the END of exp's locals so it does +// not collide with EXP_X/K/R/KI). The f64 `exp(r)` value is consumed off the +// stack into a fresh local. +const LDEXP_VAL: u32 = 4; // f64 exp(r) +const LDEXP_BITS: u32 = 5; // i64 raw bits of exp(r) +const LDEXP_NEWEXP: u32 = 6; // i64 candidate new exponent field + +/// Consume the f64 on the stack (a *normal* value `e`, here always `exp(r) in +/// [0.70, 1.42]`) and push `e * 2^ki`, by adding `ki` (in `ki_local`) to `e`'s +/// IEEE exponent field. If the resulting exponent field is `>= EXP_MASK` push +/// `+inf` (e is positive here); if `<= 0` push `0`. 
Both saturations are +/// defensive: the `exp` over/underflow guards already bound `ki` so the in-range +/// branch is the one taken across the supported domain. +/// +/// Requires three scratch locals declared by the caller: a f64 (`LDEXP_VAL`) +/// and two i64 (`LDEXP_BITS`, `LDEXP_NEWEXP`). +fn emit_ldexp_exp_field(f: &mut Function, ki_local: u32) { + f.instruction(&Ins::LocalSet(LDEXP_VAL)); + + // bits = reinterpret(val) + f.instruction(&Ins::LocalGet(LDEXP_VAL)); + f.instruction(&Ins::I64ReinterpretF64); + f.instruction(&Ins::LocalSet(LDEXP_BITS)); + + // new_exp = ((bits >> 52) & 0x7ff) + ki + f.instruction(&Ins::LocalGet(LDEXP_BITS)); + f.instruction(&Ins::I64Const(MANTISSA_BITS)); + f.instruction(&Ins::I64ShrU); + f.instruction(&Ins::I64Const(EXP_MASK)); + f.instruction(&Ins::I64And); + f.instruction(&Ins::LocalGet(ki_local)); + f.instruction(&Ins::I64Add); + f.instruction(&Ins::LocalSet(LDEXP_NEWEXP)); + + // if new_exp >= 0x7ff -> +inf + f.instruction(&Ins::LocalGet(LDEXP_NEWEXP)); + f.instruction(&Ins::I64Const(EXP_MASK)); + f.instruction(&Ins::I64GeS); + f.instruction(&Ins::If(wasm_encoder::BlockType::Result(ValType::F64))); + f.instruction(&f64_const(f64::INFINITY)); + f.instruction(&Ins::Else); + // if new_exp <= 0 -> 0 + f.instruction(&Ins::LocalGet(LDEXP_NEWEXP)); + f.instruction(&Ins::I64Const(0)); + f.instruction(&Ins::I64LeS); + f.instruction(&Ins::If(wasm_encoder::BlockType::Result(ValType::F64))); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::Else); + // in range: rebuild bits with the new exponent field. 
+ // new_bits = (bits & ~(0x7ff << 52)) | (new_exp << 52) + f.instruction(&Ins::LocalGet(LDEXP_BITS)); + f.instruction(&Ins::I64Const(!(EXP_MASK << MANTISSA_BITS))); + f.instruction(&Ins::I64And); + f.instruction(&Ins::LocalGet(LDEXP_NEWEXP)); + f.instruction(&Ins::I64Const(MANTISSA_BITS)); + f.instruction(&Ins::I64Shl); + f.instruction(&Ins::I64Or); + f.instruction(&Ins::F64ReinterpretI64); + f.instruction(&Ins::End); // end inner if + f.instruction(&Ins::End); // end outer if +} + +// ── ln ───────────────────────────────────────────────────────────────────── + +// `ln` local layout. Param 0 is `x`. +const LN_X: u32 = 0; +const LN_E: u32 = 1; // f64 exponent (after centering) +const LN_M: u32 = 2; // f64 mantissa in [sqrt(2)/2, sqrt(2)) +const LN_S: u32 = 3; // f64 s = (m-1)/(m+1) +const LN_S2: u32 = 4; // f64 s^2 +const LN_BITS: u32 = 5; // i64 raw bits of x + +/// atanh-series coefficients `1/(2k+1)`, k = 0..=6, in `s^2`. On `|s| <= 0.1716` +/// (`m in [sqrt(2)/2, sqrt(2))`) the degree-13 truncation is ~1.3e-12 relative. +const LN_COEFFS: [f64; 7] = [ + 1.0, + 1.0 / 3.0, + 1.0 / 5.0, + 1.0 / 7.0, + 1.0 / 9.0, + 1.0 / 11.0, + 1.0 / 13.0, +]; + +/// Emit `ln(x: f64) -> f64`. +/// +/// Decompose `x = m * 2^e` with `m in [1, 2)` by reading the IEEE exponent and +/// mantissa fields; center `m` to `[sqrt(2)/2, sqrt(2))` (halve `m` and bump +/// `e` when `m > sqrt(2)`) so the atanh series in `s = (m-1)/(m+1)` converges +/// fast; `ln(x) = e*ln2 + 2*(s + s^3/3 + ...)`. Guards: `NaN or x < 0 -> NaN`, +/// `x == 0 -> -inf`, `+inf -> +inf`. Subnormal `x` (exponent field 0) is +/// normalized by scaling with `2^54` and subtracting 54 from `e`. +/// +/// Worst-case error vs `f64::ln` over `[1e-10, 1e10]`: abs `~5e-13`. Pinned by +/// `ln_matches_f64`. +pub(crate) fn emit_ln() -> Function { + // Locals (param 0 = x): f64 LN_E(1)/LN_M(2)/LN_S(3)/LN_S2(4), i64 LN_BITS(5).
+ let mut f = Function::new([(4, ValType::F64), (1, ValType::I64)]); + + // NaN-or-negative guard, emitted as `(x < 0) | (x != x)` -> return NaN. + // `x < 0` catches negative inputs; `x != x` is true only for NaN. + // `-0.0` passes both tests and is handled by the `x == 0` guard below. + f.instruction(&Ins::LocalGet(LN_X)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::LocalGet(LN_X)); + f.instruction(&Ins::LocalGet(LN_X)); + f.instruction(&Ins::F64Ne); + f.instruction(&Ins::I32Or); + f.instruction(&Ins::If(wasm_encoder::BlockType::Empty)); + f.instruction(&f64_const(f64::NAN)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // x == 0 -> -inf. + f.instruction(&Ins::LocalGet(LN_X)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Eq); + f.instruction(&Ins::If(wasm_encoder::BlockType::Empty)); + f.instruction(&f64_const(f64::NEG_INFINITY)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // +inf -> +inf. + f.instruction(&Ins::LocalGet(LN_X)); + f.instruction(&f64_const(f64::INFINITY)); + f.instruction(&Ins::F64Eq); + f.instruction(&Ins::If(wasm_encoder::BlockType::Empty)); + f.instruction(&f64_const(f64::INFINITY)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // Decompose. Handle subnormal (exponent field == 0) by scaling up first. + // if ((reinterpret(x) >> 52) & 0x7ff) == 0 { x *= 2^54; e_adjust = -54 } + // We fold the adjust into LN_E after extracting the (now-normal) fields. + f.instruction(&Ins::LocalGet(LN_X)); + f.instruction(&Ins::I64ReinterpretF64); + f.instruction(&Ins::I64Const(MANTISSA_BITS)); + f.instruction(&Ins::I64ShrU); + f.instruction(&Ins::I64Const(EXP_MASK)); + f.instruction(&Ins::I64And); + f.instruction(&Ins::I64Eqz); // exponent field == 0 (subnormal/zero; zero already handled) + f.instruction(&Ins::If(wasm_encoder::BlockType::Result(ValType::F64))); + // subnormal: x_scaled = x * 2^54, and remember -54 in LN_E.
+ f.instruction(&Ins::LocalGet(LN_X)); + f.instruction(&f64_const(f64::from_bits(((EXP_BIAS + 54) as u64) << 52))); + f.instruction(&Ins::F64Mul); + f.instruction(&f64_const(-54.0)); + f.instruction(&Ins::LocalSet(LN_E)); + f.instruction(&Ins::Else); + // normal: x unchanged, e adjust 0. + f.instruction(&Ins::LocalGet(LN_X)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::LocalSet(LN_E)); + f.instruction(&Ins::End); + // stack: [x_norm]. bits = reinterpret(x_norm). + f.instruction(&Ins::I64ReinterpretF64); + f.instruction(&Ins::LocalSet(LN_BITS)); + + // m = mantissa-with-exponent-of-1.0 (value in [1,2)). + f.instruction(&Ins::LocalGet(LN_BITS)); + f.instruction(&Ins::I64Const(MANTISSA_MASK)); + f.instruction(&Ins::I64And); + f.instruction(&Ins::I64Const(ONE_EXP_FIELD)); + f.instruction(&Ins::I64Or); + f.instruction(&Ins::F64ReinterpretI64); + f.instruction(&Ins::LocalSet(LN_M)); + + // e += (exponent_field - bias). + f.instruction(&Ins::LocalGet(LN_E)); + f.instruction(&Ins::LocalGet(LN_BITS)); + f.instruction(&Ins::I64Const(MANTISSA_BITS)); + f.instruction(&Ins::I64ShrU); + f.instruction(&Ins::I64Const(EXP_MASK)); + f.instruction(&Ins::I64And); + f.instruction(&Ins::I64Const(EXP_BIAS)); + f.instruction(&Ins::I64Sub); + f.instruction(&Ins::F64ConvertI64S); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::LocalSet(LN_E)); + + // Center: if m > sqrt(2) { m *= 0.5; e += 1 }. + f.instruction(&Ins::LocalGet(LN_M)); + f.instruction(&f64_const(std::f64::consts::SQRT_2)); + f.instruction(&Ins::F64Gt); + f.instruction(&Ins::If(wasm_encoder::BlockType::Empty)); + f.instruction(&Ins::LocalGet(LN_M)); + f.instruction(&f64_const(0.5)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::LocalSet(LN_M)); + f.instruction(&Ins::LocalGet(LN_E)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::LocalSet(LN_E)); + f.instruction(&Ins::End); + + // s = (m - 1) / (m + 1); s2 = s*s. 
+ f.instruction(&Ins::LocalGet(LN_M)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::LocalGet(LN_M)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::LocalTee(LN_S)); + f.instruction(&Ins::LocalGet(LN_S)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::LocalSet(LN_S2)); + + // ln(m) = 2 * s * poly(s2); result = e*ln2 + ln(m). + f.instruction(&Ins::LocalGet(LN_E)); + f.instruction(&f64_const(LN2)); + f.instruction(&Ins::F64Mul); + f.instruction(&f64_const(2.0)); + f.instruction(&Ins::LocalGet(LN_S)); + f.instruction(&Ins::F64Mul); + emit_horner(&mut f, LN_S2, &LN_COEFFS); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Add); + + f.instruction(&Ins::End); + f +} + +// ── sin / cos (shared kernel) ──────────────────────────────────────────────── + +// sin/cos local layout. Param 0 is `x`. +const SC_X: u32 = 0; +const SC_K: u32 = 1; // f64 quadrant count +const SC_R: u32 = 2; // f64 reduced argument in [-pi/4, pi/4] +const SC_R2: u32 = 3; // f64 r^2 +const SC_SR: u32 = 4; // f64 sin(r) +const SC_CR: u32 = 5; // f64 cos(r) +const SC_KQ: u32 = 6; // i64 quadrant index k mod 4 + +/// `sin(r)/r` Taylor coefficients in `r^2` (so the series is `r * poly(r^2)`): +/// `(-1)^n / (2n+1)!`, n = 0..=5 (through `r^11`). +const SIN_COEFFS: [f64; 6] = [ + 1.0, + -1.0 / 6.0, + 1.0 / 120.0, + -1.0 / 5040.0, + 1.0 / 362880.0, + -1.0 / 39916800.0, +]; + +/// `cos(r)` Taylor coefficients in `r^2`: `(-1)^n / (2n)!`, n = 0..=5 (through +/// `r^10`). +const COS_COEFFS: [f64; 6] = [ + 1.0, + -1.0 / 2.0, + 1.0 / 24.0, + -1.0 / 720.0, + 1.0 / 40320.0, + -1.0 / 3628800.0, +]; + +/// Emit the shared sin/cos body. `want_sin` selects which result the function +/// returns; both `sin(r)` and `cos(r)` are computed (cheap) and the quadrant +/// `k mod 4` selects/sign-flips the right one, exactly mirroring the kernel +/// the prototype validated. 
+fn emit_sincos(want_sin: bool) -> Function { + // Locals (param 0 = x): f64 SC_K(1)/SC_R(2)/SC_R2(3)/SC_SR(4)/SC_CR(5), + // i64 SC_KQ(6). + let mut f = Function::new([(5, ValType::F64), (1, ValType::I64)]); + + // NaN/inf guard: if !(|x| < +inf) return NaN. (|x| < inf is false for NaN + // and for +-inf.) + f.instruction(&Ins::LocalGet(SC_X)); + f.instruction(&Ins::F64Abs); + f.instruction(&f64_const(f64::INFINITY)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::If(wasm_encoder::BlockType::Empty)); + f.instruction(&f64_const(f64::NAN)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // k = nearest(x * 2/pi); kq = k mod 4 (normalized to 0..=3). + f.instruction(&Ins::LocalGet(SC_X)); + f.instruction(&f64_const(FRAC_2_PI)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Nearest); + f.instruction(&Ins::LocalTee(SC_K)); + // kq = ((k as i64) % 4 + 4) % 4 + f.instruction(&Ins::I64TruncSatF64S); + f.instruction(&Ins::I64Const(4)); + f.instruction(&Ins::I64RemS); + f.instruction(&Ins::I64Const(4)); + f.instruction(&Ins::I64Add); + f.instruction(&Ins::I64Const(4)); + f.instruction(&Ins::I64RemS); + f.instruction(&Ins::LocalSet(SC_KQ)); + + // r = ((x - k*PIO2_1) - k*PIO2_2) - k*PIO2_3. 
+ f.instruction(&Ins::LocalGet(SC_X)); + f.instruction(&Ins::LocalGet(SC_K)); + f.instruction(&f64_const(PIO2_1)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::LocalGet(SC_K)); + f.instruction(&f64_const(PIO2_2)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::LocalGet(SC_K)); + f.instruction(&f64_const(PIO2_3)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::LocalTee(SC_R)); + // r2 = r*r + f.instruction(&Ins::LocalGet(SC_R)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::LocalSet(SC_R2)); + + // sr = r * poly_sin(r2) + f.instruction(&Ins::LocalGet(SC_R)); + emit_horner(&mut f, SC_R2, &SIN_COEFFS); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::LocalSet(SC_SR)); + // cr = poly_cos(r2) + emit_horner(&mut f, SC_R2, &COS_COEFFS); + f.instruction(&Ins::LocalSet(SC_CR)); + + // Quadrant select. For sin: kq 0->sr, 1->cr, 2->-sr, 3->-cr. + // For cos: kq 0->cr, 1->-sr, 2->-cr, 3->sr. + // Emit a 4-way nested select keyed on kq. + emit_quadrant_select(&mut f, want_sin); + + f.instruction(&Ins::End); + f +} + +/// Push the quadrant-selected result for sin (`want_sin`) or cos. Reads +/// `SC_SR`/`SC_CR`/`SC_KQ`. Implemented as three chained `select`s, keyed on +/// `kq != n`, avoiding branches. +/// +/// wasm `select` pops `[a, b, cond]` and yields the *deeper* operand `a` when +/// `cond != 0`, else the shallower `b`. The running result (the default for +/// `kq == 0`, refined by earlier iterations) is already on the stack as the +/// deeper operand; pushing the override `q_n` above it and selecting on +/// `kq != n` keeps the running value when `kq != n` and switches to `q_n` +/// otherwise. +fn emit_quadrant_select(f: &mut Function, want_sin: bool) { + // The four results per quadrant (one `push_*` emitter each). 
+ let [q0, q1, q2, q3]: [PushFn; 4] = if want_sin { + [push_sr, push_cr, push_neg_sr, push_neg_cr] + } else { + [push_cr, push_neg_sr, push_neg_cr, push_sr] + }; + + q0(f); // running result, default for kq == 0 + for (n, push_q) in [(1i64, q1), (2, q2), (3, q3)] { + push_q(f); // override candidate (shallower) + push_kq_ne(f, n); // cond: keep the running (deeper) value when kq != n + f.instruction(&Ins::Select); + } +} + +/// An emitter that pushes one quadrant result (`sr`/`cr`/`-sr`/`-cr`) onto the +/// stack from the precomputed `SC_SR`/`SC_CR` locals. +type PushFn = fn(&mut Function); + +fn push_sr(f: &mut Function) { + f.instruction(&Ins::LocalGet(SC_SR)); +} +fn push_cr(f: &mut Function) { + f.instruction(&Ins::LocalGet(SC_CR)); +} +fn push_neg_sr(f: &mut Function) { + f.instruction(&Ins::LocalGet(SC_SR)); + f.instruction(&Ins::F64Neg); +} +fn push_neg_cr(f: &mut Function) { + f.instruction(&Ins::LocalGet(SC_CR)); + f.instruction(&Ins::F64Neg); +} +/// Push i32 `1` when `SC_KQ != n`, else `0`. Used as the `select` condition so +/// the deeper (running) operand is kept when `kq != n`. +fn push_kq_ne(f: &mut Function, n: i64) { + f.instruction(&Ins::LocalGet(SC_KQ)); + f.instruction(&Ins::I64Const(n)); + f.instruction(&Ins::I64Ne); +} + +/// Emit `sin(x: f64) -> f64`. Worst-case error vs `f64::sin` over `[-1e6, 1e6]`: +/// abs `~1.2e-10`. Pinned by `sin_matches_f64`. +pub(crate) fn emit_sin() -> Function { + emit_sincos(true) +} + +/// Emit `cos(x: f64) -> f64`. Worst-case error vs `f64::cos` over `[-1e6, 1e6]`: +/// abs `~1.2e-10`. Pinned by `cos_matches_f64`. +pub(crate) fn emit_cos() -> Function { + emit_sincos(false) +} + +// ── tan = sin / cos ────────────────────────────────────────────────────────── + +const TAN_X: u32 = 0; + +/// Emit `tan(x: f64) -> f64` as `sin(x) / cos(x)` by `call`ing the sin/cos +/// helpers. Worst-case relative error over `[-1.5, 1.5]` (away from the poles): +/// `~1.5e-10`. Pinned by `tan_matches_f64`. 
+/// +/// `sin_idx`/`cos_idx` are the module function indices of [`emit_sin`] / +/// [`emit_cos`]. +pub(crate) fn emit_tan(sin_idx: u32, cos_idx: u32) -> Function { + let mut f = Function::new([]); + f.instruction(&Ins::LocalGet(TAN_X)); + f.instruction(&Ins::Call(sin_idx)); + f.instruction(&Ins::LocalGet(TAN_X)); + f.instruction(&Ins::Call(cos_idx)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::End); + f +} + +// ── atan ───────────────────────────────────────────────────────────────────── + +// atan local layout. Param 0 is `x`. +const AT_X: u32 = 0; +const AT_AX: u32 = 1; // f64 |x| +const AT_Z: u32 = 2; // f64 reduced argument +const AT_Z2: u32 = 3; // f64 z^2 +const AT_RECIP: u32 = 4; // i32 1 if |x| > 1 +const AT_SHIFT: u32 = 5; // i32 1 if the pi/6 shift was applied +const AT_SIGN: u32 = 6; // f64 sign of x (+-1) + +/// `atan(z)/z` Taylor coefficients in `z^2`: `(-1)^n / (2n+1)`, n = 0..=6 +/// (through `z^13`). On `|z| <= tan(pi/12) ~= 0.268` the truncation is +/// ~1e-10 relative. +const ATAN_COEFFS: [f64; 7] = [ + 1.0, + -1.0 / 3.0, + 1.0 / 5.0, + -1.0 / 7.0, + 1.0 / 9.0, + -1.0 / 11.0, + 1.0 / 13.0, +]; + +/// Emit `atan(x: f64) -> f64`. +/// +/// Two-stage range reduction to a small argument: +/// 1. `|x| > 1` -> `atan(|x|) = pi/2 - atan(1/|x|)` (so `z0 in [0, 1]`). +/// 2. `z0 > tan(pi/12)` -> `atan(z0) = pi/6 + atan((z0*sqrt3 - 1)/(sqrt3 + z0))` +/// (so the poly argument `z in [-(2-sqrt3), 2-sqrt3]`). +/// +/// then `atan(z) = z * poly(z^2)`, undoing the shifts and applying the sign. +/// `+-inf -> +-pi/2`, `NaN -> NaN` (the poly of a NaN is NaN, and the +/// reductions preserve it). Worst-case error vs `f64::atan` over `[-1000, +/// 1000]`: rel `~6e-10`. Pinned by `atan_matches_f64`. +pub(crate) fn emit_atan() -> Function { + use wasm_encoder::BlockType; + // Locals (param 0 = x): f64 AT_AX(1)/AT_Z(2)/AT_Z2(3), i32 AT_RECIP(4)/ + // AT_SHIFT(5), f64 AT_SIGN(6). 
+ let mut f = Function::new([(3, ValType::F64), (2, ValType::I32), (1, ValType::F64)]); + + // +inf -> pi/2, -inf -> -pi/2 (handled first so the reciprocal 1/inf = 0 + // path is not relied upon). + f.instruction(&Ins::LocalGet(AT_X)); + f.instruction(&f64_const(f64::INFINITY)); + f.instruction(&Ins::F64Eq); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(std::f64::consts::FRAC_PI_2)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + f.instruction(&Ins::LocalGet(AT_X)); + f.instruction(&f64_const(f64::NEG_INFINITY)); + f.instruction(&Ins::F64Eq); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(-std::f64::consts::FRAC_PI_2)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // sign = x < 0 ? -1 : 1 ; ax = |x|. + f.instruction(&f64_const(-1.0)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::LocalGet(AT_X)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::Select); + f.instruction(&Ins::LocalSet(AT_SIGN)); + f.instruction(&Ins::LocalGet(AT_X)); + f.instruction(&Ins::F64Abs); + f.instruction(&Ins::LocalSet(AT_AX)); + + // recip = ax > 1 ; z0 = recip ? 1/ax : ax. + f.instruction(&Ins::LocalGet(AT_AX)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Gt); + f.instruction(&Ins::LocalSet(AT_RECIP)); + // z0 = select(1/ax, ax, recip) + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::LocalGet(AT_AX)); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::LocalGet(AT_AX)); + f.instruction(&Ins::LocalGet(AT_RECIP)); + f.instruction(&Ins::Select); + f.instruction(&Ins::LocalSet(AT_Z)); + + // shift = z0 > tan(pi/12) ; z = shift ? (z0*sqrt3 - 1)/(sqrt3 + z0) : z0. 
+ f.instruction(&Ins::LocalGet(AT_Z)); + f.instruction(&f64_const(TAN_PI_12)); + f.instruction(&Ins::F64Gt); + f.instruction(&Ins::LocalSet(AT_SHIFT)); + // shifted = (z0*sqrt3 - 1)/(sqrt3 + z0) + f.instruction(&Ins::LocalGet(AT_Z)); + f.instruction(&f64_const(SQRT3)); + f.instruction(&Ins::F64Mul); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Sub); + f.instruction(&f64_const(SQRT3)); + f.instruction(&Ins::LocalGet(AT_Z)); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::F64Div); + // select(shifted, z0, shift) + f.instruction(&Ins::LocalGet(AT_Z)); + f.instruction(&Ins::LocalGet(AT_SHIFT)); + f.instruction(&Ins::Select); + f.instruction(&Ins::LocalTee(AT_Z)); + // z2 = z*z + f.instruction(&Ins::LocalGet(AT_Z)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::LocalSet(AT_Z2)); + + // at = z * poly(z2) + f.instruction(&Ins::LocalGet(AT_Z)); + emit_horner(&mut f, AT_Z2, &ATAN_COEFFS); + f.instruction(&Ins::F64Mul); + // at += shift ? pi/6 : 0 + f.instruction(&f64_const(std::f64::consts::FRAC_PI_6)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::LocalGet(AT_SHIFT)); + f.instruction(&Ins::Select); + f.instruction(&Ins::F64Add); + // at = recip ? pi/2 - at : at + // compute (pi/2 - at) and select. 
+ f.instruction(&Ins::LocalSet(AT_Z)); // reuse AT_Z to hold the running atan value + f.instruction(&f64_const(std::f64::consts::FRAC_PI_2)); + f.instruction(&Ins::LocalGet(AT_Z)); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::LocalGet(AT_Z)); + f.instruction(&Ins::LocalGet(AT_RECIP)); + f.instruction(&Ins::Select); + // * sign + f.instruction(&Ins::LocalGet(AT_SIGN)); + f.instruction(&Ins::F64Mul); + + f.instruction(&Ins::End); + f +} + +// ── asin / acos ─────────────────────────────────────────────────────────────── + +const AS_X: u32 = 0; + +/// Emit `asin(x: f64) -> f64` as `atan(x / sqrt(1 - x^2))` with endpoint and +/// domain handling: `|x| > 1 -> NaN`, `x == 1 -> pi/2`, `x == -1 -> -pi/2` +/// (at the endpoints `sqrt(1-x^2)=0` would divide by zero). `NaN -> NaN`. +/// Worst-case error vs `f64::asin` over `[-1, 1]`: abs `~1.6e-10`. Pinned by +/// `asin_matches_f64`. `atan_idx` is [`emit_atan`]'s module function index. +pub(crate) fn emit_asin(atan_idx: u32) -> Function { + use wasm_encoder::BlockType; + let mut f = Function::new([]); + + // |x| > 1 -> NaN. Both comparisons are false for a NaN input, so NaN is not + // caught here; it falls through to the final `atan` call, which returns NaN. + // if (x > 1) | (x < -1) -> NaN + f.instruction(&Ins::LocalGet(AS_X)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Gt); + f.instruction(&Ins::LocalGet(AS_X)); + f.instruction(&f64_const(-1.0)); + f.instruction(&Ins::F64Lt); + f.instruction(&Ins::I32Or); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(f64::NAN)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // x == 1 -> pi/2. + f.instruction(&Ins::LocalGet(AS_X)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Eq); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(std::f64::consts::FRAC_PI_2)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + // x == -1 -> -pi/2.
+ f.instruction(&Ins::LocalGet(AS_X)); + f.instruction(&f64_const(-1.0)); + f.instruction(&Ins::F64Eq); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(-std::f64::consts::FRAC_PI_2)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // atan(x / sqrt(1 - x*x)) + f.instruction(&Ins::LocalGet(AS_X)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::LocalGet(AS_X)); + f.instruction(&Ins::LocalGet(AS_X)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::F64Sqrt); + f.instruction(&Ins::F64Div); + f.instruction(&Ins::Call(atan_idx)); + f.instruction(&Ins::End); + f +} + +const AC_X: u32 = 0; + +/// Emit `acos(x: f64) -> f64` as `pi/2 - asin(x)`. Domain `|x| > 1 -> NaN` +/// (inherited from asin), `NaN -> NaN`. Worst-case error vs `f64::acos` over +/// `[-1, 1]`: abs `~1.6e-10`. Pinned by `acos_matches_f64`. `asin_idx` is +/// [`emit_asin`]'s module function index. +pub(crate) fn emit_acos(asin_idx: u32) -> Function { + let mut f = Function::new([]); + f.instruction(&f64_const(std::f64::consts::FRAC_PI_2)); + f.instruction(&Ins::LocalGet(AC_X)); + f.instruction(&Ins::Call(asin_idx)); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::End); + f +} + +// ── log10 = ln * (1/ln10) ───────────────────────────────────────────────────── + +const LOG10_X: u32 = 0; + +/// Emit `log10(x: f64) -> f64` as `ln(x) * (1/ln10)`. Inherits `ln`'s domain +/// handling (`x < 0 -> NaN`, `x == 0 -> -inf`). Worst-case error vs +/// `f64::log10` over `[1e-10, 1e10]`: abs `~2e-13`. Pinned by +/// `log10_matches_f64`. `ln_idx` is [`emit_ln`]'s module function index. 
+pub(crate) fn emit_log10(ln_idx: u32) -> Function { + let mut f = Function::new([]); + f.instruction(&Ins::LocalGet(LOG10_X)); + f.instruction(&Ins::Call(ln_idx)); + f.instruction(&f64_const(INV_LN10)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::End); + f +} + +// ── pow = exp(y * ln x) ──────────────────────────────────────────────────────── + +const POW_X: u32 = 0; +const POW_Y: u32 = 1; + +/// Emit `pow(x: f64, y: f64) -> f64` as `exp(y * ln x)`. +/// +/// Matches `f64::powf` for a positive base `x`. Special cases mirrored from +/// `powf`: `y == 0 -> 1` (including `pow(anything, 0) == 1`), `x == 1 -> 1`. +/// A negative base yields NaN (`ln` of a negative is NaN) -- this is the +/// documented limitation; no corpus model raises a negative base to a power. +/// Worst-case relative error over `x in [0.01, 100]`, `y in [-5, 5]`: +/// `~2.3e-12`. Pinned by `pow_matches_f64`. `exp_idx`/`ln_idx` are the module +/// function indices of [`emit_exp`] / [`emit_ln`]. +pub(crate) fn emit_pow(exp_idx: u32, ln_idx: u32) -> Function { + use wasm_encoder::BlockType; + let mut f = Function::new([]); + + // y == 0 -> 1. + f.instruction(&Ins::LocalGet(POW_Y)); + f.instruction(&f64_const(0.0)); + f.instruction(&Ins::F64Eq); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + // x == 1 -> 1. 
+ f.instruction(&Ins::LocalGet(POW_X)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::F64Eq); + f.instruction(&Ins::If(BlockType::Empty)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::Return); + f.instruction(&Ins::End); + + // exp(y * ln(x)) + f.instruction(&Ins::LocalGet(POW_Y)); + f.instruction(&Ins::LocalGet(POW_X)); + f.instruction(&Ins::Call(ln_idx)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::Call(exp_idx)); + f.instruction(&Ins::End); + f +} + +#[cfg(test)] +mod tests { + use super::super::lower::build_helpers; + use checked::Store; + use wasm::validate; + use wasm_encoder::{ + CodeSection, ExportKind, ExportSection, Function, FunctionSection, Instruction, + MemorySection, MemoryType, Module, TypeSection, ValType, + }; + + /// Which transcendental helper a test module exports as `f`. + #[derive(Clone, Copy)] + enum Which { + Exp, + Ln, + Sin, + Cos, + Tan, + Atan, + Asin, + Acos, + Log10, + Pow, + } + + /// Resolve a [`Which`] to its function index in the assembled helper table. + fn helper_index(which: Which) -> u32 { + let h = build_helpers().fns; + match which { + Which::Exp => h.exp, + Which::Ln => h.ln, + Which::Sin => h.sin, + Which::Cos => h.cos, + Which::Tan => h.tan, + Which::Atan => h.atan, + Which::Asin => h.asin, + Which::Acos => h.acos, + Which::Log10 => h.log10, + Which::Pow => h.pow, + } + } + + /// Build a module containing *every* helper body (so inter-helper `call`s + /// resolve) plus a thin exported wrapper `f` that forwards to the + /// helper-under-test. Unary helpers export `f(x: f64) -> f64`; `pow` exports + /// `f(x: f64, y: f64) -> f64`. Mirrors `lower.rs`'s production assembly: + /// helpers occupy function indices `0..N`, the wrapper follows at `N`. 
+ fn build_helper_module(which: Which) -> Vec<u8> { + let helpers = build_helpers(); + let n_helpers = helpers.functions.len() as u32; + let target = helper_index(which); + let binary = matches!(which, Which::Pow); + + let mut module = Module::new(); + + // Type 0 is the wrapper's signature; each helper's signature follows. + let mut types = TypeSection::new(); + if binary { + types + .ty() + .function([ValType::F64, ValType::F64], [ValType::F64]); + } else { + types.ty().function([ValType::F64], [ValType::F64]); + } + for hf in &helpers.functions { + types.ty().function(hf.params.clone(), hf.results.clone()); + } + module.section(&types); + + let mut functions = FunctionSection::new(); + for (i, _) in helpers.functions.iter().enumerate() { + functions.function(1 + i as u32); + } + functions.function(0); + module.section(&functions); + + // The GF lookup helpers (`super::lookup`) `f64.load` from memory 0, so + // a module that includes every helper body must declare a memory even + // though the transcendental wrappers here never touch it. + let mut memories = MemorySection::new(); + memories.memory(MemoryType { + minimum: 1, + maximum: None, + memory64: false, + shared: false, + page_size_log2: None, + }); + module.section(&memories); + + let mut exports = ExportSection::new(); + exports.export("f", ExportKind::Func, n_helpers); + module.section(&exports); + + let mut code = CodeSection::new(); + for hf in &helpers.functions { + code.function(&hf.body); + } + let mut wrapper = Function::new([]); + wrapper.instruction(&Instruction::LocalGet(0)); + if binary { + wrapper.instruction(&Instruction::LocalGet(1)); + } + wrapper.instruction(&Instruction::Call(target)); + wrapper.instruction(&Instruction::End); + code.function(&wrapper); + module.section(&code); + + module.finish() + } + + /// Run a unary helper on `x` under the DLR-FT interpreter.
The module is + /// (re)built per call; the samples are deliberately small (a few hundred + /// points each) so this stays well under the per-test time budget. + fn run_unary(which: Which, x: f64) -> f64 { + let bytes = build_helper_module(which); + let info = validate(&bytes).expect("helper module must validate"); + let mut store = Store::new(()); + let module = store + .module_instantiate(&info, Vec::new(), None) + .expect("helper module must instantiate") + .module_addr; + let f = store + .instance_export(module, "f") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(f64,), f64>(f, (x,)) + .expect("invocation must succeed") + } + + /// Run `pow(x, y)` under the interpreter. + fn run_pow(x: f64, y: f64) -> f64 { + let bytes = build_helper_module(Which::Pow); + let info = validate(&bytes).expect("pow module must validate"); + let mut store = Store::new(()); + let module = store + .module_instantiate(&info, Vec::new(), None) + .expect("pow module must instantiate") + .module_addr; + let f = store + .instance_export(module, "f") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(f64, f64), f64>(f, (x, y)) + .expect("invocation must succeed") + } + + /// A linear sample of `n+1` points across `[lo, hi]` inclusive. + fn linspace(lo: f64, hi: f64, n: usize) -> Vec<f64> { + (0..=n) + .map(|i| lo + (hi - lo) * (i as f64) / (n as f64)) + .collect() + } + + /// Assert `got` matches `want` within absolute *or* relative tolerance, + /// propagating the float specials the way the kernels are documented to.
+ fn assert_close(name: &str, x: f64, got: f64, want: f64, abs_tol: f64, rel_tol: f64) { + if want.is_nan() { + assert!(got.is_nan(), "{name}({x}): expected NaN, got {got}"); + return; + } + assert!(!got.is_nan(), "{name}({x}): got NaN, expected {want}"); + if want.is_infinite() { + assert_eq!(got, want, "{name}({x}): expected {want}, got {got}"); + return; + } + let abs = (got - want).abs(); + let rel = if want != 0.0 { abs / want.abs() } else { abs }; + assert!( + abs <= abs_tol || rel <= rel_tol, + "{name}({x}): got {got}, want {want} (abs {abs:.3e}, rel {rel:.3e})", + ); + } + + // The corpus bar is abs 2e-3 / rel 5e-6. Every per-helper tolerance below is + // far inside that, leaving ample slack for DLR-FT-vs-native rounding drift. + + // ── exp ─────────────────────────────────────────────────────────────── + + #[test] + fn exp_matches_f64() { + // Anchor values exercise the wrapper end-to-end. + assert_eq!(run_unary(Which::Exp, 0.0), 1.0); + assert_close( + "exp", + 1.0, + run_unary(Which::Exp, 1.0), + std::f64::consts::E, + 0.0, + 1e-12, + ); + // Dense sweep across the representable exponent range. + for x in linspace(-700.0, 700.0, 300) { + assert_close("exp", x, run_unary(Which::Exp, x), x.exp(), 0.0, 1e-12); + } + // Edge / special cases. + assert!(run_unary(Which::Exp, f64::NAN).is_nan()); + assert_eq!(run_unary(Which::Exp, f64::INFINITY), f64::INFINITY); + assert_eq!(run_unary(Which::Exp, f64::NEG_INFINITY), 0.0); + assert_eq!(run_unary(Which::Exp, 720.0), f64::INFINITY); // overflow + assert_eq!(run_unary(Which::Exp, -750.0), 0.0); // underflow + } + + // ── ln ──────────────────────────────────────────────────────────────── + + #[test] + fn ln_matches_f64() { + assert_eq!(run_unary(Which::Ln, 1.0), 0.0); + assert_close( + "ln", + std::f64::consts::E, + run_unary(Which::Ln, std::f64::consts::E), + 1.0, + 1e-12, + 1e-12, + ); + // Geometric sweep over many decades (where ln is interesting). 
+ for e in linspace(-300.0, 300.0, 300) { + let x = 10f64.powf(e / 30.0); + assert_close("ln", x, run_unary(Which::Ln, x), x.ln(), 1e-12, 1e-11); + } + // Subnormal input (exercises the 2^54 normalization path). + let sub = f64::from_bits(1); + assert_close("ln", sub, run_unary(Which::Ln, sub), sub.ln(), 1e-9, 1e-12); + // Domain edges. + assert_eq!(run_unary(Which::Ln, 0.0), f64::NEG_INFINITY); + assert!(run_unary(Which::Ln, -1.0).is_nan()); + assert!(run_unary(Which::Ln, f64::NAN).is_nan()); + assert_eq!(run_unary(Which::Ln, f64::INFINITY), f64::INFINITY); + } + + // ── sin / cos ─────────────────────────────────────────────────────────── + + #[test] + fn sin_matches_f64() { + assert_eq!(run_unary(Which::Sin, 0.0), 0.0); + for x in linspace(-100.0, 100.0, 400) { + assert_close("sin", x, run_unary(Which::Sin, x), x.sin(), 1e-9, 1e-9); + } + // A few large arguments to exercise the Cody-Waite reduction. + for &x in &[1.0e3, -1.0e4, 1.0e5, -650_400.0] { + assert_close("sin", x, run_unary(Which::Sin, x), x.sin(), 1e-8, 1e-7); + } + assert!(run_unary(Which::Sin, f64::NAN).is_nan()); + assert!(run_unary(Which::Sin, f64::INFINITY).is_nan()); + } + + #[test] + fn cos_matches_f64() { + assert_eq!(run_unary(Which::Cos, 0.0), 1.0); + for x in linspace(-100.0, 100.0, 400) { + assert_close("cos", x, run_unary(Which::Cos, x), x.cos(), 1e-9, 1e-9); + } + for &x in &[1.0e3, -1.0e4, 1.0e5, -650_400.0] { + assert_close("cos", x, run_unary(Which::Cos, x), x.cos(), 1e-8, 1e-7); + } + assert!(run_unary(Which::Cos, f64::NAN).is_nan()); + assert!(run_unary(Which::Cos, f64::NEG_INFINITY).is_nan()); + } + + // ── tan ───────────────────────────────────────────────────────────────── + + #[test] + fn tan_matches_f64() { + assert_eq!(run_unary(Which::Tan, 0.0), 0.0); + // Stay away from the +-pi/2 poles where the function is ill-conditioned. 
+ for x in linspace(-1.4, 1.4, 400) { + assert_close("tan", x, run_unary(Which::Tan, x), x.tan(), 1e-9, 1e-8); + } + assert!(run_unary(Which::Tan, f64::NAN).is_nan()); + } + + // ── atan ──────────────────────────────────────────────────────────────── + + #[test] + fn atan_matches_f64() { + assert_eq!(run_unary(Which::Atan, 0.0), 0.0); + for x in linspace(-1000.0, 1000.0, 400) { + assert_close("atan", x, run_unary(Which::Atan, x), x.atan(), 1e-9, 1e-9); + } + // Dense small region around the two reduction breakpoints (1 and + // tan(pi/12)). + for x in linspace(-2.0, 2.0, 200) { + assert_close("atan", x, run_unary(Which::Atan, x), x.atan(), 1e-9, 1e-9); + } + assert_close( + "atan", + f64::INFINITY, + run_unary(Which::Atan, f64::INFINITY), + std::f64::consts::FRAC_PI_2, + 1e-12, + 0.0, + ); + assert_close( + "atan", + f64::NEG_INFINITY, + run_unary(Which::Atan, f64::NEG_INFINITY), + -std::f64::consts::FRAC_PI_2, + 1e-12, + 0.0, + ); + assert!(run_unary(Which::Atan, f64::NAN).is_nan()); + } + + // ── asin / acos ─────────────────────────────────────────────────────────── + + #[test] + fn asin_matches_f64() { + for x in linspace(-1.0, 1.0, 400) { + assert_close("asin", x, run_unary(Which::Asin, x), x.asin(), 1e-9, 1e-9); + } + // Exact endpoints. + assert_close( + "asin", + 1.0, + run_unary(Which::Asin, 1.0), + std::f64::consts::FRAC_PI_2, + 1e-12, + 0.0, + ); + assert_close( + "asin", + -1.0, + run_unary(Which::Asin, -1.0), + -std::f64::consts::FRAC_PI_2, + 1e-12, + 0.0, + ); + // Out of domain. 
+ assert!(run_unary(Which::Asin, 1.5).is_nan()); + assert!(run_unary(Which::Asin, -1.5).is_nan()); + assert!(run_unary(Which::Asin, f64::NAN).is_nan()); + } + + #[test] + fn acos_matches_f64() { + for x in linspace(-1.0, 1.0, 400) { + assert_close("acos", x, run_unary(Which::Acos, x), x.acos(), 1e-9, 1e-9); + } + assert_close("acos", 1.0, run_unary(Which::Acos, 1.0), 0.0, 1e-9, 0.0); + assert_close( + "acos", + -1.0, + run_unary(Which::Acos, -1.0), + std::f64::consts::PI, + 1e-12, + 1e-12, + ); + assert!(run_unary(Which::Acos, 1.5).is_nan()); + assert!(run_unary(Which::Acos, f64::NAN).is_nan()); + } + + // ── log10 ────────────────────────────────────────────────────────────── + + #[test] + fn log10_matches_f64() { + assert_close( + "log10", + 1000.0, + run_unary(Which::Log10, 1000.0), + 3.0, + 1e-12, + 1e-12, + ); + for e in linspace(-300.0, 300.0, 300) { + let x = 10f64.powf(e / 30.0); + assert_close( + "log10", + x, + run_unary(Which::Log10, x), + x.log10(), + 1e-12, + 1e-11, + ); + } + assert_eq!(run_unary(Which::Log10, 0.0), f64::NEG_INFINITY); + assert!(run_unary(Which::Log10, -1.0).is_nan()); + } + + // ── pow ───────────────────────────────────────────────────────────────── + + #[test] + fn pow_matches_f64() { + // y == 0 and x == 1 short-circuits. + assert_eq!(run_pow(123.4, 0.0), 1.0); + assert_eq!(run_pow(1.0, 567.8), 1.0); + // Positive-base grid (the supported regime), integer and fractional y. + for i in 0..40 { + for j in 0..40 { + let x = 0.01 + 100.0 * (i as f64) / 40.0; + let y = -5.0 + 10.0 * (j as f64) / 40.0; + let want = x.powf(y); + if want.is_finite() { + assert_close("pow", x, run_pow(x, y), want, 1e-9, 1e-9); + } + } + } + // Known limitation: a negative base diverges (ln of negative is NaN). 
+ assert!(run_pow(-2.0, 2.0).is_nan()); + } +} diff --git a/src/simlin-engine/src/wasmgen/mod.rs b/src/simlin-engine/src/wasmgen/mod.rs new file mode 100644 index 000000000..1057672f2 --- /dev/null +++ b/src/simlin-engine/src/wasmgen/mod.rs @@ -0,0 +1,64 @@ +// Copyright 2026 The Simlin Authors. All rights reserved. +// Use of this source code is governed by the Apache License, +// Version 2.0, that can be found in the LICENSE file. + +//! WebAssembly code-generation backend. +//! +//! This backend is an alternative to the bytecode VM (`crate::vm`). Instead of +//! interpreting opcodes, it lowers a salsa-compiled `CompiledSimulation` (the +//! VM's own input) into a self-contained WebAssembly module that runs the whole +//! simulation in one exported call, writing results into its own linear memory. +//! The intended use case is interactive scrubbing: compile a model to wasm +//! once, then re-run it on every slider change at display refresh rates. +//! +//! The backend walks every module instance's un-fused opcode programs +//! (`compiled_initials`/`compiled_flows`/`compiled_stocks`) and emits a wasm +//! function-triple per `(model, input_set)` instance plus a `run` driver (see +//! `lower` for the per-opcode lowering and `module` for whole-model assembly). +//! Modules are emitted with the `wasm-encoder` crate; correctness is validated +//! in tests by executing the emitted module under the DLR-FT `wasm-interpreter` +//! and comparing against the bytecode VM. +//! +//! Status: the full scalar + array opcode set (every `Op2` operator, every +//! `Apply` builtin, the view/reducer/iteration/vector ops, scalar/array +//! lookups), Euler/RK2/RK4 integration, and nested modules (incl. SMOOTH/DELAY +//! stdlib expansions) are in place. A genuine runtime view range +//! (`ViewRangeDynamic`) or array unrolling past the per-function budget returns +//! `WasmGenError::Unsupported`. 
/// Error from the WebAssembly code-generation backend.
///
/// The backend covers the full scalar + array opcode set, Euler/RK2/RK4
/// integration, and nested modules (including SMOOTH/DELAY stdlib expansions).
/// A genuine runtime view range (`ViewRangeDynamic`) or array unrolling past the
/// per-function budget is reported as `Unsupported` instead of silently
/// producing an incorrect module.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum WasmGenError {
    /// The model could not be lowered; the payload is the human-readable
    /// reason, and is exactly what `Display` prints.
    Unsupported(String),
}

impl fmt::Display for WasmGenError {
    /// Writes the carried message verbatim; any width/fill/precision requested
    /// by the caller's format spec is ignored.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Irrefutable today; adding a variant turns this into a compile error,
        // which is the prompt to give the new variant its own rendering.
        let Self::Unsupported(what) = self;
        f.write_str(what)
    }
}

impl std::error::Error for WasmGenError {}
three i32 geometry globals (`n_slots`/`n_chunks`/`results_offset`). It emits +//! one `initials`/`flows`/`stocks` function-triple *per unique `(model, +//! input_set)` module instance* in `CompiledSimulation.modules`, each taking a +//! runtime `module_off: i32` plus its module inputs as f64 params and lowered by +//! [`super::lower::emit_bytecode`] over the shared slab. An `EvalModule` `call`s +//! the child instance's function for the current phase (passing `module_off + +//! decl.off` and the inputs), so one shared `CompiledModule` runs at every base +//! offset it is instantiated at. A final `run` function seeds the reserved +//! globals, calls the *root* instance's initials, and drives the integration +//! loop. `run` lays the slab out as: a `curr` working chunk, a `next` working +//! chunk, then a results region of `n_chunks` step-major snapshots. It records a +//! snapshot of `curr` on the same cadence the bytecode VM uses (`vm.rs::run_to`): +//! the t=start sample is forced, then every `save_every = round(save_step/dt)` +//! steps, up to `n_chunks` samples. +//! +//! Unlike the VM's chunk-ring buffer, this uses a single `curr` chunk plus a +//! `next` chunk that holds only the freshly integrated stock values (including +//! nested-module stocks, collected by recursing through `EvalModule`): after +//! recording a snapshot, the updated stocks are copied back into `curr` and time +//! is advanced. Auxiliaries/flows are recomputed each step, so `curr` always +//! holds the full, correct state for the timestep it represents. +//! +//! Current scope: the full scalar + array opcode set, Euler/RK2/RK4 integration, +//! and nested modules (incl. SMOOTH/DELAY stdlib expansions). A genuine runtime +//! view range (`ViewRangeDynamic`) or array unrolling past the per-function +//! budget returns `WasmGenError::Unsupported`. 
+ +use wasm_encoder::Instruction as I; +use wasm_encoder::{ + BlockType, CodeSection, ConstExpr, DataSection, ExportKind, ExportSection, Function, + FunctionSection, GlobalSection, GlobalType, MemorySection, MemoryType, Module as WasmModule, + TypeSection, ValType, +}; + +use std::collections::HashMap; + +use crate::bytecode::{ByteCode, CompiledModule, Opcode}; +use crate::results::{Method, Specs}; +use crate::vm::{CompiledSimulation, ModuleKey, StepPart}; + +use super::WasmGenError; +use super::lower::{self, BuiltHelpers, build_helpers, f64_const, max_condition_depth, memarg}; + +// Reserved global slots, mirroring `crate::vm`. +const TIME_OFF: usize = 0; +const DT_OFF: usize = 1; +const INITIAL_TIME_OFF: usize = 2; +const FINAL_TIME_OFF: usize = 3; + +const SLOT_SIZE: u32 = 8; +const WASM_PAGE_SIZE: u32 = 65536; + +// Slot-0 byte base of the `curr` chunk, and the byte address of `curr[TIME]` +// (an absolute, module-independent global slot). Both run-loop and snapshot +// code address `curr` from byte 0. +const CURR_BASE: u32 = 0; +const TIME_ADDR: u64 = TIME_OFF as u64 * SLOT_SIZE as u64; + +// Global indices. The three self-describing geometry globals come first (so the +// exported indices 0/1/2 stay stable for hosts); `use_prev_fallback` -- the only +// mutable global -- follows at index 3. It gates `LoadPrev`: init 1 (return the +// fallback) until the first `prev_values` snapshot clears it (`vm.rs:668`). +const G_N_SLOTS: u32 = 0; +const G_N_CHUNKS: u32 = 1; +const G_RESULTS_OFFSET: u32 = 2; +const G_USE_PREV_FALLBACK: u32 = 3; + +// `run`'s i32 locals. +const L_SAVED: u32 = 0; +const L_STEP_ACCUM: u32 = 1; +const L_DST: u32 = 2; + +/// Compile the named model of a datamodel `Project` to a full [`WasmArtifact`] +/// (the wasm blob plus its [`WasmLayout`]), through the salsa incremental +/// pipeline and [`compile_simulation`]. 
+/// +/// This is the entry point `libsimlin` uses across the FFI boundary +/// (`simlin_model_compile_to_wasm`): it works from a datamodel alone, with no +/// `Vm`/`SimlinSim`, returning both the blob and the name->offset layout. An +/// incremental-compile failure or an unsupported construct surfaces as +/// [`WasmGenError`] (the FFI maps it to a `SimlinError`, never a panic). +pub fn compile_datamodel_to_artifact( + datamodel: &crate::datamodel::Project, + model_name: &str, +) -> Result { + let mut db = crate::db::SimlinDb::default(); + let sync = crate::db::sync_from_datamodel_incremental(&mut db, datamodel, None); + let sim = + crate::db::compile_project_incremental(&db, sync.project, model_name).map_err(|e| { + WasmGenError::Unsupported(format!("wasmgen: incremental compile failed: {e:?}")) + })?; + compile_simulation(&sim) +} + +/// Compile the named model of a datamodel `Project` to a self-contained wasm +/// module, dropping the [`WasmLayout`] (callers that need the layout use +/// [`compile_datamodel_to_artifact`]). Kept as the stable raw-bytes entry point +/// for the `wasm-backend-poc.mjs` exploratory script and any blob-only consumer. +pub fn compile_datamodel_to_wasm( + datamodel: &crate::datamodel::Project, + model_name: &str, +) -> Result, WasmGenError> { + Ok(compile_datamodel_to_artifact(datamodel, model_name)?.wasm) +} + +// ============================================================================ +// CompiledSimulation -> wasm (the production path; consumes salsa bytecode) +// ============================================================================ + +/// A compiled simulation wasm module together with the layout metadata a host +/// needs to read its results by variable name. +pub struct WasmArtifact { + pub wasm: Vec, + pub layout: WasmLayout, +} + +/// Geometry + variable-offset map describing a [`WasmArtifact`]'s results +/// region. 
/// Geometry + variable-offset map describing a [`WasmArtifact`]'s results
/// region. The wasm module also exports `n_slots`/`n_chunks`/`results_offset`
/// as i32 globals so a host can stride results with no external metadata; this
/// struct mirrors those values and adds the canonical-name -> slot map needed
/// for by-name reads.
pub struct WasmLayout {
    /// Number of slots in one chunk (one saved snapshot of the model state).
    pub n_slots: usize,
    /// Number of chunks (snapshots) in the results region.
    pub n_chunks: usize,
    /// Byte offset of the results region within linear memory.
    pub results_offset: usize,
    /// Byte offset of the GF directory region (8 bytes/entry, indexed by global
    /// table index: `(data_byte_offset: i32, n_points: i32)`). Zero when the
    /// model has no graphical functions.
    pub gf_directory_offset: usize,
    /// Byte offset of the GF data region (every table's `(x,y)` knots as
    /// consecutive f64 LE pairs). Zero when the model has no graphical
    /// functions.
    pub gf_data_offset: usize,
    /// Canonical variable name -> slot offset within a chunk.
    pub var_offsets: Vec<(String, usize)>,
}

impl WasmLayout {
    /// Size of the fixed header: three `u64` geometry fields + the `u32` count.
    const HEADER_BYTES: usize = 3 * 8 + 4;
    /// Bytes every `var_offsets` entry occupies besides its name: the `u32`
    /// name length and the `u64` offset. Also the size of the smallest possible
    /// entry (one with an empty name).
    const ENTRY_FRAMING_BYTES: usize = 4 + 8;

    /// Serialize the layout to a self-describing, length-prefixed byte buffer for
    /// the FFI (no protobuf -- it rides the same malloc-return convention as the
    /// wasm blob). The format is, all integers little-endian:
    ///
    /// ```text
    /// n_slots:        u64
    /// n_chunks:       u64
    /// results_offset: u64
    /// count:          u32   (number of var_offsets entries)
    /// repeated count times:
    ///   name_len: u32
    ///   name:     name_len bytes (UTF-8, the canonical variable name)
    ///   offset:   u64   (slot offset within a chunk)
    /// ```
    ///
    /// The GF region offsets are intentionally NOT serialized: a host reads
    /// results by name (via `n_slots`/`results_offset` + the name->offset map),
    /// never the GF regions directly. [`deserialize`] is the exact inverse over
    /// the geometry + name map (it leaves the GF offsets 0).
    ///
    /// [`deserialize`]: Self::deserialize
    pub fn serialize(&self) -> Vec<u8> {
        // The exact output size is known up front, so allocate once.
        let encoded_len = self
            .var_offsets
            .iter()
            .fold(Self::HEADER_BYTES, |len, (name, _)| {
                len + Self::ENTRY_FRAMING_BYTES + name.len()
            });
        let mut out = Vec::with_capacity(encoded_len);

        out.extend_from_slice(&(self.n_slots as u64).to_le_bytes());
        out.extend_from_slice(&(self.n_chunks as u64).to_le_bytes());
        out.extend_from_slice(&(self.results_offset as u64).to_le_bytes());
        // The format fixes the entry count and each name length at u32. These
        // casts assume neither reaches 2^32 -- NOTE(review): confirm the
        // variable count / name length are bounded well below that upstream.
        out.extend_from_slice(&(self.var_offsets.len() as u32).to_le_bytes());
        for (name, offset) in &self.var_offsets {
            let name_bytes = name.as_bytes();
            out.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
            out.extend_from_slice(name_bytes);
            out.extend_from_slice(&(*offset as u64).to_le_bytes());
        }
        out
    }

    /// Parse a buffer produced by [`serialize`]. Returns `None` if the buffer is
    /// truncated, an integer is malformed (including a `u64` that does not fit
    /// this target's `usize`), or a name is not valid UTF-8 -- a host gets a
    /// clean failure rather than a panic on a corrupt buffer. The GF region
    /// offsets are reconstructed as 0 (they are not in the serialized format).
    /// Bytes after the last declared entry are ignored.
    ///
    /// The declared entry count is validated against the bytes actually present
    /// *before* any allocation is sized from it, so a corrupt count cannot
    /// trigger an enormous reservation.
    ///
    /// This is the inverse used by the libsimlin FFI tests and any host that wants
    /// to round-trip the layout in Rust; a non-Rust host re-implements the same
    /// little-endian parse against the documented format.
    ///
    /// [`serialize`]: Self::serialize
    pub fn deserialize(bytes: &[u8]) -> Option<Self> {
        /// Forward-only reader over the buffer; every read is bounds-checked.
        struct Cursor<'a> {
            rest: &'a [u8],
        }

        impl<'a> Cursor<'a> {
            /// Consume exactly `n` bytes, or `None` if fewer remain.
            fn take(&mut self, n: usize) -> Option<&'a [u8]> {
                if n > self.rest.len() {
                    return None;
                }
                let (head, tail) = self.rest.split_at(n);
                self.rest = tail;
                Some(head)
            }

            fn read_u32(&mut self) -> Option<u32> {
                let mut raw = [0u8; 4];
                // `take(4)` yields exactly 4 bytes, so the lengths always match.
                raw.copy_from_slice(self.take(4)?);
                Some(u32::from_le_bytes(raw))
            }

            fn read_u64(&mut self) -> Option<u64> {
                let mut raw = [0u8; 8];
                raw.copy_from_slice(self.take(8)?);
                Some(u64::from_le_bytes(raw))
            }

            /// Read a `u64` field and narrow it to `usize` without truncation.
            fn read_usize(&mut self) -> Option<usize> {
                usize::try_from(self.read_u64()?).ok()
            }
        }

        let mut cur = Cursor { rest: bytes };
        let n_slots = cur.read_usize()?;
        let n_chunks = cur.read_usize()?;
        let results_offset = cur.read_usize()?;
        let count = usize::try_from(cur.read_u32()?).ok()?;

        // Every entry needs at least `ENTRY_FRAMING_BYTES`, so a count the
        // remaining bytes cannot hold is necessarily a truncated/corrupt buffer.
        // Rejecting it here yields the same `None` the entry loop would reach,
        // without first reserving memory for up to `u32::MAX` entries.
        if count > cur.rest.len() / Self::ENTRY_FRAMING_BYTES {
            return None;
        }

        let mut var_offsets = Vec::with_capacity(count);
        for _ in 0..count {
            let name_len = usize::try_from(cur.read_u32()?).ok()?;
            let name = std::str::from_utf8(cur.take(name_len)?).ok()?.to_owned();
            let offset = cur.read_usize()?;
            var_offsets.push((name, offset));
        }

        Some(Self {
            n_slots,
            n_chunks,
            results_offset,
            gf_directory_offset: 0,
            gf_data_offset: 0,
            var_offsets,
        })
    }
}
Returns a layout error if the regions +/// would overflow a u32 byte address. +fn build_gf_regions( + tables: &[Vec<(f64, f64)>], + region_base: u32, +) -> Result, WasmGenError> { + if tables.is_empty() { + return Ok(None); + } + let too_large = + || WasmGenError::Unsupported("wasmgen: graphical functions too large".to_string()); + + let n_tables = u32::try_from(tables.len()).map_err(|_| too_large())?; + let directory_bytes = n_tables + .checked_mul(GF_DIRECTORY_ENTRY_BYTES) + .ok_or_else(too_large)?; + let directory_base = region_base; + let data_base = directory_base + .checked_add(directory_bytes) + .ok_or_else(too_large)?; + + let mut directory = Vec::with_capacity(directory_bytes as usize); + let mut data: Vec = Vec::new(); + // The running byte offset of the next table's first knot, relative to + // `data_base`. Promoted to an absolute address when written into the + // directory so a helper can load a knot directly. + let mut data_rel_offset: u32 = 0; + for table in tables { + let n_points = u32::try_from(table.len()).map_err(|_| too_large())?; + let abs_data_offset = data_base + .checked_add(data_rel_offset) + .ok_or_else(too_large)?; + directory.extend_from_slice(&(abs_data_offset as i32).to_le_bytes()); + directory.extend_from_slice(&(n_points as i32).to_le_bytes()); + + for &(x, y) in table { + data.extend_from_slice(&x.to_le_bytes()); + data.extend_from_slice(&y.to_le_bytes()); + } + let table_bytes = n_points.checked_mul(GF_KNOT_BYTES).ok_or_else(too_large)?; + data_rel_offset = data_rel_offset + .checked_add(table_bytes) + .ok_or_else(too_large)?; + } + + let total_bytes = directory_bytes + .checked_add(data_rel_offset) + .ok_or_else(too_large)?; + Ok(Some(GfRegions { + directory_base, + data_base, + directory, + data, + total_bytes, + })) +} + +// Offsets of an instance's three program functions within its function-triple. 
+// The module's function slots are: the emitted helper functions +// ([`lower::build_helpers`]) at `0..n_helpers`, then one +// `[initials, flows, stocks]` triple per module instance (in `instance_order`), +// then `run` last. So instance `i`'s `StepPart` function is at +// `n_helpers + i*FUNCS_PER_INSTANCE + {F_INITIALS,F_FLOWS,F_STOCKS}`, and `run` +// is at `n_helpers + n_instances*FUNCS_PER_INSTANCE`. Keeping these relative +// (and adding `n_helpers`/the triple base at the call/export sites) means new +// helpers or instances shift the indices automatically. +const F_INITIALS: u32 = 0; +const F_FLOWS: u32 = 1; +const F_STOCKS: u32 = 2; +const FUNCS_PER_INSTANCE: u32 = 3; + +// Type-section indices. The `run` type comes first; one opcode-program type per +// distinct module-input count follows (`(i32, f64*k) -> ()`), and helper types +// are appended after those. `run` is `() -> ()`. +const TYPE_RUN_FN: u32 = 0; // () -> () + +// Param 0 of every opcode-program function is `module_off` (i32); params +// `1..=n_inputs` are the f64 module inputs. Declared locals follow. +const L_MODULE_OFF: u32 = 0; + +/// Everything an instance's `EmitCtx` needs that varies per `(model, input_set)` +/// module instance: its own `ByteCodeContext`, the disjoint linear-memory bases +/// the emitter threads in for that instance's array tables / GF lookups, its +/// module-input parameter count, and (when it has graphical functions) its slice +/// of the combined GF region. Computed once in [`compile_simulation`] before any +/// function is emitted, in `instance_order`. +struct PerInstance<'a> { + module: &'a CompiledModule, + /// Number of f64 module-input parameters this instance's three functions + /// take (param 0 is `module_off`, params `1..=n_inputs` are the inputs). + /// `0` for the root and any uninstantiated module. Drawn from the + /// `EvalModule { n_inputs }` of its call sites (the count the VM passes). 
+ n_inputs: u32, + /// Byte base of this instance's GF directory region (`0` when it has no + /// graphical functions). Threaded into the instance's `EmitCtx`. + gf_directory_base: u32, + /// Byte base of this instance's GF data region (`0` when it has no GFs). + gf_data_base: u32, + /// Byte base of this instance's disjoint `temp_storage` region. + temp_storage_base: u32, + /// This instance's GF region image (directory + data + bases), for the + /// `DataSection`; `None` when the instance has no graphical functions. + gf_regions: Option, + /// The relative offsets this instance's module assigns via a flows + /// `AssignConstCurr` -- its overridable constants (Phase 7 Task 2). Threaded + /// into the instance's `EmitCtx` so an `AssignConstCurr { off }` whose `off` + /// is in this set sources from the constants-override region. + flows_const_offsets: std::collections::HashSet, +} + +/// Compile a `CompiledSimulation` (produced by the salsa incremental pipeline) +/// into a self-contained wasm module. +/// +/// Every unique `(model, input_set)` module instance in `sim.modules` becomes its +/// own initials/flows/stocks wasm function-triple taking `(module_off: i32, +/// in_0..in_{k-1}: f64)`; an `EvalModule` resolves the child instance and `call`s +/// its function for the current phase (passing `module_off + decl.off` and the +/// inputs), so one shared `CompiledModule` runs at every base offset it is +/// instantiated at. The opcode programs a `CompiledSimulation` carries are the +/// plain, un-fused scalar set (the VM's superinstruction fusion runs on a private +/// execution copy), so each `Opcode` lowers via [`lower::emit_bytecode`]. +/// Anything outside the supported set -- an unsupported opcode, or array +/// unrolling past the per-function budget -- returns [`WasmGenError::Unsupported`] +/// rather than emitting a wrong module. 
+pub fn compile_simulation(sim: &CompiledSimulation) -> Result { + // `wasmgen` is in-crate, so it reads `CompiledSimulation`'s `pub(crate)` + // fields directly rather than through accessors. + let specs = &sim.specs; + // The run-loop shape is selected from `specs.method` below; all three + // methods (`Euler`/`RungeKutta2`/`RungeKutta4`) are supported. + + let root = sim + .modules + .get(&sim.root) + .ok_or_else(|| WasmGenError::Unsupported("wasmgen: root module not found".to_string()))?; + let too_large = || WasmGenError::Unsupported("wasmgen: model too large to lower".to_string()); + + // Enumerate every module instance in a deterministic order (sorted by key), + // and the count of inputs each receives. The root receives 0 inputs (it is + // called by `run`); every other instance's input count is the `n_inputs` of + // its `EvalModule` call sites -- exactly what the VM sizes `module_inputs` to. + let mut instance_order: Vec = sim.modules.keys().cloned().collect(); + instance_order.sort(); + let instance_n_inputs = collect_instance_input_counts(sim); + + // The stock data-buffer offsets the *whole simulation* integrates, recursing + // through `EvalModule` so submodule (SMOOTH/DELAY) stocks are included -- + // mirroring the VM's `collect_stock_offsets` (`vm.rs:512-543`). The Euler + // advance copies these `next -> curr`; the RK loops index `rk_scratch` by + // their position here. Collected up front so the RK scratch region is sized + // below. + let stock_offsets = collect_all_stock_offsets(&sim.modules, &sim.root, 0); + let n_stocks = u32::try_from(stock_offsets.len()).map_err(|_| too_large())?; + // `n_slots` is the ROOT module's slot count, which spans the whole slab + // including every nested module's slots (`vm.rs::n_slots` returns the root's). 
+ let n_slots = u32::try_from(root.n_slots).map_err(|_| too_large())?; + let n_chunks = u32::try_from(specs.n_chunks).map_err(|_| too_large())?; + let stride = n_slots.checked_mul(SLOT_SIZE).ok_or_else(too_large)?; + let curr_base = 0u32; + let next_base = stride; + let results_base = stride.checked_mul(2).ok_or_else(too_large)?; + let results_bytes = n_chunks.checked_mul(stride).ok_or_else(too_large)?; + let mut total_bytes = results_base + .checked_add(results_bytes) + .ok_or_else(too_large)?; + + // Per-instance GF regions follow the results region, concatenated in + // `instance_order` (each instance's directory+data sits at its own base, so + // its directory entry 0 maps to its own table 0). The `Lookup` opcode reads + // the directory at `instance_gf_directory_base + table_idx*8`, so each + // instance's `EmitCtx` carries its own base. They are initialized at + // instantiation by active `DataSection` segments. + let mut instance_gf: HashMap)> = HashMap::new(); + for key in &instance_order { + let module = &sim.modules[key]; + let regions = build_gf_regions(&module.context.graphical_functions, total_bytes)?; + let (dir_base, data_base) = regions + .as_ref() + .map(|r| (r.directory_base, r.data_base)) + .unwrap_or((0, 0)); + if let Some(r) = ®ions { + total_bytes = total_bytes + .checked_add(r.total_bytes) + .ok_or_else(too_large)?; + } + instance_gf.insert(key.clone(), (dir_base, data_base, regions)); + } + // The layout reports the ROOT instance's GF bases (a host reads results, not + // GF directly; this preserves the single-root-model layout exactly). + let (root_gf_directory_base, root_gf_data_base) = instance_gf + .get(&sim.root) + .map(|(d, dd, _)| (*d, *dd)) + .unwrap_or((0, 0)); + + // The two snapshot regions follow the GF regions, each `n_slots` wide + // (`vm.rs:617-618`). 
`initial_values` backs `INIT(x)` (captured once after + // initials); `prev_values` backs `PREVIOUS(x)` (captured after each step, or + // after the end-of-step flows re-eval under RK). Their bases are threaded + // into every `EmitCtx` so `LoadInitial`/`LoadPrev` can address them. They are + // shared across instances: a child reads `initial_values[module_off + off]`, + // the same single snapshot the VM keeps. + let snapshot_bytes = n_slots.checked_mul(SLOT_SIZE).ok_or_else(too_large)?; + let initial_values_base = total_bytes; + let prev_values_base = initial_values_base + .checked_add(snapshot_bytes) + .ok_or_else(too_large)?; + total_bytes = prev_values_base + .checked_add(snapshot_bytes) + .ok_or_else(too_large)?; + + // The RK scratch region (`saved`(n_stocks) ++ `accum`(n_stocks)) follows the + // snapshot regions. It holds each stock's stage-1 value and running RK + // accumulator across the stages (`vm.rs:655`, the VM's `rk_scratch` + // split). `n_stocks` now spans nested module stocks. Euler needs neither, so + // the region is only reserved for RK. + let rk = matches!(specs.method, Method::RungeKutta2 | Method::RungeKutta4); + let stock_scratch_bytes = n_stocks.checked_mul(SLOT_SIZE).ok_or_else(too_large)?; + let rk_saved_base = total_bytes; + let rk_accum_base = rk_saved_base + .checked_add(stock_scratch_bytes) + .ok_or_else(too_large)?; + if rk { + total_bytes = rk_accum_base + .checked_add(stock_scratch_bytes) + .ok_or_else(too_large)?; + } + + // Per-instance `temp_storage` regions follow the snapshot/RK regions, one + // disjoint region per instance (sized by that instance's `temp_total_size`). 
+ // The VM shares one `temp_storage` buffer across modules (per-module + // `temp_offsets`); disjoint regions are unconditionally correct because a + // parent's temps never survive across an `EvalModule` call (the child would + // otherwise clobber a shared slot the VM relies on not surviving), so giving + // each instance its own region cannot diverge from the VM. The largest + // per-instance `temp_total_size` also bounds the shared vector/alloc scratch. + let mut instance_temp_base: HashMap = HashMap::new(); + let mut max_temp_total_size = 0u32; + for key in &instance_order { + let module = &sim.modules[key]; + let temp_total_size = + u32::try_from(module.context.temp_total_size).map_err(|_| too_large())?; + max_temp_total_size = max_temp_total_size.max(temp_total_size); + instance_temp_base.insert(key.clone(), total_bytes); + let temp_bytes = temp_total_size + .checked_mul(SLOT_SIZE) + .ok_or_else(too_large)?; + total_bytes = total_bytes.checked_add(temp_bytes).ok_or_else(too_large)?; + } + + // The vector-op + allocation scratch regions follow the temp regions. They + // are shared across instances (the staging is within a single opcode, never + // live across an `EvalModule` boundary -- the same reason the VM shares + // them). A vector/alloc op's element count is bounded by the largest view it + // processes, in turn bounded by the largest per-instance `temp_total_size` + // and the slab's `n_slots`; see the detailed sizing invariant retained on the + // per-region comments below. `2 * max(...)` f64 for the sort-pair vector + // scratch, `6 * max(...)` f64 for the allocation staging. 
+ let scratch_view_bound = max_temp_total_size.max(n_slots); + let vector_scratch_base = total_bytes; + let vector_scratch_slots = scratch_view_bound.checked_mul(2).ok_or_else(too_large)?; + let vector_scratch_bytes = vector_scratch_slots + .checked_mul(SLOT_SIZE) + .ok_or_else(too_large)?; + total_bytes = vector_scratch_base + .checked_add(vector_scratch_bytes) + .ok_or_else(too_large)?; + + let alloc_scratch_base = total_bytes; + let alloc_scratch_slots = scratch_view_bound.checked_mul(6).ok_or_else(too_large)?; + let alloc_scratch_bytes = alloc_scratch_slots + .checked_mul(SLOT_SIZE) + .ok_or_else(too_large)?; + total_bytes = alloc_scratch_base + .checked_add(alloc_scratch_bytes) + .ok_or_else(too_large)?; + + // The constants-override region (Phase 7 Task 2) follows the scratch regions: + // an `n_slots`-wide f64 region indexed by ABSOLUTE slab offset, holding each + // overridable constant's current value (initialized to the compiled default). + // It is `n_slots` wide -- not `n_overridable` -- so a redirected + // `AssignConstCurr { off }` reads it with the same `module_off`-relative + // addressing the slab uses (`const_region_base + (module_off + off) * 8`), + // which is what lets one shared `CompiledModule` running at several + // `module_off`s pick up each instance's distinct override. A parallel + // `n_slots`-byte validity region marks which absolute slots `set_value` may + // write (1 = overridable). Both are initialized by active `DataSection` + // segments built from `collect_overridable_defaults` (which mirrors the VM's + // `collect_constant_info` recursion). + let const_region_base = total_bytes; + let const_region_bytes = n_slots.checked_mul(SLOT_SIZE).ok_or_else(too_large)?; + total_bytes = const_region_base + .checked_add(const_region_bytes) + .ok_or_else(too_large)?; + let const_valid_base = total_bytes; + // One validity byte per slot. 
+ total_bytes = const_valid_base + .checked_add(n_slots) + .ok_or_else(too_large)?; + + let overridable_defaults = collect_overridable_defaults(&sim.modules, &sim.root, 0); + // Defense in depth: the offsets `collect_overridable_defaults` reports must + // be exactly the set the VM considers overridable (`constant_offsets`, the + // keys of `cached_constant_info`). Both walk the same flows-`AssignConstCurr` + // overridability rule, so any divergence is a bug -- a blob's `set_value` + // would then accept/reject a different set than the VM. Checked only in debug. + debug_assert!( + { + let mut ours: Vec = overridable_defaults.iter().map(|(off, _)| *off).collect(); + ours.sort_unstable(); + ours.dedup(); + let mut theirs: Vec = sim.constant_offsets().collect(); + theirs.sort_unstable(); + ours == theirs + }, + "wasmgen overridable-constant offsets diverged from CompiledSimulation::constant_offsets" + ); + + let pages = total_bytes.div_ceil(WASM_PAGE_SIZE).max(1); + + // save_every mirrors vm.rs::run_to: max(1, round(save_step / dt)). + let save_every = ((specs.save_step / specs.dt).round() as i64).max(1); + let save_every = i32::try_from(save_every).map_err(|_| too_large())?; + + // Emitted helper functions occupy the module's first function slots; the + // per-instance function-triples follow (at `n_helpers + i*FUNCS_PER_INSTANCE` + // for instance `i`), and `run` is last. Build the helpers up front so the + // index registry threaded into each `EmitCtx` matches the assembled module's + // layout, and so `emit_bytecode`'s `call`s resolve. + let helpers = build_helpers(); + let helper_fns = helpers.fns; + let n_helpers = helpers.functions.len() as u32; + + // Assemble the per-instance descriptors and the `(ModuleKey, StepPart) -> fn + // index` map. 
The map is built for ALL instances before any function body is + // emitted, so an `EvalModule` in one instance's program resolves to the + // child's already-known function index (the instantiation graph is acyclic, + // but the index map does not depend on emit order regardless). + let mut instances: Vec = Vec::with_capacity(instance_order.len()); + let mut module_fn_index: HashMap<(ModuleKey, StepPart), u32> = HashMap::new(); + for (i, key) in instance_order.iter().enumerate() { + let module = &sim.modules[key]; + let base = n_helpers + (i as u32) * FUNCS_PER_INSTANCE; + module_fn_index.insert((key.clone(), StepPart::Initials), base + F_INITIALS); + module_fn_index.insert((key.clone(), StepPart::Flows), base + F_FLOWS); + module_fn_index.insert((key.clone(), StepPart::Stocks), base + F_STOCKS); + let (gf_directory_base, gf_data_base, gf_regions) = + instance_gf.remove(key).expect("gf entry per instance"); + instances.push(PerInstance { + module, + n_inputs: instance_n_inputs.get(key).copied().unwrap_or(0), + gf_directory_base, + gf_data_base, + temp_storage_base: instance_temp_base[key], + gf_regions, + flows_const_offsets: flows_const_offsets_for(module), + }); + } + + // Emit each instance's three program functions (initials/flows/stocks) over + // the shared f64 slab, each lowered with that instance's own `ByteCodeContext` + // and per-instance bases. `step_part` is per-program so `LoadInitial` picks + // its `curr`-vs-snapshot branch at compile time (`vm.rs:1332-1340`), and an + // `EvalModule` resolves the child's function for that same phase. + let mut program_fns: Vec = Vec::with_capacity(instances.len() * 3); + for inst in &instances { + // `module_off` is the function's i32 param 0; inputs are params + // `1..=n_inputs`. The reverse-pop scratch f64 base sits past all other + // declared locals; the index helpers shift everything by `n_inputs`. 
+ let make_ctx = |cond_depth: usize, extra_i32: u32, step_part: StepPart| lower::EmitCtx { + curr_base, + next_base, + gf_directory_base: inst.gf_directory_base, + gf_data_base: inst.gf_data_base, + initial_values_base, + prev_values_base, + use_prev_fallback_global: G_USE_PREV_FALLBACK, + step_part, + dt: specs.dt, + start_time: specs.start, + final_time: specs.stop, + module_off_local: L_MODULE_OFF, + scratch_local: lower::scratch_local_for(inst.n_inputs), + condition_locals: lower::condition_locals_for(inst.n_inputs, cond_depth), + apply_locals: lower::apply_locals_for(inst.n_inputs, cond_depth), + helpers: helper_fns, + temp_storage_base: inst.temp_storage_base, + extra_i32_local_base: lower::extra_i32_local_base(inst.n_inputs, cond_depth), + vector_f64_locals: lower::vector_f64_locals_for(inst.n_inputs, cond_depth), + vector_i32_locals: lower::vector_i32_locals_for(inst.n_inputs, cond_depth), + vector_scratch_base, + alloc_scratch_base, + module_input_scratch_base: lower::module_input_scratch_base( + inst.n_inputs, + cond_depth, + extra_i32, + ), + const_region_base, + flows_const_offsets: &inst.flows_const_offsets, + module_fn_index: &module_fn_index, + ctx: &inst.module.context, + }; + program_fns.push(emit_initials_fn(inst.module, inst.n_inputs, &make_ctx)?); + program_fns.push(emit_opcode_fn( + &inst.module.compiled_flows, + inst.n_inputs, + StepPart::Flows, + &make_ctx, + )?); + program_fns.push(emit_opcode_fn( + &inst.module.compiled_stocks, + inst.n_inputs, + StepPart::Stocks, + &make_ctx, + )?); + } + + // `run` calls the ROOT instance's initials/flows/stocks with `module_off = 0` + // and no inputs (the root takes none) -- unchanged from the single-module + // path. Its child `EvalModule`s recurse from there. 
+ let root_idx = instance_order + .iter() + .position(|k| *k == sim.root) + .expect("root is among the instances"); + let root_fn_base = n_helpers + (root_idx as u32) * FUNCS_PER_INSTANCE; + let run_fn = emit_run_simulation( + specs, + RunRegions { + n_slots, + results_base, + stride, + n_chunks, + initial_values_base, + prev_values_base, + rk_saved_base, + rk_accum_base, + }, + save_every, + &stock_offsets, + root_fn_base, + ); + + // The constants-override exports (Phase 7 Task 2): `set_value` writes an + // override into the constants region (validated against the validity bytes), + // `reset` resets the run state (`use_prev_fallback`) without clearing the + // region, and `clear_values` restores the compiled defaults. + let set_value_fn = emit_set_value(n_slots, const_region_base, const_valid_base); + let reset_fn = emit_reset(); + let clear_values_fn = emit_clear_values(const_region_base, &overridable_defaults); + + // The constants region + validity bytes are initialized at instantiation by + // active data segments built from the overridable defaults (sparse writes, + // one f64 + one validity byte per overridable absolute offset). 
+ let const_init = + build_const_region_init(&overridable_defaults, const_region_base, const_valid_base); + + let instance_input_counts: Vec = instances.iter().map(|inst| inst.n_inputs).collect(); + let gf_images: Vec<&GfRegions> = instances + .iter() + .filter_map(|inst| inst.gf_regions.as_ref()) + .collect(); + let wasm = assemble_simulation(AssembleParts { + helpers, + program_fns, + run_fn, + set_value_fn, + reset_fn, + clear_values_fn, + instance_input_counts: &instance_input_counts, + pages, + n_slots, + n_chunks, + results_base, + gf_regions: &gf_images, + const_init: &const_init, + }); + + let var_offsets = sim + .offsets + .iter() + .map(|(k, v)| (k.as_str().to_string(), *v)) + .collect(); + + Ok(WasmArtifact { + wasm, + layout: WasmLayout { + n_slots: root.n_slots, + n_chunks: specs.n_chunks, + results_offset: results_base as usize, + gf_directory_offset: root_gf_directory_base as usize, + gf_data_offset: root_gf_data_base as usize, + var_offsets, + }, + }) +} + +/// The `n_inputs` (module-input parameter count) of each module instance, drawn +/// from the `EvalModule { n_inputs }` opcodes across every instance's three +/// programs. The root receives 0 inputs (it is invoked by `run` with none); a +/// child receives the count its callers pass -- the same value the VM sizes +/// `module_inputs` to. All call sites for a given `(model, input_set)` key agree +/// (the `input_set` is part of the key and `n_inputs == args.len()` at codegen, +/// `codegen.rs:1094-1109`); first-seen wins, which is therefore unambiguous. 
+fn collect_instance_input_counts(sim: &CompiledSimulation) -> HashMap { + let mut counts: HashMap = HashMap::new(); + for module in sim.modules.values() { + let programs: [&ByteCode; 2] = [&module.compiled_flows, &module.compiled_stocks]; + let initial_codes = module.compiled_initials.iter().map(|ci| &ci.bytecode); + for bc in programs.into_iter().chain(initial_codes) { + for op in &bc.code { + if let Opcode::EvalModule { id, n_inputs } = op { + let decl = &module.context.modules[*id as usize]; + let child_key = crate::vm::make_module_key(&decl.model_name, &decl.input_set); + counts.entry(child_key).or_insert(u32::from(*n_inputs)); + } + } + } + } + counts +} + +/// Build an instance's `initials` function: every `CompiledInitial`'s bytecode +/// in order, over the shared slab. The shared condition-local count is the max +/// nesting depth across all the initials (they run sequentially in one function); +/// the reverse-pop scratch covers the max `EvalModule { n_inputs }` over them. +/// `n_inputs` is the instance's module-input parameter count (shifts the locals). +fn emit_initials_fn<'a>( + module: &CompiledModule, + n_inputs: u32, + make_ctx: &impl Fn(usize, u32, StepPart) -> lower::EmitCtx<'a>, +) -> Result { + let cond_depth = module + .compiled_initials + .iter() + .map(|ci| max_condition_depth(&ci.bytecode)) + .max() + .unwrap_or(0); + // The initials run sequentially in one function; each fragment's dynamic- + // subscript accumulation (and `EvalModule` reverse-pop) completes before the + // next, so reserving the *max* per-fragment count -- not the sum -- is + // correct, and the fragments reuse the same scratch locals. 
+ let extra_i32 = module + .compiled_initials + .iter() + .map(|ci| lower::count_extra_i32_locals(&ci.bytecode)) + .max() + .unwrap_or(0); + let module_input_scratch = module + .compiled_initials + .iter() + .map(|ci| lower::count_module_input_scratch(&ci.bytecode)) + .max() + .unwrap_or(0); + let ctx = make_ctx(cond_depth, extra_i32, StepPart::Initials); + let mut f = new_opcode_fn(n_inputs, cond_depth, extra_i32, module_input_scratch); + for ci in module.compiled_initials.iter() { + lower::emit_bytecode(&ci.bytecode, &ctx, &mut f)?; + } + f.instruction(&I::End); + Ok(f) +} + +/// Build one opcode-program function from a single `ByteCode`, lowering it as +/// `step_part` (which `LoadInitial` reads to pick its `curr`-vs-snapshot branch, +/// and which an `EvalModule` calls the child's matching phase function for). +/// `n_inputs` is the instance's module-input parameter count. +fn emit_opcode_fn<'a>( + bc: &ByteCode, + n_inputs: u32, + step_part: StepPart, + make_ctx: &impl Fn(usize, u32, StepPart) -> lower::EmitCtx<'a>, +) -> Result { + let cond_depth = max_condition_depth(bc); + let extra_i32 = lower::count_extra_i32_locals(bc); + let module_input_scratch = lower::count_module_input_scratch(bc); + let ctx = make_ctx(cond_depth, extra_i32, step_part); + let mut f = new_opcode_fn(n_inputs, cond_depth, extra_i32, module_input_scratch); + lower::emit_bytecode(bc, &ctx, &mut f)?; + f.instruction(&I::End); + Ok(f) +} + +/// A fresh opcode-program `Function` for an instance with `n_inputs` f64 input +/// params: the scratch f64 local, `cond_depth` i32 condition locals, the three +/// `Apply` scratch f64 locals, the vector-op scratch, `extra_i32` +/// dynamic-subscript scratch i32 locals, and `module_input_scratch` `EvalModule` +/// reverse-pop f64 locals (param 0 = `module_off`, params `1..=n_inputs` = +/// inputs). The declaration list lives in [`lower::opcode_fn_locals`] (which is +/// param-count-independent); the index helpers shift by `n_inputs`. 
+fn new_opcode_fn( + n_inputs: u32, + cond_depth: usize, + extra_i32: u32, + module_input_scratch: u32, +) -> Function { + // `n_inputs` is in the function's *type* (its params), not the declared + // locals list; it is applied at `assemble_simulation` where the type is + // chosen, so it does not appear here. + let _ = n_inputs; + Function::new(lower::opcode_fn_locals( + cond_depth, + extra_i32, + module_input_scratch, + )) +} + +/// Collect absolute offsets of all stock variables across the whole simulation, +/// recursing into child modules via `EvalModule` so submodule (SMOOTH/DELAY) +/// stocks are included. Mirrors the VM's `collect_stock_offsets` +/// (`vm.rs:512-543`) exactly: a stock writes via `AssignNext` or its +/// peephole-fused `BinOpAssignNext` (most integrations are `stock + delta`), and +/// an `EvalModule` recurses with `base_off + decl.off` (each instance addresses +/// its slot at `base_off + off`). After each step these slots are copied `next -> +/// curr`; the RK loops index `rk_scratch[saved/accum]` by their sorted position. +fn collect_all_stock_offsets( + modules: &HashMap, + key: &ModuleKey, + base_off: usize, +) -> Vec { + let module = match modules.get(key) { + Some(m) => m, + None => return Vec::new(), + }; + let mut offsets: Vec = Vec::new(); + for op in module.compiled_stocks.code.iter() { + match op { + Opcode::AssignNext { off } | Opcode::BinOpAssignNext { off, .. } => { + offsets.push(base_off + *off as usize); + } + Opcode::EvalModule { id, .. } => { + let decl = &module.context.modules[*id as usize]; + let child_key = crate::vm::make_module_key(&decl.model_name, &decl.input_set); + offsets.extend(collect_all_stock_offsets( + modules, + &child_key, + base_off + decl.off, + )); + } + _ => {} + } + } + // Defensive dedup, as the VM does: duplicate offsets would double-copy. 
+ offsets.sort_unstable(); + offsets.dedup(); + offsets +} + +/// The set of *relative* offsets a module assigns via an `AssignConstCurr` in +/// its **flows** phase: exactly this module's overridable constants. Mirrors the +/// first (flows-only) pass of the VM's `collect_constant_info` (`vm.rs:436-450`), +/// but keyed by relative offset and computed per module, so it is compile-time +/// even for a shared `CompiledModule` instantiated at several base offsets (every +/// instantiation's `base_off + off` is overridable, since `collect_constant_info` +/// recurses through every declaration). An `AssignConstCurr { off }` in any phase +/// whose `off` is in this set is redirected to read the constants-override +/// region; one whose `off` is absent emits its immediate literal. +fn flows_const_offsets_for(module: &CompiledModule) -> std::collections::HashSet { + module + .compiled_flows + .code + .iter() + .filter_map(|op| match op { + Opcode::AssignConstCurr { off, .. } => Some(*off), + _ => None, + }) + .collect() +} + +/// Collect `(absolute offset, compiled-default literal)` for every overridable +/// constant across the whole simulation, recursing through `EvalModule` +/// declarations with cumulative `base_off`. Mirrors the VM's `collect_constant_info` +/// (`vm.rs:426-507`): an offset is overridable iff some module assigns it via an +/// `AssignConstCurr` in its **flows** phase, and the default value is that flows +/// `AssignConstCurr`'s literal. Used to size and initialize the constants-override +/// region so the wasm blob's `set_value` accepts exactly the offsets the VM's +/// `set_value_by_offset` does, each initialized to the same compiled default. +/// +/// A shared module instantiated at two base offsets contributes both absolute +/// offsets (one per instantiation), exactly as the VM's recursion does. 
+fn collect_overridable_defaults( + modules: &HashMap, + key: &ModuleKey, + base_off: usize, +) -> Vec<(usize, f64)> { + let module = match modules.get(key) { + Some(m) => m, + None => return Vec::new(), + }; + let mut out: Vec<(usize, f64)> = Vec::new(); + for op in module.compiled_flows.code.iter() { + if let Opcode::AssignConstCurr { off, literal_id } = op { + // The literal is the flows assignment's compiled default. A + // well-formed program always has the literal in range; fall back to + // 0.0 defensively rather than panicking across what is otherwise an + // infallible layout pass. + let v = module + .compiled_flows + .literals + .get(*literal_id as usize) + .copied() + .unwrap_or(0.0); + out.push((base_off + *off as usize, v)); + } + } + for decl in &module.context.modules { + let child_key = crate::vm::make_module_key(&decl.model_name, &decl.input_set); + out.extend(collect_overridable_defaults( + modules, + &child_key, + base_off + decl.off, + )); + } + out +} + +/// The linear-memory region geometry `run` needs: the chunk/results bases, the +/// snapshot bases (`initial_values`/`prev_values`), and the RK scratch bases +/// (`saved`/`accum`). Bundled to keep `emit_run_simulation`'s signature small as +/// the run loop gained snapshot + RK regions. +#[derive(Clone, Copy)] +struct RunRegions { + n_slots: u32, + results_base: u32, + stride: u32, + n_chunks: u32, + initial_values_base: u32, + prev_values_base: u32, + /// Slot-0 byte base of the RK `saved[i]` scratch (one f64 per stock). + rk_saved_base: u32, + /// Slot-0 byte base of the RK `accum[i]` scratch (one f64 per stock). + rk_accum_base: u32, +} + +// `run`'s f64 locals (after the three i32 locals). The RK loops need a +// `saved_time` (the timestep's t, restored after the stages move `curr[TIME]` to +// trial points) and a per-stage `s` scratch (`next[off]-curr[off]`). Euler +// declares them too -- two unused f64 locals are free. 
+const L_SAVED_TIME: u32 = 3; +const L_RK_S: u32 = 4; + +/// Emit the body of `run` for the `CompiledSimulation` path: seed the reserved +/// globals, run the initials, capture `initial_values`, then drive the +/// integration loop selected by `specs.method`. The loop `call`s the three +/// opcode-emitted functions; the Euler arm mirrors `vm.rs::run_to`'s Euler arm, +/// and the RK arms mirror `vm.rs:712-838`. +fn emit_run_simulation( + specs: &Specs, + regions: RunRegions, + save_every: i32, + stock_offsets: &[usize], + root_fn_base: u32, +) -> Function { + // Three i32 locals (saved/step_accum/dst) + two f64 locals (saved_time, s). + let mut f = Function::new([(3, ValType::I32), (2, ValType::F64)]); + + // Absolute function indices of the ROOT instance's three program functions: + // its function-triple base + the per-phase offset. `run` drives the root with + // `module_off = 0`; nested instances are reached via `EvalModule` from there. + let f_initials = root_fn_base + F_INITIALS; + let f_flows = root_fn_base + F_FLOWS; + let f_stocks = root_fn_base + F_STOCKS; + + // Seed the reserved global slots into curr (chunk base 0), then run the + // initials. The seeds mirror the VM, which writes start/dt/start/stop into + // TIME/DT/INITIAL_TIME/FINAL_TIME before run_initials. + store_curr_const_abs(&mut f, TIME_OFF, specs.start); + store_curr_const_abs(&mut f, DT_OFF, specs.dt); + store_curr_const_abs(&mut f, INITIAL_TIME_OFF, specs.start); + store_curr_const_abs(&mut f, FINAL_TIME_OFF, specs.stop); + // Re-arm the PREVIOUS fallback for this run, mirroring the VM's + // `run_initials` (which sets `use_prev_fallback = true` at the start of + // every run). 
`run` reseeds the time globals + reruns initials and is the + // documented per-change entry point for repeated re-simulation, so it must + // reset this flag itself: the loop below clears it to 0 after the first + // `prev_values` snapshot, and without re-arming it here a second `run` on + // the same instance would read the prior run's `prev_values` on step 0 (and + // during initials) instead of the fallback. The module-init value is also 1, + // so this is a no-op only on the very first run. + f.instruction(&I::I32Const(1)); + f.instruction(&I::GlobalSet(G_USE_PREV_FALLBACK)); + f.instruction(&I::I32Const(0)); + f.instruction(&I::Call(f_initials)); + + // Capture `initial_values := curr` exactly once, after initials, for + // `INIT(x)` reads in the flows/stocks programs (`vm.rs:1124-1128`). + // `use_prev_fallback` is 1 (re-armed just above) through initials, so any + // `PREVIOUS(x)` evaluated during initials returns its fallback. + emit_copy_chunk( + &mut f, + CURR_BASE, + regions.initial_values_base, + regions.n_slots, + ); + + f.instruction(&I::Block(BlockType::Empty)); // $break + f.instruction(&I::Loop(BlockType::Empty)); // $continue + + // if curr[TIME] > stop: break + f.instruction(&I::I32Const(0)); + f.instruction(&I::F64Load(memarg(TIME_ADDR))); + f.instruction(&f64_const(specs.stop)); + f.instruction(&I::F64Gt); + f.instruction(&I::BrIf(1)); + + // The per-method step: compute the new stock values into `next[off]`, leave + // `curr` holding the full time-`t` state (aux/flows + time-`t` stocks), then + // snapshot `prev_values := curr` and clear `use_prev_fallback`. 
+ match specs.method { + Method::Euler => emit_euler_step(&mut f, f_flows, f_stocks, ®ions), + Method::RungeKutta4 => { + emit_rk4_step(&mut f, f_flows, f_stocks, specs.dt, stock_offsets, ®ions) + } + Method::RungeKutta2 => { + emit_rk2_step(&mut f, f_flows, f_stocks, specs.dt, stock_offsets, ®ions) + } + } + + // The save + advance tail is method-agnostic: every method leaves `next[off]` + // holding the new stock values and `curr` holding the time-`t` state, so the + // save row records `curr`, the advance copies the new stocks `next -> curr`, + // and `curr[TIME] += dt`. + emit_save_advance(&mut f, specs, save_every, stock_offsets, ®ions); + + f.instruction(&I::Br(0)); // continue + f.instruction(&I::End); // end loop + f.instruction(&I::End); // end block + f.instruction(&I::End); // end function + f +} + +/// The Euler step: `flows`+`stocks` (the stocks program writes `next[off]`), +/// then the `prev_values` snapshot. Mirrors `vm.rs:698-708`. +fn emit_euler_step(f: &mut Function, f_flows: u32, f_stocks: u32, regions: &RunRegions) { + emit_eval_step(f, f_flows, f_stocks); + emit_prev_snapshot(f, regions); +} + +/// `eval_step` = `flows(0)` then `stocks(0)` (`vm.rs:1195`). The stocks program +/// writes each stock's integrated value into `next[off]`. +fn emit_eval_step(f: &mut Function, f_flows: u32, f_stocks: u32) { + f.instruction(&I::I32Const(0)); + f.instruction(&I::Call(f_flows)); + f.instruction(&I::I32Const(0)); + f.instruction(&I::Call(f_stocks)); +} + +/// Snapshot `prev_values := curr` and clear `use_prev_fallback` so the next +/// step's `PREVIOUS(x)` reads this step's `curr` rather than its fallback +/// (`vm.rs:705-707` for Euler; `vm.rs:781-783` / `832-834` for RK, where it runs +/// only after the end-of-step flows re-eval has restored `curr`). 
+fn emit_prev_snapshot(f: &mut Function, regions: &RunRegions) { + emit_copy_chunk(f, CURR_BASE, regions.prev_values_base, regions.n_slots); + f.instruction(&I::I32Const(0)); + f.instruction(&I::GlobalSet(G_USE_PREV_FALLBACK)); +} + +/// The method-agnostic save + advance tail (the wasm analogue of the VM's +/// `save_advance!` plus its per-step advance). Records a results row from `curr` +/// on the VM's cadence, breaks when the chunk budget is exhausted, then advances +/// by copying the new stock values `next -> curr` and stepping `curr[TIME] += dt`. +fn emit_save_advance( + f: &mut Function, + specs: &Specs, + save_every: i32, + stock_offsets: &[usize], + regions: &RunRegions, +) { + let n_slots = regions.n_slots; + + // step_accum += 1 + f.instruction(&I::LocalGet(L_STEP_ACCUM)); + f.instruction(&I::I32Const(1)); + f.instruction(&I::I32Add); + f.instruction(&I::LocalSet(L_STEP_ACCUM)); + + // save_cond = (step_accum == save_every) | (saved == 0 & time == start) + f.instruction(&I::LocalGet(L_STEP_ACCUM)); + f.instruction(&I::I32Const(save_every)); + f.instruction(&I::I32Eq); + f.instruction(&I::LocalGet(L_SAVED)); + f.instruction(&I::I32Eqz); + f.instruction(&I::I32Const(0)); + f.instruction(&I::F64Load(memarg(TIME_ADDR))); + f.instruction(&f64_const(specs.start)); + f.instruction(&I::F64Eq); + f.instruction(&I::I32And); + f.instruction(&I::I32Or); + f.instruction(&I::If(BlockType::Empty)); + + // dst = results_base + saved * stride + f.instruction(&I::I32Const(regions.results_base as i32)); + f.instruction(&I::LocalGet(L_SAVED)); + f.instruction(&I::I32Const(regions.stride as i32)); + f.instruction(&I::I32Mul); + f.instruction(&I::I32Add); + f.instruction(&I::LocalSet(L_DST)); + + // results[dst + slot*8] = curr[slot] for every slot + for slot in 0..n_slots { + f.instruction(&I::LocalGet(L_DST)); + f.instruction(&I::I32Const(0)); + f.instruction(&I::F64Load(memarg(u64::from(slot) * u64::from(SLOT_SIZE)))); + f.instruction(&I::F64Store(memarg(u64::from(slot) * 
u64::from(SLOT_SIZE)))); + } + + // saved += 1; step_accum = 0 + f.instruction(&I::LocalGet(L_SAVED)); + f.instruction(&I::I32Const(1)); + f.instruction(&I::I32Add); + f.instruction(&I::LocalSet(L_SAVED)); + f.instruction(&I::I32Const(0)); + f.instruction(&I::LocalSet(L_STEP_ACCUM)); + + // if saved >= n_chunks: break (depth 2: if -> loop -> block) + f.instruction(&I::LocalGet(L_SAVED)); + f.instruction(&I::I32Const(regions.n_chunks as i32)); + f.instruction(&I::I32GeS); + f.instruction(&I::BrIf(2)); + + f.instruction(&I::End); // end if + + // Advance: copy the freshly integrated stock values next -> curr. The + // `next` chunk's slot-0 byte base is one chunk past `curr`, i.e. the chunk + // stride (`compile_simulation` sets `next_base = stride`). + let next_base = regions.stride; + for &off in stock_offsets { + f.instruction(&I::I32Const(0)); + f.instruction(&I::I32Const(0)); + f.instruction(&I::F64Load(memarg( + u64::from(next_base) + off as u64 * u64::from(SLOT_SIZE), + ))); + f.instruction(&I::F64Store(memarg(off as u64 * u64::from(SLOT_SIZE)))); + } + + // time += dt + f.instruction(&I::I32Const(0)); + f.instruction(&I::I32Const(0)); + f.instruction(&I::F64Load(memarg(TIME_ADDR))); + f.instruction(&f64_const(specs.dt)); + f.instruction(&I::F64Add); + f.instruction(&I::F64Store(memarg(TIME_ADDR))); +} + +/// Store a compile-time constant into a `curr` slot at an absolute (module_off +/// 0) address. 
+fn store_curr_const_abs(f: &mut Function, off: usize, v: f64) { + f.instruction(&I::I32Const(0)); + f.instruction(&f64_const(v)); + f.instruction(&I::F64Store(memarg(off as u64 * u64::from(SLOT_SIZE)))); +} + +// ── Constants-override exports (Phase 7 Task 2) ─────────────────────────── +// +// `set_value(offset: i32, val: f64) -> i32` writes the override into the +// constants region (0 ok / 1 when `offset` is out of range or not overridable); +// `reset() -> ()` resets the run state without clearing the region (overrides +// persist across reset, like the VM); `clear_values() -> ()` restores the +// compiled defaults. The constants region is `n_slots`-wide and indexed by +// absolute slab offset (so a redirected `AssignConstCurr` reads it with the same +// `module_off`-relative addressing the slab uses); a parallel `n_slots`-byte +// validity region (1 = overridable) is what `set_value` checks. + +/// A `MemArg` for a single-byte access (the validity region), align 0. +fn byte_memarg(addr: u64) -> wasm_encoder::MemArg { + wasm_encoder::MemArg { + offset: addr, + align: 0, + memory_index: 0, + } +} + +// `set_value`'s i32 params: the absolute slab offset and (param 1) the f64 +// value. Param 0 is the offset. +const SV_OFFSET: u32 = 0; +const SV_VALUE: u32 = 1; + +/// Emit `set_value(offset: i32, val: f64) -> i32`: write `const_region[offset] = +/// val` and return 0 when `offset` is a valid overridable slot, else return 1 +/// without writing. Validity is `0 <= offset < n_slots` AND `valid[offset] != 0` +/// (the byte the data segment set for each overridable absolute offset). This +/// mirrors the VM's `set_value_by_offset` (`vm.rs:1037-1052`): an out-of-range or +/// non-constant offset is rejected (the VM returns `Err`), a valid one applies +/// the override (which persists across `reset`). 
+fn emit_set_value(n_slots: u32, const_region_base: u32, const_valid_base: u32) -> Function { + let mut f = Function::new([]); + + // if (offset < 0) | (offset >= n_slots): return 1 + f.instruction(&I::LocalGet(SV_OFFSET)); + f.instruction(&I::I32Const(0)); + f.instruction(&I::I32LtS); + f.instruction(&I::LocalGet(SV_OFFSET)); + f.instruction(&I::I32Const(n_slots as i32)); + f.instruction(&I::I32GeS); + f.instruction(&I::I32Or); + f.instruction(&I::If(BlockType::Empty)); + f.instruction(&I::I32Const(1)); + f.instruction(&I::Return); + f.instruction(&I::End); + + // if valid[offset] == 0: return 1 (valid byte at const_valid_base + offset) + f.instruction(&I::LocalGet(SV_OFFSET)); + f.instruction(&I::I32Load8U(byte_memarg(u64::from(const_valid_base)))); + f.instruction(&I::I32Eqz); + f.instruction(&I::If(BlockType::Empty)); + f.instruction(&I::I32Const(1)); + f.instruction(&I::Return); + f.instruction(&I::End); + + // const_region[offset] = val (f64 at const_region_base + offset*8) + f.instruction(&I::LocalGet(SV_OFFSET)); + f.instruction(&I::I32Const(SLOT_SIZE as i32)); + f.instruction(&I::I32Mul); + f.instruction(&I::LocalGet(SV_VALUE)); + f.instruction(&I::F64Store(memarg(u64::from(const_region_base)))); + + // return 0 + f.instruction(&I::I32Const(0)); + f.instruction(&I::End); + f +} + +/// Emit `reset() -> ()`: reset the run state so the next `run` re-runs initials +/// and the loop from t=start. The wasm `run` already re-seeds the time slots and +/// re-runs initials on every call and uses fresh i32 locals for the chunk/step +/// counters, so the only cross-run state is the `use_prev_fallback` global, which +/// `run` clears after the first `prev_values` snapshot. Setting it back to 1 here +/// is the analogue of the VM's `reset` clearing `prev_values_valid` (`vm.rs:976-989`), +/// and -- like the VM -- it deliberately does NOT touch the constants region, so +/// overrides persist across reset. 
+fn emit_reset() -> Function { + let mut f = Function::new([]); + f.instruction(&I::I32Const(1)); + f.instruction(&I::GlobalSet(G_USE_PREV_FALLBACK)); + f.instruction(&I::End); + f +} + +/// Emit `clear_values() -> ()`: restore each overridable constant to its +/// compiled-default literal by writing the defaults back into the constants +/// region (the VM's `clear_values`, `vm.rs:1055-1062`). The defaults are +/// compile-time constants, so this is a straight-line sequence of `f64.store`s -- +/// one per overridable absolute offset. The data segment also writes these at +/// instantiation; `clear_values` lets a host undo a `set_value` without +/// re-instantiating the module. +fn emit_clear_values(const_region_base: u32, overridable_defaults: &[(usize, f64)]) -> Function { + let mut f = Function::new([]); + for &(abs_off, default) in overridable_defaults { + f.instruction(&I::I32Const(0)); + f.instruction(&f64_const(default)); + f.instruction(&I::F64Store(memarg( + u64::from(const_region_base) + abs_off as u64 * u64::from(SLOT_SIZE), + ))); + } + f.instruction(&I::End); + f +} + +/// The active `DataSection` payloads that initialize the constants region and +/// its validity bytes at instantiation: for each overridable absolute offset, the +/// f64 default written into the constants region and a `1` validity byte. Sparse +/// (one segment per overridable offset), so a model with no overridable constants +/// produces an empty list (no segments). +struct ConstRegionInit { + /// `(byte address within the constants region, the 8 LE bytes of the default)`. + value_segments: Vec<(u32, [u8; 8])>, + /// `byte address within the validity region` (the byte written is always 1). + valid_segments: Vec, +} + +/// Build the constants-region init payloads from the overridable defaults. 
+fn build_const_region_init( + overridable_defaults: &[(usize, f64)], + const_region_base: u32, + const_valid_base: u32, +) -> ConstRegionInit { + let mut value_segments = Vec::with_capacity(overridable_defaults.len()); + let mut valid_segments = Vec::with_capacity(overridable_defaults.len()); + for &(abs_off, default) in overridable_defaults { + let value_addr = const_region_base + abs_off as u32 * SLOT_SIZE; + value_segments.push((value_addr, default.to_le_bytes())); + valid_segments.push(const_valid_base + abs_off as u32); + } + ConstRegionInit { + value_segments, + valid_segments, + } +} + +// ── RK loop primitives ──────────────────────────────────────────────────── +// +// Every RK memory slot lives at a constant byte address (`base + idx*8`), so the +// dynamic part of the address is always `i32.const 0` and the constant +// `memarg.offset` carries `base + idx*8`. `f64.store` wants `[addr_i32, +// value_f64]`, so the store helpers push the `i32.const 0` address first, then +// the caller leaves the value on the stack. + +/// `i32.const 0; f64.load[base + idx*8]` -- push the f64 at slot `idx` of the +/// region whose slot-0 byte base is `base`. +fn emit_load_slot(f: &mut Function, base: u32, idx: u32) { + f.instruction(&I::I32Const(0)); + f.instruction(&I::F64Load(memarg( + u64::from(base) + u64::from(idx) * u64::from(SLOT_SIZE), + ))); +} + +/// Push the store *address* half of an RK slot store: a bare `i32.const 0`. +/// Every RK slot's full byte address (`base + idx*8`) rides in the matching +/// [`emit_store_slot_value`]'s `memarg.offset`, so the dynamic address is always +/// the constant 0 -- this half therefore needs no `base`/`idx`. Kept as the +/// named symmetry partner of `emit_store_slot_value` (which it precedes at every +/// call site, since `f64.store` consumes `[addr_i32, value_f64]`): inlining only +/// this half would scatter unexplained `i32.const 0`s whose absolute-addressing +/// intent is exactly what the pairing documents. 
+fn emit_store_slot_addr(f: &mut Function) { + f.instruction(&I::I32Const(0)); +} + +/// `f64.store[base + idx*8]` -- consume `[addr_i32, value_f64]` already on the +/// stack (the address from [`emit_store_slot_addr`]). +fn emit_store_slot_value(f: &mut Function, base: u32, idx: u32) { + f.instruction(&I::F64Store(memarg( + u64::from(base) + u64::from(idx) * u64::from(SLOT_SIZE), + ))); +} + +/// Emit `L_RK_S := next[off] - curr[off]` -- the stock's stage delta `s_k` +/// (`vm.rs`: `let sN = next[off] - curr[off]`). Computed before any of the +/// stage's writes clobber `curr[off]`. `next_base` is `n_slots*8`. +/// +/// `off` is the full-width absolute slot offset (`u32`, like the Euler advance's +/// `emit_save_advance`). A `u16` here would silently truncate a stock at slot +/// 65536 or above -- reachable in a large nested model (each submodel / SMOOTH / +/// DELAY instance adds slots, with no cap on total `n_slots`) -- to +/// `off & 0xFFFF`, clobbering an unrelated slot (offset 65536 maps to slot 0, +/// TIME). +fn emit_compute_stage_delta(f: &mut Function, next_base: u32, off: u32) { + emit_load_slot(f, next_base, off); + emit_load_slot(f, CURR_BASE, off); + f.instruction(&I::F64Sub); + f.instruction(&I::LocalSet(L_RK_S)); +} + +/// The RK4 step (`vm.rs:712-787`): four stages over the compile-time stock +/// offsets, the time juggling, the final flows-only re-eval with restored +/// `curr`, and the `prev_values` snapshot. `next[off]` ends holding the new +/// integrated stock value; `curr` ends holding the time-`t` state. +fn emit_rk4_step( + f: &mut Function, + f_flows: u32, + f_stocks: u32, + dt: f64, + stock_offsets: &[usize], + regions: &RunRegions, +) { + let (saved, accum) = (regions.rk_saved_base, regions.rk_accum_base); + // The `next` chunk's slot-0 byte base == the chunk stride (`next` sits one + // chunk past `curr`); see `emit_save_advance`. 
+ let next_base = regions.stride; + + // saved_time = curr[TIME] + f.instruction(&I::I32Const(0)); + f.instruction(&I::F64Load(memarg(TIME_ADDR))); + f.instruction(&I::LocalSet(L_SAVED_TIME)); + + // Stage 1 at (t, y): s1 = next-curr; saved=curr; accum=s1; curr=saved+s1*0.5 + emit_eval_step(f, f_flows, f_stocks); + for (i, &off) in stock_offsets.iter().enumerate() { + let (i, off) = (i as u32, off as u32); + emit_compute_stage_delta(f, next_base, off); + // saved[i] = curr[off] + emit_store_slot_addr(f); + emit_load_slot(f, CURR_BASE, off); + emit_store_slot_value(f, saved, i); + // accum[i] = s1 + emit_store_slot_addr(f); + f.instruction(&I::LocalGet(L_RK_S)); + emit_store_slot_value(f, accum, i); + // curr[off] = saved[i] + s1*0.5 + emit_store_slot_addr(f); + emit_load_slot(f, saved, i); + f.instruction(&I::LocalGet(L_RK_S)); + f.instruction(&f64_const(0.5)); + f.instruction(&I::F64Mul); + f.instruction(&I::F64Add); + emit_store_slot_value(f, CURR_BASE, off); + } + // curr[TIME] = saved_time + dt*0.5 + emit_store_time_offset(f, dt * 0.5); + + // Stage 2 at (t+dt/2, y+s1/2): s2 = next-curr; accum+=2*s2; curr=saved+s2*0.5 + emit_eval_step(f, f_flows, f_stocks); + for (i, &off) in stock_offsets.iter().enumerate() { + let (i, off) = (i as u32, off as u32); + emit_compute_stage_delta(f, next_base, off); + // accum[i] += 2*s2 + emit_store_slot_addr(f); + emit_load_slot(f, accum, i); + f.instruction(&I::LocalGet(L_RK_S)); + f.instruction(&f64_const(2.0)); + f.instruction(&I::F64Mul); + f.instruction(&I::F64Add); + emit_store_slot_value(f, accum, i); + // curr[off] = saved[i] + s2*0.5 + emit_store_slot_addr(f); + emit_load_slot(f, saved, i); + f.instruction(&I::LocalGet(L_RK_S)); + f.instruction(&f64_const(0.5)); + f.instruction(&I::F64Mul); + f.instruction(&I::F64Add); + emit_store_slot_value(f, CURR_BASE, off); + } + + // Stage 3 at (t+dt/2, y+s2/2): s3 = next-curr; accum+=2*s3; curr=saved+s3 + emit_eval_step(f, f_flows, f_stocks); + for (i, &off) in 
stock_offsets.iter().enumerate() { + let (i, off) = (i as u32, off as u32); + emit_compute_stage_delta(f, next_base, off); + // accum[i] += 2*s3 + emit_store_slot_addr(f); + emit_load_slot(f, accum, i); + f.instruction(&I::LocalGet(L_RK_S)); + f.instruction(&f64_const(2.0)); + f.instruction(&I::F64Mul); + f.instruction(&I::F64Add); + emit_store_slot_value(f, accum, i); + // curr[off] = saved[i] + s3 + emit_store_slot_addr(f); + emit_load_slot(f, saved, i); + f.instruction(&I::LocalGet(L_RK_S)); + f.instruction(&I::F64Add); + emit_store_slot_value(f, CURR_BASE, off); + } + // curr[TIME] = saved_time + dt + emit_store_time_offset(f, dt); + + // Stage 4 at (t+dt, y+s3): s4 = next-curr; accum+=s4; + // next[off] = saved[i] + accum[i]/6; curr[off] = saved[i] + emit_eval_step(f, f_flows, f_stocks); + for (i, &off) in stock_offsets.iter().enumerate() { + let (i, off) = (i as u32, off as u32); + emit_compute_stage_delta(f, next_base, off); + // accum[i] += s4 + emit_store_slot_addr(f); + emit_load_slot(f, accum, i); + f.instruction(&I::LocalGet(L_RK_S)); + f.instruction(&I::F64Add); + emit_store_slot_value(f, accum, i); + // next[off] = saved[i] + accum[i]/6.0 + emit_store_slot_addr(f); + emit_load_slot(f, saved, i); + emit_load_slot(f, accum, i); + f.instruction(&f64_const(6.0)); + f.instruction(&I::F64Div); + f.instruction(&I::F64Add); + emit_store_slot_value(f, next_base, off); + // curr[off] = saved[i] (restore the original) + emit_store_slot_addr(f); + emit_load_slot(f, saved, i); + emit_store_slot_value(f, CURR_BASE, off); + } + + // curr[TIME] = saved_time ; next[TIME] = saved_time + dt + emit_restore_and_advance_time(f, dt, regions); + + // Final flows-only re-eval with the restored curr, so curr's aux/flow slots + // hold time-`t` values (stages 2-4 clobbered them). Load-bearing for both + // the saved output row and the PREVIOUS snapshot (`vm.rs:769-778`). 
+ f.instruction(&I::I32Const(0)); + f.instruction(&I::Call(f_flows)); + + emit_prev_snapshot(f, regions); +} + +/// The RK2 (Heun) step (`vm.rs:788-838`): two stages, the time juggling, the +/// final flows-only re-eval, and the `prev_values` snapshot. +fn emit_rk2_step( + f: &mut Function, + f_flows: u32, + f_stocks: u32, + dt: f64, + stock_offsets: &[usize], + regions: &RunRegions, +) { + let (saved, accum) = (regions.rk_saved_base, regions.rk_accum_base); + // The `next` chunk's slot-0 byte base == the chunk stride; see + // `emit_save_advance`. + let next_base = regions.stride; + + // saved_time = curr[TIME] + f.instruction(&I::I32Const(0)); + f.instruction(&I::F64Load(memarg(TIME_ADDR))); + f.instruction(&I::LocalSet(L_SAVED_TIME)); + + // Stage 1 at (t, y): s1 = next-curr; saved=curr; accum=s1; curr=saved+s1 + emit_eval_step(f, f_flows, f_stocks); + for (i, &off) in stock_offsets.iter().enumerate() { + let (i, off) = (i as u32, off as u32); + emit_compute_stage_delta(f, next_base, off); + // saved[i] = curr[off] + emit_store_slot_addr(f); + emit_load_slot(f, CURR_BASE, off); + emit_store_slot_value(f, saved, i); + // accum[i] = s1 + emit_store_slot_addr(f); + f.instruction(&I::LocalGet(L_RK_S)); + emit_store_slot_value(f, accum, i); + // curr[off] = saved[i] + s1 (full Euler step for the trial point) + emit_store_slot_addr(f); + emit_load_slot(f, saved, i); + f.instruction(&I::LocalGet(L_RK_S)); + f.instruction(&I::F64Add); + emit_store_slot_value(f, CURR_BASE, off); + } + // curr[TIME] = saved_time + dt + emit_store_time_offset(f, dt); + + // Stage 2 at (t+dt, y+s1): s2 = next-curr; accum+=s2; + // next[off] = saved[i] + accum[i]/2; curr[off] = saved[i] + emit_eval_step(f, f_flows, f_stocks); + for (i, &off) in stock_offsets.iter().enumerate() { + let (i, off) = (i as u32, off as u32); + emit_compute_stage_delta(f, next_base, off); + // accum[i] += s2 + emit_store_slot_addr(f); + emit_load_slot(f, accum, i); + f.instruction(&I::LocalGet(L_RK_S)); + 
f.instruction(&I::F64Add); + emit_store_slot_value(f, accum, i); + // next[off] = saved[i] + accum[i]/2.0 + emit_store_slot_addr(f); + emit_load_slot(f, saved, i); + emit_load_slot(f, accum, i); + f.instruction(&f64_const(2.0)); + f.instruction(&I::F64Div); + f.instruction(&I::F64Add); + emit_store_slot_value(f, next_base, off); + // curr[off] = saved[i] (restore the original) + emit_store_slot_addr(f); + emit_load_slot(f, saved, i); + emit_store_slot_value(f, CURR_BASE, off); + } + + // curr[TIME] = saved_time ; next[TIME] = saved_time + dt + emit_restore_and_advance_time(f, dt, regions); + + // Final flows-only re-eval with restored curr (see the RK4 comment). + f.instruction(&I::I32Const(0)); + f.instruction(&I::Call(f_flows)); + + emit_prev_snapshot(f, regions); +} + +/// `curr[TIME] = saved_time + offset` -- the trial-point time the stages run at +/// (`saved_time + dt*0.5` or `saved_time + dt`). +fn emit_store_time_offset(f: &mut Function, offset: f64) { + f.instruction(&I::I32Const(0)); + f.instruction(&I::LocalGet(L_SAVED_TIME)); + f.instruction(&f64_const(offset)); + f.instruction(&I::F64Add); + f.instruction(&I::F64Store(memarg(TIME_ADDR))); +} + +/// Restore `curr[TIME] = saved_time` and set `next[TIME] = saved_time + dt` +/// (`vm.rs:759-760` / `818-819`), so the final flows re-eval runs at time `t`. +/// `next[TIME]` is set for faithfulness with the VM even though the wasm +/// save/advance tail advances via `curr[TIME] += dt` rather than reading it. 
+fn emit_restore_and_advance_time(f: &mut Function, dt: f64, regions: &RunRegions) { + let next_time_addr = u64::from(regions.n_slots) * u64::from(SLOT_SIZE) + TIME_ADDR; + // curr[TIME] = saved_time + f.instruction(&I::I32Const(0)); + f.instruction(&I::LocalGet(L_SAVED_TIME)); + f.instruction(&I::F64Store(memarg(TIME_ADDR))); + // next[TIME] = saved_time + dt + f.instruction(&I::I32Const(0)); + f.instruction(&I::LocalGet(L_SAVED_TIME)); + f.instruction(&f64_const(dt)); + f.instruction(&I::F64Add); + f.instruction(&I::F64Store(memarg(next_time_addr))); +} + +/// Emit an unrolled `dst[0..n_slots] := src[0..n_slots]` f64 copy between two +/// linear-memory regions whose slot-0 byte bases are `src_base`/`dst_base`. Used +/// for the whole-chunk snapshots (`initial_values := curr`, `prev_values := +/// curr`), each `n_slots` wide. The unroll matches the per-slot store style the +/// rest of `run` uses; `n_slots` is small for scalar models. +fn emit_copy_chunk(f: &mut Function, src_base: u32, dst_base: u32, n_slots: u32) { + for slot in 0..n_slots { + let slot_off = u64::from(slot) * u64::from(SLOT_SIZE); + // f64.store wants [addr_i32, value_f64]; the constant `memarg.offset` + // carries each region's base, so the dynamic address is a constant 0. + f.instruction(&I::I32Const(0)); + f.instruction(&I::I32Const(0)); + f.instruction(&I::F64Load(memarg(u64::from(src_base) + slot_off))); + f.instruction(&I::F64Store(memarg(u64::from(dst_base) + slot_off))); + } +} + +/// Inputs to [`assemble_simulation`], grouped to keep the signature small now +/// that the module carries a per-instance function-triple (one per +/// `(model, input_set)`) plus a `run` driver, and possibly several GF regions. +struct AssembleParts<'a> { + helpers: BuiltHelpers, + /// The instances' program functions in `instance_order`, flattened as + /// `[initials_0, flows_0, stocks_0, initials_1, ...]`. `instance_input_counts` + /// (same instance order) gives each triple's f64 input-param count. 
+ program_fns: Vec, + run_fn: Function, + /// `set_value(offset: i32, val: f64) -> i32` (Phase 7 Task 2). + set_value_fn: Function, + /// `reset() -> ()` (Phase 7 Task 2). + reset_fn: Function, + /// `clear_values() -> ()` (Phase 7 Task 2). + clear_values_fn: Function, + /// Module-input parameter count per instance, in the same order the triples + /// appear in `program_fns`. Drives the per-triple wasm type + /// (`(i32, f64*k) -> ()`). + instance_input_counts: &'a [u32], + pages: u32, + n_slots: u32, + n_chunks: u32, + results_base: u32, + /// Every GF-bearing instance's region image, for the active `DataSection` + /// segments (each instance's directory + data sit at distinct bases). + gf_regions: &'a [&'a GfRegions], + /// The constants-override region init payloads (Phase 7 Task 2): sparse + /// active `DataSection` segments seeding each overridable slot's f64 default + /// and its validity byte. + const_init: &'a ConstRegionInit, +} + +/// Assemble the simulation module: types, functions, memory, globals, exports, +/// code, and (when present) the GF data segments. Layout: the emitted helper +/// functions ([`build_helpers`]) lead the function/code sections (indices +/// `0..n_helpers`); then one `[initials, flows, stocks]` triple per module +/// instance (in `instance_order`); then `run` last. Exports `memory`, `run`, and +/// the three self-describing i32 geometry globals. Each GF-bearing instance +/// contributes two active `DataSection` segments (its directory + data) at its +/// own bases. 
+fn assemble_simulation(parts: AssembleParts) -> Vec { + let AssembleParts { + helpers, + program_fns, + run_fn, + set_value_fn, + reset_fn, + clear_values_fn, + instance_input_counts, + pages, + n_slots, + n_chunks, + results_base, + gf_regions, + const_init, + } = parts; + + let mut wasm = WasmModule::new(); + let n_helpers = helpers.functions.len() as u32; + let n_instances = instance_input_counts.len() as u32; + // Function layout: helpers, the per-instance triples, then `run`, then the + // three constants-override exports (`set_value`/`reset`/`clear_values`). + let run_fn_index = n_helpers + n_instances * FUNCS_PER_INSTANCE; + let set_value_fn_index = run_fn_index + 1; + let reset_fn_index = run_fn_index + 2; + let clear_values_fn_index = run_fn_index + 3; + + // Type section: `run`'s `() -> ()` first, then one opcode-program type per + // *distinct* module-input count (`(i32, f64*k) -> ()`, sorted), then the + // helper types, then the `set_value` type (`(i32, f64) -> i32`). + // `reset`/`clear_values` reuse `TYPE_RUN_FN`. `opcode_type_for` maps an + // instance's `n_inputs` to its type index; a helper at function index `i` + // uses the type appended after those. 
+ let mut distinct_inputs: Vec = instance_input_counts.to_vec(); + distinct_inputs.sort_unstable(); + distinct_inputs.dedup(); + let opcode_type_index: HashMap = distinct_inputs + .iter() + .enumerate() + .map(|(i, &k)| (k, TYPE_RUN_FN + 1 + i as u32)) + .collect(); + let first_helper_type = TYPE_RUN_FN + 1 + distinct_inputs.len() as u32; + let set_value_type = first_helper_type + helpers.functions.len() as u32; + + let mut types = TypeSection::new(); + types.ty().function([], []); // TYPE_RUN_FN: () -> () + for &k in &distinct_inputs { + // (module_off: i32, in_0..in_{k-1}: f64) -> () + let mut params: Vec = Vec::with_capacity(1 + k as usize); + params.push(ValType::I32); + params.extend(std::iter::repeat_n(ValType::F64, k as usize)); + types.ty().function(params, []); + } + for hf in &helpers.functions { + types.ty().function(hf.params.clone(), hf.results.clone()); + } + // `set_value(offset: i32, val: f64) -> i32`. + types + .ty() + .function([ValType::I32, ValType::F64], [ValType::I32]); + wasm.section(&types); + + // Function section: helpers first (indices `0..n_helpers`), then each + // instance's three program functions (typed by that instance's `n_inputs`), + // then `run`, then `set_value`/`reset`/`clear_values`. 
+ let mut functions = FunctionSection::new(); + for (i, _) in helpers.functions.iter().enumerate() { + functions.function(first_helper_type + i as u32); + } + for &k in instance_input_counts { + let ty = opcode_type_index[&k]; + functions.function(ty); // initials + functions.function(ty); // flows + functions.function(ty); // stocks + } + functions.function(TYPE_RUN_FN); // run + functions.function(set_value_type); // set_value + functions.function(TYPE_RUN_FN); // reset + functions.function(TYPE_RUN_FN); // clear_values + wasm.section(&functions); + + let mut memories = MemorySection::new(); + memories.memory(MemoryType { + minimum: u64::from(pages), + maximum: None, + memory64: false, + shared: false, + page_size_log2: None, + }); + wasm.section(&memories); + + let i32_global = || GlobalType { + val_type: ValType::I32, + mutable: false, + shared: false, + }; + let mut globals = GlobalSection::new(); + globals.global(i32_global(), &ConstExpr::i32_const(n_slots as i32)); + globals.global(i32_global(), &ConstExpr::i32_const(n_chunks as i32)); + globals.global(i32_global(), &ConstExpr::i32_const(results_base as i32)); + // `use_prev_fallback`: the only mutable global. Init 1 so `LoadPrev` returns + // its fallback until the first `prev_values` snapshot clears it (`vm.rs:668`). 
+ globals.global( + GlobalType { + val_type: ValType::I32, + mutable: true, + shared: false, + }, + &ConstExpr::i32_const(1), + ); + wasm.section(&globals); + + let mut exports = ExportSection::new(); + exports.export("run", ExportKind::Func, run_fn_index); + exports.export("set_value", ExportKind::Func, set_value_fn_index); + exports.export("reset", ExportKind::Func, reset_fn_index); + exports.export("clear_values", ExportKind::Func, clear_values_fn_index); + exports.export("memory", ExportKind::Memory, 0); + exports.export("n_slots", ExportKind::Global, G_N_SLOTS); + exports.export("n_chunks", ExportKind::Global, G_N_CHUNKS); + exports.export("results_offset", ExportKind::Global, G_RESULTS_OFFSET); + wasm.section(&exports); + + // Code section order must match the function section: helper bodies, then the + // per-instance program functions (in `program_fns` order), then `run`, then + // `set_value`/`reset`/`clear_values`. + let mut code = CodeSection::new(); + for hf in &helpers.functions { + code.function(&hf.body); + } + for program in &program_fns { + code.function(program); + } + code.function(&run_fn); + code.function(&set_value_fn); + code.function(&reset_fn); + code.function(&clear_values_fn); + wasm.section(&code); + + // The GF directory + data regions and the constants-override init values + // are read-only-at-instantiation constants; active data segments write each + // at its byte address when the module is instantiated. A module has at most + // one data section, so the GF regions and the constants-override init share + // it. The data section must follow the code section per the wasm binary order. 
+ let has_const_init = + !const_init.value_segments.is_empty() || !const_init.valid_segments.is_empty(); + if !gf_regions.is_empty() || has_const_init { + let mut data = DataSection::new(); + for gf in gf_regions { + data.active( + 0, + &ConstExpr::i32_const(gf.directory_base as i32), + gf.directory.iter().copied(), + ); + data.active( + 0, + &ConstExpr::i32_const(gf.data_base as i32), + gf.data.iter().copied(), + ); + } + // The constants region's per-slot default (8 LE bytes each) and its + // validity bytes (a single `1` each), one active segment per overridable + // absolute offset. + for &(addr, bytes) in &const_init.value_segments { + data.active(0, &ConstExpr::i32_const(addr as i32), bytes.iter().copied()); + } + for &addr in &const_init.valid_segments { + data.active(0, &ConstExpr::i32_const(addr as i32), [1u8].iter().copied()); + } + wasm.section(&data); + } + + wasm.finish() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::common::{Canonical, Ident}; + use crate::compat::open_xmile; + use crate::db::{SimlinDb, compile_project_incremental, sync_from_datamodel_incremental}; + use crate::vm::Vm; + use checked::Store; + use std::io::BufReader; + use wasm::validate; + + const POPULATION_XMILE: &str = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../default_projects/population/model.xmile" + ); + + /// A graphical function whose table is `knots`. `Continuous` kind, with the + /// x-scale spanning the knots' x-range. 
+ fn gf_from_knots(knots: &[(f64, f64)]) -> crate::datamodel::GraphicalFunction { + use crate::datamodel; + let x_points: Vec = knots.iter().map(|&(x, _)| x).collect(); + let y_points: Vec = knots.iter().map(|&(_, y)| y).collect(); + datamodel::GraphicalFunction { + kind: datamodel::GraphicalFunctionKind::Continuous, + x_points: Some(x_points.clone()), + y_points, + x_scale: datamodel::GraphicalFunctionScale { + min: x_points.first().copied().unwrap_or(0.0), + max: x_points.last().copied().unwrap_or(1.0), + }, + y_scale: datamodel::GraphicalFunctionScale { min: 0.0, max: 1.0 }, + } + } + + /// Decode a GF directory's `n`th entry from `directory` bytes: the absolute + /// data byte offset and the point count. + fn decode_dir_entry(directory: &[u8], n: usize) -> (usize, usize) { + let base = n * GF_DIRECTORY_ENTRY_BYTES as usize; + let data_off = i32::from_le_bytes(directory[base..base + 4].try_into().unwrap()) as usize; + let n_points = + i32::from_le_bytes(directory[base + 4..base + 8].try_into().unwrap()) as usize; + (data_off, n_points) + } + + /// Decode the `(x, y)` knots stored at relative `data` offset `rel_off` for + /// a table of `n_points` (interleaved f64 LE x,y pairs). + fn decode_knots(data: &[u8], rel_off: usize, n_points: usize) -> Vec<(f64, f64)> { + (0..n_points) + .map(|k| { + let a = rel_off + k * GF_KNOT_BYTES as usize; + let x = f64::from_le_bytes(data[a..a + 8].try_into().unwrap()); + let y = f64::from_le_bytes(data[a + 8..a + 16].try_into().unwrap()); + (x, y) + }) + .collect() + } + + /// Task 1 (pure layout): `build_gf_regions` concatenates several tables into + /// the data region in order, and the directory maps each global table index + /// to its *absolute* data byte offset + point count. The data offset for + /// table `t` must be `data_base` plus the byte span of all earlier tables. 
+ #[test] + fn build_gf_regions_lays_out_directory_and_data() { + let region_base = 4096u32; + let tables = vec![ + vec![(0.0, 10.0), (1.0, 20.0), (2.5, 5.0)], + vec![(-1.0, 0.5)], + vec![(0.0, 0.0), (10.0, 100.0)], + ]; + let regions = build_gf_regions(&tables, region_base) + .expect("layout must succeed") + .expect("non-empty tables yield Some"); + + // Directory immediately at region_base; data follows the directory. + assert_eq!(regions.directory_base, region_base); + let directory_bytes = tables.len() as u32 * GF_DIRECTORY_ENTRY_BYTES; + assert_eq!(regions.data_base, region_base + directory_bytes); + assert_eq!(regions.directory.len(), directory_bytes as usize); + + // Walk the directory; each table's data offset is absolute and its + // knots round-trip exactly. The running expected offset is data_base + // plus the byte span of all previously-laid tables. + let mut expected_abs = regions.data_base as usize; + let mut total_knot_bytes = 0usize; + for (t, table) in tables.iter().enumerate() { + let (data_off, n_points) = decode_dir_entry(®ions.directory, t); + assert_eq!(n_points, table.len(), "table {t} point count"); + assert_eq!(data_off, expected_abs, "table {t} absolute data offset"); + + let rel = data_off - regions.data_base as usize; + assert_eq!( + decode_knots(®ions.data, rel, n_points).as_slice(), + table.as_slice(), + "table {t} knots round-trip" + ); + + let span = table.len() * GF_KNOT_BYTES as usize; + expected_abs += span; + total_knot_bytes += span; + } + assert_eq!( + regions.total_bytes as usize, + directory_bytes as usize + total_knot_bytes, + "total span covers directory + all knots" + ); + } + + /// Task 3 (pure serializer): a `WasmLayout` round-trips through + /// `serialize`/`deserialize` -- the geometry and the full name->offset map are + /// recovered exactly. The GF offsets are not part of the wire format (a host + /// reads results by name), so they come back as 0. 
+ #[test] + fn wasm_layout_serialize_round_trips() { + let layout = WasmLayout { + n_slots: 7, + n_chunks: 101, + results_offset: 112, + gf_directory_offset: 4096, + gf_data_offset: 4104, + var_offsets: vec![ + ("time".to_string(), 0), + ("population".to_string(), 4), + ("a_var_with_a_longer_name".to_string(), 6), + ], + }; + let bytes = layout.serialize(); + let back = WasmLayout::deserialize(&bytes).expect("round-trip must succeed"); + assert_eq!(back.n_slots, 7); + assert_eq!(back.n_chunks, 101); + assert_eq!(back.results_offset, 112); + assert_eq!(back.var_offsets, layout.var_offsets); + // The GF offsets are not serialized; they reconstruct as 0. + assert_eq!(back.gf_directory_offset, 0); + assert_eq!(back.gf_data_offset, 0); + } + + /// Task 3 (serializer robustness): a truncated buffer deserializes to `None` + /// rather than panicking, so a host handed a corrupt buffer fails cleanly. + #[test] + fn wasm_layout_deserialize_truncated_is_none() { + let layout = WasmLayout { + n_slots: 2, + n_chunks: 3, + results_offset: 32, + gf_directory_offset: 0, + gf_data_offset: 0, + var_offsets: vec![("x".to_string(), 0), ("y".to_string(), 1)], + }; + let bytes = layout.serialize(); + // Every strict prefix of a valid buffer must fail to parse (each cuts off + // a length-prefixed field mid-way). + for cut in 0..bytes.len() { + assert!( + WasmLayout::deserialize(&bytes[..cut]).is_none(), + "a buffer truncated to {cut} bytes must not deserialize" + ); + } + // The full buffer parses. + assert!(WasmLayout::deserialize(&bytes).is_some()); + } + + /// Task 1 (pure layout): an empty table list yields no regions and no + /// growth, so a model without graphical functions is unaffected. 
+ #[test] + fn build_gf_regions_empty_is_none() { + assert!( + build_gf_regions(&[], 4096) + .expect("layout must succeed") + .is_none(), + "no tables -> no GF regions" + ); + } + + /// Task 1 (data-section round-trip): the GF regions reach the instantiated + /// module's linear memory via the active `DataSection`, at the bases the + /// directory advertises. Reads the directory entry for table 0 from memory, + /// follows its absolute data offset, and asserts the `(x, y)` knots are + /// present with the right count -- the contract the `Lookup` opcode (Task 3) + /// relies on. (Exercised end-to-end through a GF *model* once the opcode + /// lowers, in `compile_simulation_gf_lookup_modes_match_vm`.) + #[test] + fn assembled_module_initializes_gf_regions_in_memory() { + let knots = [(0.0, 10.0), (1.0, 20.0), (2.5, 5.0), (4.0, 40.0)]; + let region_base = WASM_PAGE_SIZE; // one page in, comfortably past slot 0 + let regions = build_gf_regions(std::slice::from_ref(&knots.to_vec()), region_base) + .expect("layout") + .expect("non-empty"); + + // A minimal module: one empty exported `run` (so the assembler shape is + // exercised) is unnecessary here -- assert directly that the active data + // segments initialize memory. Assemble via the production assembler with + // a single root instance of three empty (0-input) program functions. + let helpers = build_helpers(); + let empty = || { + let mut f = Function::new([]); + f.instruction(&I::End); + f + }; + let pages = (region_base + regions.total_bytes) + .div_ceil(WASM_PAGE_SIZE) + .max(1); + let empty_const_init = ConstRegionInit { + value_segments: Vec::new(), + valid_segments: Vec::new(), + }; + let wasm = assemble_simulation(AssembleParts { + helpers, + program_fns: vec![empty(), empty(), empty()], + run_fn: empty(), + // Empty (no-op) override functions: this test only checks the GF data + // segments, so the override exports are present but trivial. 
+ set_value_fn: { + let mut f = Function::new([]); + // A `(i32, f64) -> i32` body must leave an i32 on the stack. + f.instruction(&I::I32Const(0)); + f.instruction(&I::End); + f + }, + reset_fn: empty(), + clear_values_fn: empty(), + instance_input_counts: &[0], + pages, + n_slots: 0, + n_chunks: 0, + results_base: 0, + gf_regions: &[®ions], + const_init: &empty_const_init, + }); + + let info = validate(&wasm).expect("module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let mem = store + .instance_export(inst, "memory") + .unwrap() + .as_mem() + .unwrap(); + + let dir_off = regions.directory_base as usize; + let (data_off, n_points, flat) = store.mem_access_mut_slice(mem, |bytes| { + let data_off = + i32::from_le_bytes(bytes[dir_off..dir_off + 4].try_into().unwrap()) as usize; + let n_points = + i32::from_le_bytes(bytes[dir_off + 4..dir_off + 8].try_into().unwrap()) as usize; + let flat: Vec = (0..n_points * 2) + .map(|i| { + let a = data_off + i * 8; + f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap()) + }) + .collect(); + (data_off, n_points, flat) + }); + + assert_eq!(n_points, knots.len(), "directory point count"); + assert_eq!( + data_off, regions.data_base as usize, + "table 0's data offset is the start of the data region" + ); + for (k, &(x, y)) in knots.iter().enumerate() { + assert_eq!(flat[2 * k], x, "knot {k} x"); + assert_eq!(flat[2 * k + 1], y, "knot {k} y"); + } + } + + /// Task 3 (end-to-end): a model with a graphical-function variable looked up + /// in all three modes -- `LOOKUP` (Interpolate), `LOOKUP FORWARD`, and + /// `LOOKUP BACKWARD` -- matches the VM at every saved step. The lookup index + /// is `TIME - 1`, which sweeps the table's x-domain plus a below-range + /// margin (negative at t=0) and an above-range margin, so the recorded + /// series exercise below/at-knot/between/above across the run. 
+ #[test] + fn compile_simulation_gf_lookup_modes_match_vm() { + let knots = [(0.0, 10.0), (1.0, 20.0), (2.5, 5.0), (4.0, 40.0)]; + let datamodel = crate::test_common::TestProject::new("gf_modes") + // TIME 0..6, dt 0.25 -> index = TIME-1 sweeps -1..5 over [0,4] table. + .with_sim_time(0.0, 6.0, 0.25) + .aux("input", "TIME - 1", None) + .aux_with_gf("curve", "0", gf_from_knots(&knots)) + .aux("interp_val", "LOOKUP(curve, input)", None) + .aux("fwd_val", "LOOKUP_FORWARD(curve, input)", None) + .aux("bwd_val", "LOOKUP_BACKWARD(curve, input)", None) + .build_datamodel(); + + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + let checked = assert_matches_vm(sim, &artifact); + // All five variables must reach parity: the three lookup-mode results + // (interp/fwd/bwd), the lookup-only `curve` holder they read, and its + // `input`. Pinning >= 5 (not just the 3 lookup modes) proves the + // lookup-only curve holder and its driver also match the VM. + assert!( + checked >= 5, + "expected to compare interp/fwd/bwd + curve + input, only checked {checked}" + ); + for name in ["interp_val", "fwd_val", "bwd_val"] { + assert!( + artifact.layout.var_offsets.iter().any(|(n, _)| n == name), + "{name} should be in the layout" + ); + } + } + + /// The FFI entry point goes through the salsa pipeline + `compile_simulation` + /// and returns a non-empty blob that validates under the interpreter. 
+ #[test] + fn compile_datamodel_to_wasm_validates() { + let file = std::fs::File::open(POPULATION_XMILE).expect("open population model"); + let mut reader = BufReader::new(file); + let datamodel = open_xmile(&mut reader).expect("parse population xmile"); + + let wasm = compile_datamodel_to_wasm(&datamodel, "main").expect("wasm codegen"); + assert!(!wasm.is_empty(), "blob should be non-empty"); + validate(&wasm).expect("blob must validate under the interpreter"); + } + + // ── compile_simulation (CompiledSimulation -> wasm) ─────────────────── + + /// Build a `CompiledSimulation` for the named model of `datamodel` via the + /// production incremental pipeline (the same path the VM corpus uses). + fn compile_sim(datamodel: &crate::datamodel::Project, model_name: &str) -> CompiledSimulation { + let mut db = SimlinDb::default(); + let sync = sync_from_datamodel_incremental(&mut db, datamodel, None); + compile_project_incremental(&db, sync.project, model_name).expect("incremental compile") + } + + /// Run a `WasmArtifact` under the DLR-FT interpreter and return the + /// step-major results slab (`n_chunks * n_slots` f64, row-major by step). 
+ fn run_artifact_results(artifact: &WasmArtifact) -> Vec { + let info = validate(&artifact.wasm).expect("generated module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let run = store + .instance_export(inst, "run") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(), ()>(run, ()) + .expect("run wasm"); + let mem = store + .instance_export(inst, "memory") + .unwrap() + .as_mem() + .unwrap(); + let n = artifact.layout.n_chunks * artifact.layout.n_slots; + let base = artifact.layout.results_offset; + store.mem_access_mut_slice(mem, |bytes| { + (0..n) + .map(|i| { + let a = base + i * 8; + f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap()) + }) + .collect() + }) + } + + /// Assert every variable in `artifact.layout` matches the VM's series for + /// the same `CompiledSimulation`. Returns the number of variables checked. + fn assert_matches_vm(sim: CompiledSimulation, artifact: &WasmArtifact) -> usize { + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + let wasm_data = run_artifact_results(artifact); + + let mut vm = Vm::new(sim).expect("vm creation"); + vm.run_to_end().expect("vm run"); + let vm_results = vm.into_results(); + + assert_eq!( + vm_results.step_count, n_chunks, + "saved-chunk count differs from VM" + ); + + let mut checked = 0usize; + for (name, wasm_off) in &artifact.layout.var_offsets { + let wasm_off = *wasm_off; + let ident = Ident::::from_str_unchecked(name); + let Some(&vm_off) = vm_results.offsets.get(&ident) else { + continue; + }; + for c in 0..n_chunks { + let vm_val = vm_results.data[c * vm_results.step_size + vm_off]; + let wasm_val = wasm_data[c * n_slots + wasm_off]; + let diff = (vm_val - wasm_val).abs(); + assert!( + diff < 1e-9, + "{name} mismatch at chunk {c}: vm={vm_val} wasm={wasm_val} (diff {diff})", + ); + } + checked += 1; + } + checked + } + + /// End-to-end 
VM parity for the `AllocateAvailable` opcode on the real + /// `allocate.xmile` corpus model. The model's supply ramps from 0 to 10 + /// over the run while total demand is 9, so the recorded series sweep all + /// three regimes -- `avail <= 0` (zeros) early, the partial-allocation + /// bisection over rectangular priority profiles in the middle, and + /// `avail >= total_demand` (full grant) once supply exceeds demand -- + /// against `Vm::new(sim).run_to_end()`. (The model is NOT in the active + /// `wasm_parity_floor` corpus; raising that floor is a separate task.) + #[test] + fn compile_simulation_allocate_available_matches_vm() { + let path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../test/sdeverywhere/models/allocate/allocate.xmile" + ); + let file = std::fs::File::open(path).expect("open allocate xmile"); + let mut reader = BufReader::new(file); + let datamodel = open_xmile(&mut reader).expect("parse allocate xmile"); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("allocate wasm codegen"); + let checked = assert_matches_vm(sim, &artifact); + assert!( + checked >= 5, + "expected to compare the allocate model's variables, only checked {checked}" + ); + assert!( + artifact + .layout + .var_offsets + .iter() + .any(|(n, _)| n.starts_with("shipments")), + "the arrayed shipments allocation should be in the layout" + ); + } + + #[test] + fn compile_simulation_population_matches_vm() { + let file = std::fs::File::open(POPULATION_XMILE).expect("open population model"); + let mut reader = BufReader::new(file); + let datamodel = open_xmile(&mut reader).expect("parse population xmile"); + + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + // Geometry is self-consistent with the specs. 
+ let specs = Specs::from(&datamodel.sim_specs); + assert_eq!(artifact.layout.n_chunks, specs.n_chunks); + + let checked = assert_matches_vm(sim, &artifact); + assert!( + checked >= 5, + "expected to compare the population model's variables, only checked {checked}" + ); + assert!( + artifact + .layout + .var_offsets + .iter() + .any(|(n, _)| n == "population"), + "the population stock should be in the layout" + ); + } + + #[test] + fn compile_simulation_simple_stock_flow_matches_vm() { + // A minimal scalar Euler model: a stock filled by a constant inflow. + let datamodel = crate::test_common::TestProject::new("simple") + .with_sim_time(0.0, 10.0, 1.0) + .aux("inflow_rate", "2", None) + .stock("level", "0", &["inflow"], &[], None) + .flow("inflow", "inflow_rate", None) + .build_datamodel(); + + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 2, "expected to compare level + inflow"); + // level should integrate to 2*10 = 20 by the last step. + let last = run_artifact_results(&artifact); + let n_slots = artifact.layout.n_slots; + let level_off = artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n == "level") + .map(|(_, off)| *off) + .expect("level offset"); + let last_step = (artifact.layout.n_chunks - 1) * n_slots + level_off; + assert!( + (last[last_step] - 20.0).abs() < 1e-9, + "level should reach 20" + ); + } + + #[test] + fn compile_simulation_save_step_cadence_matches_vm() { + // Exercises the conditional-save / non-save-step copy-back branch of + // `save_advance!` (`vm.rs:682`): with save_step = 2*dt, most steps copy + // `next -> curr` WITHOUT recording a snapshot, and only every other step + // (plus the forced t=start sample) writes a results row. Every other + // wasmgen test uses save_step = None (save_every = 1), so this is the + // only coverage of the multi-step cadence. 
+ let mut datamodel = crate::test_common::TestProject::new("cadence") + .with_sim_time(0.0, 10.0, 1.0) + .aux("inflow_rate", "2", None) + .stock("level", "0", &["inflow"], &[], None) + .flow("inflow", "inflow_rate", None) + .build_datamodel(); + // `with_sim_time` clears save_step to dt; the builder has no + // `with_save_step`, so set it directly: save_step = 2, dt = 1. + datamodel.sim_specs.save_step = Some(crate::datamodel::Dt::Dt(2.0)); + + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + // dt=1, save_step=2 over [0,10] saves at t=0,2,4,6,8,10 -> 6 chunks. + assert_eq!( + artifact.layout.n_chunks, 6, + "save_step = 2*dt over [0,10] should yield 6 saved samples" + ); + + // Per-variable series + saved-chunk count both match the VM (which + // `assert_matches_vm` asserts via `step_count == n_chunks`). + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 2, "expected to compare level + inflow"); + } + + #[test] + fn compile_simulation_conditional_model_matches_vm() { + // Exercises the SetCond/If lowering through the whole-model path. + let datamodel = crate::test_common::TestProject::new("cond") + .with_sim_time(0.0, 5.0, 1.0) + .aux("threshold", "3", None) + .aux("gated", "IF TIME > threshold THEN 10 ELSE 1", None) + .stock("acc", "0", &["gated_flow"], &[], None) + .flow("gated_flow", "gated", None) + .build_datamodel(); + + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 2, "expected to compare gated + acc"); + } + + // ── PREVIOUS / INIT (Task 1: snapshot regions + LoadPrev/LoadInitial) ── + + /// Task 1: `PREVIOUS(x)` under Euler. At t0 the snapshot has not been taken, + /// so `LoadPrev` returns its fallback (the 0 the unary `PREVIOUS` desugars + /// to); after the first step it returns the prior step's `x`. 
The series + /// must match the VM, which gates the same fallback-vs-snapshot choice on + /// `use_prev_fallback`. + #[test] + fn compile_simulation_previous_matches_vm() { + let datamodel = crate::test_common::TestProject::new("prev") + .with_sim_time(0.0, 5.0, 1.0) + // x ramps each step so PREVIOUS(x) is a visibly-lagged series. + .stock("x", "10", &["grow"], &[], None) + .flow("grow", "1", None) + .aux("x_prev", "PREVIOUS(x)", None) + .build_datamodel(); + + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 2, "expected to compare x + x_prev"); + } + + /// Instantiate `artifact` ONCE and invoke the exported `run` `runs` times in + /// sequence with no `reset` between, returning the results slab read after + /// each call. Models the wasm backend's documented "instantiate once, re-run + /// on every change" usage (interactive scrubbing; the POC's `run` "re-runs + /// the whole simulation" per call) -- which exercises the cross-run state + /// reset that a single `run` invocation cannot. 
+ fn run_artifact_results_repeated(artifact: &WasmArtifact, runs: usize) -> Vec> { + let info = validate(&artifact.wasm).expect("generated module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let n = artifact.layout.n_chunks * artifact.layout.n_slots; + let base = artifact.layout.results_offset; + let mut out = Vec::with_capacity(runs); + for _ in 0..runs { + let run = store + .instance_export(inst, "run") + .unwrap() + .as_func() + .unwrap(); + store + .invoke_simple_typed::<(), ()>(run, ()) + .expect("run wasm"); + let mem = store + .instance_export(inst, "memory") + .unwrap() + .as_mem() + .unwrap(); + let slab = store.mem_access_mut_slice(mem, |bytes| { + (0..n) + .map(|i| { + let a = base + i * 8; + f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap()) + }) + .collect::>() + }); + out.push(slab); + } + out + } + + /// Regression (PR #620 review): `run` reseeds the time globals and reruns + /// initials, so it is a complete simulation from t0 and the documented + /// per-change entry point for repeated re-simulation. It must therefore + /// reset the PREVIOUS fallback flag itself, mirroring the VM's `run_initials` + /// (which sets `use_prev_fallback = true` at the start of every run). Without + /// that reset, the loop leaves the flag at 0, so a SECOND `run` on the same + /// instance reads the first run's final `prev_values` on step 0 (and during + /// initials) instead of the fallback -- contaminating any `PREVIOUS(...)` + /// model. This instantiates once and runs twice with no `reset` between: a + /// deterministic model must produce identical results both times, and + /// `x_prev` at t0 must be the unary-PREVIOUS fallback (0), not the stale + /// prior-run value. 
+ #[test] + fn compile_simulation_repeated_run_resets_previous_fallback() { + let datamodel = crate::test_common::TestProject::new("prev_repeat") + .with_sim_time(0.0, 5.0, 1.0) + .stock("x", "10", &["grow"], &[], None) + .flow("grow", "1", None) + .aux("x_prev", "PREVIOUS(x)", None) + .build_datamodel(); + + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + let runs = run_artifact_results_repeated(&artifact, 2); + let (first, second) = (&runs[0], &runs[1]); + + // A deterministic model re-run from t0 produces byte-identical results; + // the bug makes the second run's PREVIOUS reads diverge on step 0. + assert_eq!( + first, second, + "second run() diverged from the first -- stale PREVIOUS fallback state leaked across runs" + ); + + // Pin the discriminating cell: x_prev at the first saved chunk (t0) is + // the unary-PREVIOUS fallback (0), not the prior run's final x. + let x_prev_off = artifact + .layout + .var_offsets + .iter() + .find(|(name, _)| name == "x_prev") + .map(|(_, off)| *off) + .expect("x_prev in layout"); + assert_eq!( + second[x_prev_off], 0.0, + "x_prev at t0 on the second run must be the PREVIOUS fallback (0), got {}", + second[x_prev_off] + ); + } + + /// Regression (PR #620 review): a stock at an absolute slot offset >= 65536 + /// must address its real slot under RK integration, not `off & 0xFFFF`. Such + /// offsets are reachable in a large nested model (each submodel/SMOOTH/DELAY + /// instance adds slots; nothing caps total `n_slots` in the wasm path). The + /// RK stage delta `next[off] - curr[off]` is computed by + /// `emit_compute_stage_delta`; the original bug threaded `off` as `u16`, so a + /// stock at offset 65536 read slot `65536 & 0xFFFF == 0` (TIME) instead of its + /// own. 
This drives the helper at offset 65536 over a hand-built memory whose + /// slot 0 and slot 65536 hold distinct values and asserts it reads slot 65536 + /// (matching the Euler advance, which has always used the full-width offset). + #[test] + fn rk_stage_delta_addresses_stock_above_65535() { + // 65536 & 0xFFFF == 0, so a truncated offset would alias slot 0 (TIME). + const HIGH_OFF: u32 = 65536; + // `curr` holds slots [0, HIGH_OFF]; `next` sits one stride past it. + let next_base = (HIGH_OFF + 1) * SLOT_SIZE; + + // probe() -> f64: L_RK_S := next[HIGH_OFF] - curr[HIGH_OFF]; return it. + // Locals mirror the run fn so the f64 local L_RK_S (index 4) is valid. + let mut probe = Function::new([(3, ValType::I32), (2, ValType::F64)]); + emit_compute_stage_delta(&mut probe, next_base, HIGH_OFF); + probe.instruction(&I::LocalGet(L_RK_S)); + probe.instruction(&I::End); + + let mut module = WasmModule::new(); + let mut types = TypeSection::new(); + types.ty().function([], [ValType::F64]); + module.section(&types); + let mut functions = FunctionSection::new(); + functions.function(0); + module.section(&functions); + let bytes_needed = next_base + (HIGH_OFF + 1) * SLOT_SIZE; + let mut memories = MemorySection::new(); + memories.memory(MemoryType { + minimum: u64::from(bytes_needed.div_ceil(65536) + 1), + maximum: None, + memory64: false, + shared: false, + page_size_log2: None, + }); + module.section(&memories); + let mut exports = ExportSection::new(); + exports.export("probe", ExportKind::Func, 0); + exports.export("memory", ExportKind::Memory, 0); + module.section(&exports); + let mut code = CodeSection::new(); + code.function(&probe); + module.section(&code); + let wasm = module.finish(); + + let info = validate(&wasm).expect("module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let mem = store + .instance_export(inst, "memory") + .unwrap() + .as_mem() + 
.unwrap(); + // Seed slot 0 (the alias target under truncation) and slot HIGH_OFF with + // distinct values, so reading the wrong slot yields a distinguishable result. + let curr_hi = (HIGH_OFF * SLOT_SIZE) as usize; + let next0 = next_base as usize; + let next_hi = (next_base + HIGH_OFF * SLOT_SIZE) as usize; + store.mem_access_mut_slice(mem, |b| { + b[0..8].copy_from_slice(&100.0f64.to_le_bytes()); // curr[0] + b[next0..next0 + 8].copy_from_slice(&200.0f64.to_le_bytes()); // next[0] + b[curr_hi..curr_hi + 8].copy_from_slice(&3.0f64.to_le_bytes()); // curr[HIGH_OFF] + b[next_hi..next_hi + 8].copy_from_slice(&10.0f64.to_le_bytes()); // next[HIGH_OFF] + }); + let probe_fn = store + .instance_export(inst, "probe") + .unwrap() + .as_func() + .unwrap(); + let delta: f64 = store + .invoke_simple_typed::<(), f64>(probe_fn, ()) + .expect("probe"); + + // next[HIGH_OFF] - curr[HIGH_OFF] = 10 - 3 = 7. A truncated u16 offset + // would read slot 0 instead (200 - 100 = 100). + assert_eq!( + delta, 7.0, + "RK stage delta read the wrong slot -- stock offset truncated above 65535?" + ); + } + + /// Task 1: `INIT(x)` referenced from a flow reads the `initial_values` + /// snapshot captured once after the initials phase (in the flows/stocks + /// programs `LoadInitial` reads `initial_values[off]`, never `curr`). Here + /// the inflow is held at `INIT(level)`, so `level` integrates by its own + /// initial value each step; the wasm series must match the VM. + #[test] + fn compile_simulation_init_from_flow_matches_vm() { + let datamodel = crate::test_common::TestProject::new("init_flow") + .with_sim_time(0.0, 5.0, 1.0) + .stock("level", "7", &["inflow"], &[], None) + // INIT(level) is captured once at t0 (= 7) and stays 7 every step. 
+ .flow("inflow", "INIT(level)", None) + .build_datamodel(); + + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 2, "expected to compare level + inflow"); + // level starts at 7 and grows by INIT(level)=7 each of 5 steps -> 42. + let results = run_artifact_results(&artifact); + let n_slots = artifact.layout.n_slots; + let level_off = artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n == "level") + .map(|(_, off)| *off) + .expect("level offset"); + let last = (artifact.layout.n_chunks - 1) * n_slots + level_off; + assert!( + (results[last] - 42.0).abs() < 1e-9, + "level should reach 7 + 5*7 = 42, got {}", + results[last] + ); + } + + /// Task 1: `INIT(x)` referenced from *another initial equation* reads + /// `curr` during the initials phase (the snapshot is taken only after + /// initials run). `seed` is computed during initials, and `derived`'s + /// initial equation reads `INIT(seed)` -- which must resolve to the + /// just-computed `curr[seed]`, not an as-yet-unwritten `initial_values`. + #[test] + fn compile_simulation_init_from_initial_matches_vm() { + let datamodel = crate::test_common::TestProject::new("init_initial") + .with_sim_time(0.0, 3.0, 1.0) + .aux("seed", "5", None) + // A stock whose INITIAL equation reads INIT(seed): during initials + // LoadInitial must read curr[seed] (= 5), so derived starts at 5. + .stock("derived", "INIT(seed)", &["hold"], &[], None) + .flow("hold", "0", None) + .build_datamodel(); + + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 2, "expected to compare seed + derived"); + // derived initializes to INIT(seed)=5 and the flow holds it there. + // Chunk 0 starts at slab offset 0, so `derived_off` indexes it directly. 
+ let results = run_artifact_results(&artifact); + let derived_off = artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n == "derived") + .map(|(_, off)| *off) + .expect("derived offset"); + assert!( + (results[derived_off] - 5.0).abs() < 1e-9, + "derived should initialize to INIT(seed) = 5, got {}", + results[derived_off] + ); + } + + // ── RK2 / RK4 integration loops (Task 2) ────────────────────────────── + + /// A logistic-growth model: `pop' = rate * pop * (1 - pop/capacity)`. The + /// nonlinear flow depends on the stock, so RK's trial-point evaluations + /// genuinely differ from Euler -- a pure-constant flow would let a broken RK + /// loop pass by coincidence. + fn logistic_growth( + name: &str, + method: crate::datamodel::SimMethod, + ) -> crate::datamodel::Project { + crate::test_common::TestProject::new(name) + .with_sim_time(0.0, 20.0, 0.5) + .with_sim_method(method) + .aux("rate", "0.3", None) + .aux("capacity", "1000", None) + .stock("pop", "10", &["growth"], &[], None) + .flow("growth", "rate * pop * (1 - pop / capacity)", None) + .build_datamodel() + } + + /// Task 2: an RK4 scalar model matches the VM's saved samples (cadence and + /// values). The VM's RK4 loop is the oracle; the emitted four-stage loop + /// with time juggling + the end-of-step flows re-eval must reproduce it. + #[test] + fn compile_simulation_rk4_matches_vm() { + let datamodel = logistic_growth("rk4_logistic", crate::datamodel::SimMethod::RungeKutta4); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (RK4)"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 2, "expected to compare pop + growth"); + } + + /// Task 2: an RK2 (Heun) scalar model matches the VM's saved samples. Same + /// nonlinear model so the two-stage trial step is genuinely exercised. 
+ #[test] + fn compile_simulation_rk2_matches_vm() { + let datamodel = logistic_growth("rk2_logistic", crate::datamodel::SimMethod::RungeKutta2); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (RK2)"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 2, "expected to compare pop + growth"); + } + + /// Task 2: RK4 and RK2 must genuinely differ from Euler on this nonlinear + /// model -- otherwise the RK tests above could pass against a loop that + /// silently fell back to Euler. Establishes that the oracle (the VM) sees a + /// method-dependent trajectory, so wasm-vs-VM parity is a meaningful check. + #[test] + fn rk_methods_differ_from_euler_in_vm() { + let last_pop = |method| { + let datamodel = logistic_growth("rk_vs_euler", method); + let sim = compile_sim(&datamodel, "main"); + let mut vm = Vm::new(sim).expect("vm"); + vm.run_to_end().expect("vm run"); + let results = vm.into_results(); + let pop = Ident::::from_str_unchecked("pop"); + let off = *results.offsets.get(&pop).expect("pop offset"); + results.data[(results.step_count - 1) * results.step_size + off] + }; + let euler = last_pop(crate::datamodel::SimMethod::Euler); + let rk4 = last_pop(crate::datamodel::SimMethod::RungeKutta4); + let rk2 = last_pop(crate::datamodel::SimMethod::RungeKutta2); + assert!( + (euler - rk4).abs() > 1e-6, + "RK4 must differ from Euler (euler={euler}, rk4={rk4})" + ); + assert!( + (euler - rk2).abs() > 1e-6, + "RK2 must differ from Euler (euler={euler}, rk2={rk2})" + ); + } + + /// A coupled two-stock Lotka-Volterra (predator-prey) model. Each stock's + /// flows read the *other* stock, so a single RK stage's trial-point + /// evaluation interleaves both stocks: `prey`'s `predation` outflow reads + /// `predator`, and `predator`'s `growth` inflow reads `prey`. 
This is what + /// the single-stock RK tests cannot exercise -- with two stocks the stage + /// math walks `stock_offsets` and keeps each stock's `saved[i]`/`accum[i]` + /// and trial `curr[off_i]` independent. A loop that aliased the scratch + /// across stocks, or iterated `stock_offsets` in an unstable order, would + /// corrupt one stock's trajectory and fail the VM-parity check below. + /// + /// Classic textbook parameters (alpha/beta/gamma/delta) on a short horizon + /// with a small dt: the system oscillates, both stay strictly positive, and + /// Euler vs RK4/RK2 visibly diverge (asserted by + /// `multi_stock_coupled_diverges_euler_vs_rk_in_vm`). 100 steps keeps the + /// un-JITed DLR-FT run well under the per-test budget. + fn lotka_volterra( + name: &str, + method: crate::datamodel::SimMethod, + ) -> crate::datamodel::Project { + crate::test_common::TestProject::new(name) + .with_sim_time(0.0, 5.0, 0.05) + .with_sim_method(method) + .aux("alpha", "1.1", None) + .aux("beta", "0.4", None) + .aux("gamma", "0.4", None) + .aux("delta", "0.1", None) + // prey: d/dt = alpha*prey - beta*prey*predator + .stock("prey", "10", &["prey_birth"], &["predation"], None) + .flow("prey_birth", "alpha * prey", None) + .flow("predation", "beta * prey * predator", None) + // predator: d/dt = delta*prey*predator - gamma*predator + .stock("predator", "10", &["pred_growth"], &["pred_death"], None) + .flow("pred_growth", "delta * prey * predator", None) + .flow("pred_death", "gamma * predator", None) + .build_datamodel() + } + + /// Meaningfulness precondition for the two-stock RK parity tests: the + /// coupled model's trajectory is genuinely method-dependent in the VM (the + /// oracle) for *both* stocks. Without this, a wasm RK loop that silently + /// degraded to Euler -- or never advanced the second stock -- could pass + /// `assert_matches_vm` against a coincidentally-identical VM Euler series. 
+ #[test] + fn multi_stock_coupled_diverges_euler_vs_rk_in_vm() { + let last_two = |method| { + let datamodel = lotka_volterra("lv_vs_euler", method); + let sim = compile_sim(&datamodel, "main"); + let mut vm = Vm::new(sim).expect("vm"); + vm.run_to_end().expect("vm run"); + let results = vm.into_results(); + let read = |name: &str| { + let id = Ident::::from_str_unchecked(name); + let off = *results + .offsets + .get(&id) + .unwrap_or_else(|| panic!("{name} offset")); + results.data[(results.step_count - 1) * results.step_size + off] + }; + (read("prey"), read("predator")) + }; + let (e_prey, e_pred) = last_two(crate::datamodel::SimMethod::Euler); + let (rk4_prey, rk4_pred) = last_two(crate::datamodel::SimMethod::RungeKutta4); + let (rk2_prey, rk2_pred) = last_two(crate::datamodel::SimMethod::RungeKutta2); + // Both stocks must move under RK4 and RK2 relative to Euler -- proving + // the stage math integrates each independently, not just the first. + assert!( + (e_prey - rk4_prey).abs() > 1e-6 && (e_pred - rk4_pred).abs() > 1e-6, + "RK4 must differ from Euler for both stocks \ + (prey: euler={e_prey} rk4={rk4_prey}; predator: euler={e_pred} rk4={rk4_pred})" + ); + assert!( + (e_prey - rk2_prey).abs() > 1e-6 && (e_pred - rk2_pred).abs() > 1e-6, + "RK2 must differ from Euler for both stocks \ + (prey: euler={e_prey} rk2={rk2_prey}; predator: euler={e_pred} rk2={rk2_pred})" + ); + } + + /// Coverage gap closed: a TWO-STOCK COUPLED model under RK4 matches the VM + /// per-variable, per-chunk. The phase's other RK tests are single-stock, so + /// this is the only check that the four-stage stage math keeps two stocks' + /// `saved[i]`/`accum[i]`/`curr[off_i]` independent and iterates + /// `stock_offsets` in a stable order across all four stages. `checked >= 2` + /// pins that both stocks (not just `prey`) reached parity. 
+ #[test] + fn compile_simulation_two_stock_coupled_rk4_matches_vm() { + let datamodel = lotka_volterra("lv_rk4", crate::datamodel::SimMethod::RungeKutta4); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (two-stock RK4)"); + let checked = assert_matches_vm(sim, &artifact); + // Both stocks plus the four flows and four params all match; pin >= 2 so + // the two coupled stocks specifically are among the compared variables. + assert!( + checked >= 2, + "expected to compare both prey + predator, only checked {checked}" + ); + for name in ["prey", "predator"] { + assert!( + artifact.layout.var_offsets.iter().any(|(n, _)| n == name), + "{name} should be in the layout" + ); + } + } + + /// The RK2 (Heun) companion to `compile_simulation_two_stock_coupled_rk4_matches_vm`: + /// the two-stage trial step over two coupled stocks matches the VM. + #[test] + fn compile_simulation_two_stock_coupled_rk2_matches_vm() { + let datamodel = lotka_volterra("lv_rk2", crate::datamodel::SimMethod::RungeKutta2); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (two-stock RK2)"); + let checked = assert_matches_vm(sim, &artifact); + assert!( + checked >= 2, + "expected to compare both prey + predator, only checked {checked}" + ); + } + + /// Task 2: a model using `PREVIOUS`/`INIT` under RK4 matches the VM. The + /// snapshot timing is the subtle part: `prev_values` is captured AFTER the + /// end-of-step flows re-eval (with `curr` restored to time-`t` state), not + /// from a trial point. `x_prev` lags `pop`; `pop_init` reads INIT(pop). 
+ #[test] + fn compile_simulation_rk4_with_previous_and_init_matches_vm() { + let datamodel = crate::test_common::TestProject::new("rk4_prev_init") + .with_sim_time(0.0, 10.0, 0.5) + .with_sim_method(crate::datamodel::SimMethod::RungeKutta4) + .aux("rate", "0.3", None) + .aux("capacity", "1000", None) + .stock("pop", "10", &["growth"], &[], None) + .flow("growth", "rate * pop * (1 - pop / capacity)", None) + // PREVIOUS(pop): lagged by one saved step; captured after re-eval. + .aux("pop_prev", "PREVIOUS(pop)", None) + // INIT(pop): the t0 snapshot (= 10), read from initial_values. + .aux("pop_init", "INIT(pop)", None) + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (RK4 + PREVIOUS/INIT)"); + let checked = assert_matches_vm(sim, &artifact); + assert!( + checked >= 4, + "expected to compare pop + growth + pop_prev + pop_init" + ); + } + + /// After Task 2, RK4 (and RK2) are supported, so a model using them runs + /// rather than being rejected -- the inverse of the Phase-1 guard. Pinned so + /// a regression that re-introduced the Euler-only guard would be caught. + #[test] + fn compile_simulation_accepts_rk4() { + let datamodel = crate::test_common::TestProject::new("rk4_accept") + .with_sim_time(0.0, 5.0, 1.0) + .with_sim_method(crate::datamodel::SimMethod::RungeKutta4) + .aux("inflow_rate", "2", None) + .stock("level", "0", &["inflow"], &[], None) + .flow("inflow", "inflow_rate", None) + .build_datamodel(); + + let sim = compile_sim(&datamodel, "main"); + compile_simulation(&sim).expect("RK4 must now be supported"); + } + + // ── Modules: EvalModule / LoadModuleInput (Phase 7 Task 1) ──────────── + // + // Each unique `(model, input_set)` instance becomes its own initials/flows/ + // stocks wasm function taking `(module_off: i32, in_0..in_{k-1}: f64)`. 
An + // `EvalModule` resolves the child instance and `call`s its function for the + // current `StepPart`, passing `module_off + decl.off` and the popped inputs; + // `LoadModuleInput` reads an input parameter. These tests assert wasm matches + // the VM for submodel-bearing models, including the SMOOTH stdlib macro (which + // expands to implicit module stocks) and the same instance at two offsets. + + /// A two-model datamodel: a `main` model that instantiates `submodel` + /// `n_instances` times, wiring `in_value` (an aux in `main`) into each + /// instance's `in` input. The submodel computes `out = body` (referencing its + /// own `in`); `body_is_stock` makes `out` a stock integrating `body`, so the + /// submodel carries internal stocks reached only through `EvalModule` (the + /// nested-stock-offset case). `TestProject` only emits a single `main` model, + /// so this is built as an explicit datamodel. + fn submodel_project( + name: &str, + method: crate::datamodel::SimMethod, + in_value: &str, + body: &str, + body_is_stock: bool, + n_instances: usize, + ) -> crate::datamodel::Project { + use crate::datamodel; + let mut main_vars: Vec = + vec![datamodel::Variable::Aux(datamodel::Aux { + ident: "in_value".to_string(), + equation: datamodel::Equation::Scalar(in_value.to_string()), + documentation: String::new(), + units: None, + gf: None, + ai_state: None, + uid: None, + compat: datamodel::Compat::default(), + })]; + for i in 0..n_instances { + let ident = format!("sub{i}"); + main_vars.push(datamodel::Variable::Module(datamodel::Module { + // A module reference's `dst` is qualified with the instance name + // (`subN.in`), not the bare input variable; an unqualified `dst` + // silently fails to wire the input (the submodel's `in` keeps its + // default), which would make `LoadModuleInput` untested. 
+ references: vec![datamodel::ModuleReference { + src: "in_value".to_string(), + dst: format!("{ident}.in"), + }], + ident, + model_name: "submodel".to_string(), + documentation: String::new(), + units: None, + compat: datamodel::Compat::default(), + ai_state: None, + uid: None, + })); + } + + let out_var = if body_is_stock { + datamodel::Variable::Stock(datamodel::Stock { + ident: "out".to_string(), + equation: datamodel::Equation::Scalar("0".to_string()), + documentation: String::new(), + units: None, + inflows: vec!["grow".to_string()], + outflows: vec![], + ai_state: None, + uid: None, + compat: datamodel::Compat::default(), + }) + } else { + datamodel::Variable::Aux(datamodel::Aux { + ident: "out".to_string(), + equation: datamodel::Equation::Scalar(body.to_string()), + documentation: String::new(), + units: None, + gf: None, + ai_state: None, + uid: None, + compat: datamodel::Compat::default(), + }) + }; + let mut submodel_vars = vec![ + datamodel::Variable::Aux(datamodel::Aux { + ident: "in".to_string(), + equation: datamodel::Equation::Scalar("0".to_string()), + documentation: String::new(), + units: None, + gf: None, + ai_state: None, + uid: None, + compat: datamodel::Compat { + can_be_module_input: true, + ..datamodel::Compat::default() + }, + }), + out_var, + ]; + if body_is_stock { + submodel_vars.push(datamodel::Variable::Flow(datamodel::Flow { + ident: "grow".to_string(), + equation: datamodel::Equation::Scalar(body.to_string()), + documentation: String::new(), + units: None, + gf: None, + ai_state: None, + uid: None, + compat: datamodel::Compat::default(), + })); + } + + datamodel::Project { + name: name.to_string(), + sim_specs: datamodel::SimSpecs { + start: 0.0, + stop: 5.0, + dt: datamodel::Dt::Dt(1.0), + save_step: None, + sim_method: method, + time_units: None, + }, + dimensions: vec![], + units: vec![], + models: vec![ + datamodel::Model { + name: "main".to_string(), + sim_specs: None, + variables: main_vars, + views: vec![], + loop_metadata: 
vec![], + groups: vec![], + macro_spec: None, + }, + datamodel::Model { + name: "submodel".to_string(), + sim_specs: None, + variables: submodel_vars, + views: vec![], + loop_metadata: vec![], + groups: vec![], + macro_spec: None, + }, + ], + source: Default::default(), + ai_information: None, + } + } + + /// A two-model datamodel like [`submodel_project`], but the submodel carries + /// its OWN overridable constant `k` (a flows-phase `AssignConstCurr`) and + /// `out = in + k`. Instantiating it `n_instances` times in `main` gives each + /// instance a DISTINCT absolute offset for its own `k` (the recursive + /// `base_off + module_decl.off` addressing), so a per-instance `set_value` + /// override on one instance's `k` must not perturb the other. `in_value` is a + /// constant wired into every instance's `in`, so the only differentiator + /// between two instances' `out` is each instance's `k` override. + fn submodel_with_constant_project( + name: &str, + in_value: &str, + k_default: &str, + n_instances: usize, + ) -> crate::datamodel::Project { + use crate::datamodel; + let mut main_vars: Vec = + vec![datamodel::Variable::Aux(datamodel::Aux { + ident: "in_value".to_string(), + equation: datamodel::Equation::Scalar(in_value.to_string()), + documentation: String::new(), + units: None, + gf: None, + ai_state: None, + uid: None, + compat: datamodel::Compat::default(), + })]; + for i in 0..n_instances { + let ident = format!("sub{i}"); + main_vars.push(datamodel::Variable::Module(datamodel::Module { + references: vec![datamodel::ModuleReference { + src: "in_value".to_string(), + dst: format!("{ident}.in"), + }], + ident, + model_name: "submodel".to_string(), + documentation: String::new(), + units: None, + compat: datamodel::Compat::default(), + ai_state: None, + uid: None, + })); + } + + let submodel_vars = vec![ + datamodel::Variable::Aux(datamodel::Aux { + ident: "in".to_string(), + equation: datamodel::Equation::Scalar("0".to_string()), + documentation: 
String::new(), + units: None, + gf: None, + ai_state: None, + uid: None, + compat: datamodel::Compat { + can_be_module_input: true, + ..datamodel::Compat::default() + }, + }), + // `k` is a bare constant, so it lowers to a flows-phase + // `AssignConstCurr` -- i.e. an overridable constant, distinct per + // instance. + datamodel::Variable::Aux(datamodel::Aux { + ident: "k".to_string(), + equation: datamodel::Equation::Scalar(k_default.to_string()), + documentation: String::new(), + units: None, + gf: None, + ai_state: None, + uid: None, + compat: datamodel::Compat::default(), + }), + datamodel::Variable::Aux(datamodel::Aux { + ident: "out".to_string(), + equation: datamodel::Equation::Scalar("in + k".to_string()), + documentation: String::new(), + units: None, + gf: None, + ai_state: None, + uid: None, + compat: datamodel::Compat::default(), + }), + ]; + + datamodel::Project { + name: name.to_string(), + sim_specs: datamodel::SimSpecs { + start: 0.0, + stop: 3.0, + dt: datamodel::Dt::Dt(1.0), + save_step: None, + sim_method: datamodel::SimMethod::Euler, + time_units: None, + }, + dimensions: vec![], + units: vec![], + models: vec![ + datamodel::Model { + name: "main".to_string(), + sim_specs: None, + variables: main_vars, + views: vec![], + loop_metadata: vec![], + groups: vec![], + macro_spec: None, + }, + datamodel::Model { + name: "submodel".to_string(), + sim_specs: None, + variables: submodel_vars, + views: vec![], + loop_metadata: vec![], + groups: vec![], + macro_spec: None, + }, + ], + source: Default::default(), + ai_information: None, + } + } + + /// Task 1: a model instantiating a submodel runs through wasm and matches the + /// VM. The submodel's `out` depends on its `in` input (passed from `main`), so + /// this exercises both `EvalModule` (the child `call`) and `LoadModuleInput` + /// (the child reading its passed input). Previously this construct was rejected + /// as `submodules are not supported`. 
+ #[test] + fn compile_simulation_submodel_matches_vm() { + let datamodel = submodel_project( + "submod", + crate::datamodel::SimMethod::Euler, + "TIME + 1", + "in * 2", + false, + 1, + ); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (submodel)"); + let checked = assert_matches_vm(sim, &artifact); + assert!( + checked >= 2, + "expected to compare main's in_value + the submodel's out, only checked {checked}" + ); + // The submodel's output slot is in the single shared slab, addressed at + // `module_off + off`; its layout entry confirms it was emitted. + assert!( + artifact + .layout + .var_offsets + .iter() + .any(|(n, _)| n.ends_with("out")), + "the submodel's `out` should be in the layout" + ); + } + + /// Task 1: `LoadModuleInput` reads the right input. The submodel's output is + /// exactly its input, and `in_value` varies with TIME, so a wrong input-param + /// index (or a missing pass-through) would diverge from the VM immediately. + #[test] + fn compile_simulation_submodel_loadmoduleinput_reads_right_input() { + let datamodel = submodel_project( + "passthru", + crate::datamodel::SimMethod::Euler, + "TIME * 3 + 1", + "in", // out == in: a pure pass-through of the module input + false, + 1, + ); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (passthrough)"); + + // out must equal in_value (= TIME*3+1) at every saved step. 
+ let results = run_artifact_results(&artifact); + let n_slots = artifact.layout.n_slots; + let find = |needle: &str| { + artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n.ends_with(needle)) + .map(|(_, o)| *o) + .unwrap_or_else(|| panic!("{needle} offset")) + }; + let in_off = find("in_value"); + let out_off = find("out"); + for c in 0..artifact.layout.n_chunks { + let in_v = results[c * n_slots + in_off]; + let out_v = results[c * n_slots + out_off]; + assert!( + (in_v - out_v).abs() < 1e-9, + "submodel out must equal its passed input at chunk {c}: in={in_v} out={out_v}" + ); + } + // And the whole model matches the VM. + assert_matches_vm(sim, &artifact); + } + + /// Task 1 (the `module_off` proof): the SAME `(model, input_set)` instance, + /// instantiated twice in `main`, runs through wasm and matches the VM. Both + /// instances share one `CompiledModule` (one function triple) but run at two + /// different base offsets, so `module_off` must thread correctly into the + /// child's slab reads/writes. Each `EvalModule` passes a distinct + /// `module_off + decl.off`. + #[test] + fn compile_simulation_two_instances_same_module_matches_vm() { + let datamodel = submodel_project( + "twice", + crate::datamodel::SimMethod::Euler, + "TIME + 2", + "in * 10", + false, + 2, + ); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (two instances)"); + let checked = assert_matches_vm(sim, &artifact); + assert!( + checked >= 3, + "expected to compare in_value + both instances' out, only checked {checked}" + ); + // Both instances' outputs occupy distinct slots in the shared slab. 
+ let out_slots: Vec = artifact + .layout + .var_offsets + .iter() + .filter(|(n, _)| n.ends_with("out")) + .map(|(_, o)| *o) + .collect(); + assert_eq!( + out_slots.len(), + 2, + "two instances should contribute two distinct `out` slots, got {out_slots:?}" + ); + assert_ne!( + out_slots[0], out_slots[1], + "the two instances must run at different module offsets" + ); + } + + /// Task 1 (per-instance DISTINCT overrides -- the direct test of the + /// absolute-slot const-region addressing): the SAME `CompiledModule`, + /// instantiated twice in `main`, carries DISTINCT `set_value` overrides for + /// its own constant `k`. Each instance's `k` lives at a distinct absolute + /// offset (`base_off + module_decl.off`, the recursion in + /// `collect_overridable_defaults`); the wasm override region is indexed by + /// that absolute offset, so overriding instance 0's `k` to 100 and instance + /// 1's `k` to 200 makes each instance's `out = in + k` reflect ITS OWN + /// override. A bug that applied one override to both instances, or that + /// ignored `module_off` (writing both overrides to the same slot), would make + /// the two `out` series equal -- which the non-vacuity `assert_ne!` rejects. + /// + /// This is a wasm-only correctness property: the VM is NOT a valid cell-for- + /// cell oracle for *distinct* overrides of a SHARED module, because its + /// `set_value_by_offset` mutates the module's shared bytecode literal (one + /// `literal_id` for both instances, resolved through the single shared + /// `ModuleKey`), so the second override clobbers the first and both instances + /// read the last value. The wasm backend is strictly more correct here. The + /// VM divergence is tracked separately; this test still anchors against the + /// VM in the regime where they DO agree -- both instances overridden to the + /// SAME value (`compile_simulation_two_instances_same_value_override_matches_vm`). 
+ #[test] + fn compile_simulation_two_instances_distinct_overrides() { + // `in_value` is the constant 7 wired into both instances' `in`, so the + // ONLY differentiator between the two instances' `out` is each instance's + // `k` override (default 1). + let datamodel = submodel_with_constant_project("distinct", "7", "1", 2); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (distinct overrides)"); + + let (k0_off, k1_off) = instance_k_offsets(&artifact); + assert_ne!( + k0_off, k1_off, + "the two instances' `k` must occupy distinct absolute offsets" + ); + assert!( + sim.is_constant_offset(k0_off) && sim.is_constant_offset(k1_off), + "each instance's `k` must be a VM-overridable constant (sub0·k={k0_off}, sub1·k={k1_off})" + ); + + // Apply DIFFERENT overrides to the two instances, then reset + run. + let wasm_slab = run_artifact_with_overrides(&artifact, &[(k0_off, 100.0), (k1_off, 200.0)]); + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + + // Non-vacuity: each instance's `out` reflects ITS OWN override, and the + // two genuinely DIFFER. `in_value` is 7, so sub0·out = 7 + 100 = 107 and + // sub1·out = 7 + 200 = 207 at every saved step. If a bug applied one + // override to both instances (or ignored `module_off` and wrote both to + // one slot), the two `out` series would be equal and this would fail. 
+ let out0_off = layout_offset(&artifact, qualified_ident("sub0", "out").as_str()); + let out1_off = layout_offset(&artifact, qualified_ident("sub1", "out").as_str()); + for c in 0..n_chunks { + let out0 = wasm_slab[c * n_slots + out0_off]; + let out1 = wasm_slab[c * n_slots + out1_off]; + assert!( + (out0 - 107.0).abs() < 1e-9, + "sub0·out should be in_value(7)+k0(100)=107 at chunk {c}, got {out0}" + ); + assert!( + (out1 - 207.0).abs() < 1e-9, + "sub1·out should be in_value(7)+k1(200)=207 at chunk {c}, got {out1}" + ); + assert_ne!( + out0, out1, + "the two instances' outputs must DIFFER under distinct per-instance overrides" + ); + } + } + + /// Task 1 (VM parity anchor for the shared-module override path): overriding + /// BOTH instances' `k` to the SAME value matches the VM cell-for-cell. This is + /// the regime where the VM and wasm agree -- the VM's shared-literal clobber + /// (see `compile_simulation_two_instances_distinct_overrides`) is harmless + /// when both overrides carry the same value -- so it proves the wasm override + /// mechanism is faithful to the VM (not merely internally consistent) for a + /// shared `CompiledModule` instantiated at two `module_off`s. 
+ #[test] + fn compile_simulation_two_instances_same_value_override_matches_vm() { + let datamodel = submodel_with_constant_project("same_val", "7", "1", 2); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + let (k0_off, k1_off) = instance_k_offsets(&artifact); + let wasm_slab = run_artifact_with_overrides(&artifact, &[(k0_off, 300.0), (k1_off, 300.0)]); + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + + let mut vm = Vm::new(compile_sim(&datamodel, "main")).expect("vm creation"); + vm.set_value_by_offset(k0_off, 300.0) + .expect("sub0·k must be a VM-overridable constant"); + vm.set_value_by_offset(k1_off, 300.0) + .expect("sub1·k must be a VM-overridable constant"); + vm.run_to_end().expect("vm run"); + let vm_results = vm.into_results(); + assert_eq!( + vm_results.step_count, n_chunks, + "saved-chunk count differs from VM" + ); + + let mut checked = 0usize; + for (name, wasm_off) in &artifact.layout.var_offsets { + let wasm_off = *wasm_off; + let ident = Ident::::from_str_unchecked(name); + let Some(&vm_off) = vm_results.offsets.get(&ident) else { + continue; + }; + for c in 0..n_chunks { + let vm_val = vm_results.data[c * vm_results.step_size + vm_off]; + let wasm_val = wasm_slab[c * n_slots + wasm_off]; + assert!( + (vm_val - wasm_val).abs() < 1e-9, + "{name} mismatch at chunk {c} under same-value override: \ + vm={vm_val} wasm={wasm_val}" + ); + } + checked += 1; + } + assert!( + checked >= 3, + "expected to compare in_value + both instances' k/out, only checked {checked}" + ); + // Both instances reach 7 + 300 = 307 (the override took on both). 
+ let out0_off = layout_offset(&artifact, qualified_ident("sub0", "out").as_str()); + let out1_off = layout_offset(&artifact, qualified_ident("sub1", "out").as_str()); + assert!( + (wasm_slab[out0_off] - 307.0).abs() < 1e-9 + && (wasm_slab[out1_off] - 307.0).abs() < 1e-9, + "both instances should reach 7+300=307 under the shared override" + ); + } + + /// Task 1 (nested stocks under Euler): a submodel whose `out` is a stock + /// integrating a flow that depends on its `in` input. The submodel's internal + /// stock is reached only through `EvalModule`, and its offset must be picked + /// up by the recursive stock-offset collection so the Euler advance copies it + /// `next -> curr`. The wasm must match the VM. + #[test] + fn compile_simulation_submodel_nested_stock_euler_matches_vm() { + let datamodel = submodel_project( + "nested_stock", + crate::datamodel::SimMethod::Euler, + "2", + "in", // grow = in (= 2); out integrates by 2 each step + true, + 1, + ); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (nested stock)"); + let checked = assert_matches_vm(sim, &artifact); + assert!( + checked >= 2, + "expected to compare in_value + nested out stock" + ); + // Pin the nested stock's value so this can't pass vacuously with an + // un-wired input (`in` defaulting to 0). `grow = in = 2` integrates the + // nested `out` stock by 2 each of the 5 Euler steps -> 10. + let results = run_artifact_results(&artifact); + let n_slots = artifact.layout.n_slots; + let out_off = artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n.ends_with("out")) + .map(|(_, o)| *o) + .expect("nested out offset"); + let last = (artifact.layout.n_chunks - 1) * n_slots + out_off; + assert!( + (results[last] - 10.0).abs() < 1e-9, + "nested out stock should integrate to 2*5 = 10, got {}", + results[last] + ); + } + + /// Task 1 (nested stocks under RK4): the same nested-stock submodel under RK4. 
+ /// The recursive stock-offset collection must feed the RK stage math (saved/ + /// accum scratch indexed by stock position) the submodel's internal stock, so + /// the four-stage integration covers nested stocks. The wasm must match the VM. + #[test] + fn compile_simulation_submodel_nested_stock_rk4_matches_vm() { + // A nonlinear flow so RK genuinely differs from Euler: grow = in - out/10, + // a first-order approach to a steady state, evaluated at trial points. + let datamodel = submodel_project( + "nested_stock_rk4", + crate::datamodel::SimMethod::RungeKutta4, + "5", + "in - out / 10", + true, + 1, + ); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (nested stock RK4)"); + let checked = assert_matches_vm(sim, &artifact); + assert!( + checked >= 2, + "expected to compare in_value + nested out stock" + ); + } + + /// Task 1 (stdlib macro -> implicit module stocks): `SMTH1(input, delay)` + /// expands to a stdlib `smth1` submodule carrying an internal SMOOTH stock. + /// The whole model must match the VM, proving the implicit-module path (the + /// stdlib instance's own `ByteCodeContext`, its nested stock under the RK/Euler + /// loop, and the `EvalModule`/`LoadModuleInput` wiring) reproduces the VM. + /// `SMTH1` was the canonical still-`Skipped` construct before this task. + /// + /// A NaN-aware comparison: the stdlib `smth1` instance carries an internal + /// `initial_value` helper slot that is NaN at the t=0 results snapshot in + /// *both* the VM and wasm (it is not written into `curr` before the forced + /// t=0 save), so a finite-difference compare would spuriously fail on a + /// faithful NaN==NaN match. Every user-visible variable (`input`, + /// `smoothed`) is finite and compared exactly. 
+ #[test] + fn compile_simulation_smooth_macro_matches_vm() { + let datamodel = crate::test_common::TestProject::new("smooth") + .with_sim_time(0.0, 8.0, 0.25) + .aux("input", "TIME", None) + .aux("smoothed", "SMTH1(input, 2)", None) + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (SMTH1)"); + // Pin that `smoothed` is finite and nonzero at the last step, so the + // NaN-aware comparison cannot pass vacuously (an all-NaN `smoothed` would + // satisfy NaN==NaN). A 2-unit smoothing of `input = TIME` reaches a + // meaningful positive value by t=8. + let results = run_artifact_results(&artifact); + let n_slots = artifact.layout.n_slots; + let smoothed_off = artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n == "smoothed") + .map(|(_, o)| *o) + .expect("smoothed offset"); + let last = (artifact.layout.n_chunks - 1) * n_slots + smoothed_off; + assert!( + results[last].is_finite() && results[last] > 0.0, + "smoothed should be finite and positive by the last step, got {}", + results[last] + ); + let checked = assert_matches_vm_nan_aware(sim, &artifact); + assert!( + checked >= 2, + "expected to compare input + smoothed, only checked {checked}" + ); + } + + /// Task 1 (DELAY stdlib macro under RK4): `DELAY3` expands to a stdlib + /// submodule with three chained internal SMOOTH stocks, exercising a deeper + /// nested-stock chain under the RK4 stage math. The wasm must match the VM. + /// NaN-aware for the same internal-`initial_value` reason as the SMTH1 test. 
#[test]
fn compile_simulation_delay3_macro_rk4_matches_vm() {
    let datamodel = crate::test_common::TestProject::new("delay3")
        .with_sim_time(0.0, 8.0, 0.25)
        .with_sim_method(crate::datamodel::SimMethod::RungeKutta4)
        .aux("input", "TIME", None)
        .aux("delayed", "DELAY3(input, 2)", None)
        .build_datamodel();
    let sim = compile_sim(&datamodel, "main");
    let artifact = compile_simulation(&sim).expect("wasm codegen (DELAY3 RK4)");
    let checked = assert_matches_vm_nan_aware(sim, &artifact);
    assert!(
        checked >= 2,
        "expected to compare input + delayed, only checked {checked}"
    );
}

/// AC4.1: a host reads the three exported geometry globals from the
/// instantiated module and uses them (no external metadata) to stride one
/// variable's series, which must match the VM.
#[test]
fn compile_simulation_exports_self_describing_geometry() {
    let file = std::fs::File::open(POPULATION_XMILE).expect("open population model");
    let mut reader = BufReader::new(file);
    let datamodel = open_xmile(&mut reader).expect("parse population xmile");

    let sim = compile_sim(&datamodel, "main");
    let artifact = compile_simulation(&sim).expect("wasm codegen");

    let info = validate(&artifact.wasm).expect("module must validate");
    let mut store = Store::new(());
    let inst = store
        .module_instantiate(&info, Vec::new(), None)
        .expect("instantiate")
        .module_addr;

    // Read the three i32 geometry globals straight from the module.
    let read_global = |store: &mut Store<()>, name: &str| -> usize {
        let g = store
            .instance_export(inst, name)
            .expect("geometry global must be exported")
            .as_global()
            .expect("geometry export must be a global");
        match store.global_read(g) {
            // A checked conversion: a value that does not fit in `usize`
            // fails loudly instead of wrapping through an `as` cast.
            checked::StoredValue::I32(x) => {
                usize::try_from(x).expect("geometry global must fit in usize")
            }
            other => panic!("expected i32 global, got {other:?}"),
        }
    };
    let n_slots = read_global(&mut store, "n_slots");
    let n_chunks = read_global(&mut store, "n_chunks");
    let results_offset = read_global(&mut store, "results_offset");

    // They equal the layout values.
    assert_eq!(n_slots, artifact.layout.n_slots);
    assert_eq!(n_chunks, artifact.layout.n_chunks);
    assert_eq!(results_offset, artifact.layout.results_offset);

    // Stride to the population series using only module-reported geometry.
    let run = store
        .instance_export(inst, "run")
        .expect("`run` must be exported")
        .as_func()
        .expect("`run` must be a function");
    store
        .invoke_simple_typed::<(), ()>(run, ())
        .expect("run wasm");
    let mem = store
        .instance_export(inst, "memory")
        .expect("`memory` must be exported")
        .as_mem()
        .expect("`memory` must be a memory");
    let pop_off = artifact
        .layout
        .var_offsets
        .iter()
        .find(|(n, _)| n == "population")
        .map(|(_, off)| *off)
        .expect("population offset");
    let pop_series: Vec<f64> = store.mem_access_mut_slice(mem, |bytes| {
        (0..n_chunks)
            .map(|c| {
                // Byte address of the 8-byte little-endian f64 holding
                // `population` in saved chunk `c`.
                let a = results_offset + (c * n_slots + pop_off) * 8;
                f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap())
            })
            .collect()
    });

    let mut vm = Vm::new(sim).expect("vm");
    vm.run_to_end().expect("vm run");
    let vm_results = vm.into_results();
    // NOTE(review): the identifier's type argument is left to inference
    // (presumably the key type of `vm_results.offsets`) -- confirm.
    let pop = Ident::<_>::from_str_unchecked("population");
    let vm_pop_off = *vm_results.offsets.get(&pop).expect("vm population offset");
    for (c, &wasm_val) in pop_series.iter().enumerate() {
        let vm_val = vm_results.data[c * vm_results.step_size + vm_pop_off];
        assert!(
            (vm_val - wasm_val).abs() < 1e-9,
            "population mismatch at chunk {c}: vm={vm_val} wasm={wasm_val}"
        );
    }
}

// ── Array reducers end-to-end (Phase 5 Tasks 1-2) ─────────────────────
//
// These compile real reducer models through the production salsa pipeline
// (so the bytecode is the genuine `PushStaticView; Array; PopView`
// codegen emits, with all constant subscripts baked into the static view)
// and assert the wasm matches the VM. They are the gold-standard parity
// checks for Tasks 1-2; the inline `lower.rs` unit tests pin the individual
// view ops against the VM's addressing oracle.
+ + /// Assert a single scalar variable's wasm series matches the VM, allowing a + /// NaN-vs-NaN match (`assert_matches_vm` rejects NaN via its abs-diff + /// tolerance, so the empty-view / OOB reducers need this NaN-aware variant). + fn assert_scalar_matches_vm(sim: CompiledSimulation, artifact: &WasmArtifact, name: &str) { + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + let wasm_data = run_artifact_results(artifact); + + let mut vm = Vm::new(sim).expect("vm creation"); + vm.run_to_end().expect("vm run"); + let vm_results = vm.into_results(); + + let wasm_off = artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n == name) + .map(|(_, off)| *off) + .unwrap_or_else(|| panic!("{name} not in wasm layout")); + let ident = Ident::::from_str_unchecked(name); + let vm_off = *vm_results + .offsets + .get(&ident) + .unwrap_or_else(|| panic!("{name} not in vm offsets")); + + for c in 0..n_chunks { + let vm_val = vm_results.data[c * vm_results.step_size + vm_off]; + let wasm_val = wasm_data[c * n_slots + wasm_off]; + if vm_val.is_nan() { + assert!( + wasm_val.is_nan(), + "{name} chunk {c}: vm=NaN but wasm={wasm_val}" + ); + } else { + assert!( + (vm_val - wasm_val).abs() < 1e-9, + "{name} chunk {c}: vm={vm_val} wasm={wasm_val}" + ); + } + } + } + + /// A 1-D `SUM(source[3:5])` over an indexed dimension: a range subscript that + /// codegen bakes into a static view with `offset=2`, `dims=[3]`. The whole + /// model (including the arrayed `source`) must match the VM. 
+ #[test] + fn compile_simulation_sum_range_matches_vm() { + let datamodel = crate::test_common::TestProject::new("sum_range") + .with_sim_time(0.0, 3.0, 1.0) + .indexed_dimension("A", 5) + .array_aux("source[A]", "3 * A + 1") + .scalar_aux("total", "SUM(source[3:5])") + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (SUM range)"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 1, "expected to compare source elements + total"); + } + + /// `SUM(values[*:SubA])` (star-range) selects a sparse subset of a named + /// dimension's elements; codegen bakes the sparse mapping into the static + /// view, exercising the sparse addressing path against the VM. (A transposed + /// reducer like `SUM(matrix')` instead hoists into a `BeginIter` temp-copy + /// loop, so it lands in Phase 5 Task 3; the transpose `ViewDesc` transform + /// itself is pinned by `lower.rs`'s `view_transpose_then_reduce_matches_vm`.) + #[test] + fn compile_simulation_sum_star_range_matches_vm() { + let datamodel = crate::test_common::TestProject::new("sum_star_range") + .with_sim_time(0.0, 2.0, 1.0) + .named_dimension("DimA", &["A1", "A2", "A3", "A4"]) + .named_dimension("SubA", &["A2", "A3"]) + .array_with_ranges( + "values[DimA]", + vec![("A1", "10"), ("A2", "20"), ("A3", "30"), ("A4", "40")], + ) + .scalar_aux("total", "SUM(values[*:SubA])") + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (SUM star range)"); + // The whole model (including the sparse-selected `total` = A2+A3 = 50) + // matches the VM element-for-element. + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 1); + // Independently pin the sparse selection value against the VM. 
+ let sim2 = compile_sim(&datamodel, "main"); + assert_scalar_matches_vm(sim2, &artifact, "total"); + } + + /// A per-element sliced reducer `msum[D] = SUM(m[D, *])` over a 2-D array. + /// Each output element is its own `PushStaticView; ArraySum; PopView` over a + /// per-row static view (the A2A target unrolls to per-element bytecode). + #[test] + fn compile_simulation_sliced_row_sum_matches_vm() { + let datamodel = crate::test_common::TestProject::new("row_sum") + .with_sim_time(0.0, 2.0, 1.0) + .indexed_dimension("D", 2) + .indexed_dimension("E", 3) + .array_aux("m[D, E]", "10 * D + E") + .array_aux("msum[D]", "SUM(m[D, *])") + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (row sum)"); + let checked = assert_matches_vm(sim, &artifact); + assert!( + checked >= 1, + "expected to compare m elements + msum elements" + ); + } + + /// MEAN / STDDEV / MAX / MIN / SIZE over a range slice, each matching the VM. + /// One model carries all five so a single compile exercises every reducer's + /// production lowering. 
#[test]
fn compile_simulation_all_reducers_match_vm() {
    use crate::test_common::TestProject;

    // The five reducer outputs that must each appear in the wasm layout.
    const REDUCER_VARS: [&str; 5] = ["mean_val", "stddev_val", "max_val", "min_val", "size_val"];

    let project = TestProject::new("all_reducers")
        .with_sim_time(0.0, 2.0, 1.0)
        .indexed_dimension("A", 5)
        .array_aux("source[A]", "2 * A")
        .scalar_aux("mean_val", "MEAN(source[2:4])")
        .scalar_aux("stddev_val", "STDDEV(source[1:5])")
        .scalar_aux("max_val", "MAX(source[2:4])")
        .scalar_aux("min_val", "MIN(source[2:4])")
        .scalar_aux("size_val", "SIZE(source[2:4])")
        .build_datamodel();
    let compiled = compile_sim(&project, "main");
    let wasm = compile_simulation(&compiled).expect("wasm codegen (all reducers)");

    let checked = assert_matches_vm(compiled, &wasm);
    assert!(checked >= 5, "expected to compare all five reducer results");

    for name in REDUCER_VARS {
        let present = wasm.layout.var_offsets.iter().any(|(n, _)| n == name);
        assert!(present, "{name} should be in the layout");
    }
}

// The empty-but-valid view reducer asymmetry (SUM->0.0 vs others->NaN) and
// the invalid-view->NaN-for-all asymmetry are pinned directly against the
// VM's `reduce_view` semantics by the inline `lower.rs` unit tests
// (`empty_valid_view_*` / `invalid_view_*`): a literal empty range
// (`source[4:3]`) is rejected at compile time, and a runtime-empty range
// (`source[start:end]` with `start > end`) plus an out-of-bounds dynamic
// subscript both go through `ViewRangeDynamic` / `ViewSubscriptDynamic`,
// which are Phase 5 Task 4, so the end-to-end coverage of those cases lands
// there.
+ + // ── Phase 5 Task 3: BeginIter iteration loops (end-to-end) ──────────── + // + // The broadcasting `LoadIterViewAt` path (source dims != iter dims) and the + // standalone `BeginBroadcastIter` family are not reachable through the + // current production codegen (an A2A elementwise op is scalar-unrolled, and a + // mismatched-dim reducer argument fails the engine's own dimension check), so + // those are pinned directly against the VM by hand-built-bytecode unit tests + // in `lower.rs` (`iter_loop_*` / `broadcast_iter_*`). The two reachable + // shapes -- a hoisted same-dim reducer loop and the deferred transpose + // reducer -- are covered end-to-end here. + + /// `SUM(2 * source[3:5] + 1)`: the elementwise expression is hoisted into an + /// `AssignTemp` `BeginIter` loop (codegen.rs:1183-1378), then `SUM` reduces + /// the temp. The whole-model wasm must match the VM element-for-element. + #[test] + fn compile_simulation_hoisted_reducer_loop_matches_vm() { + let datamodel = crate::test_common::TestProject::new("hoist") + .with_sim_time(0.0, 2.0, 1.0) + .indexed_dimension("A", 5) + .array_aux("source[A]", "A") + .scalar_aux("summed", "SUM(2 * source[3:5] + 1)") + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (hoisted reducer)"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 1, "expected to compare summed"); + } + + /// `SUM(matrix')`: the transpose materializes the transposed matrix into a + /// temp via a `BeginIter` loop reading the (transposed) source through + /// `LoadIterViewAt`, then sums the temp. This is the case Subcomponent A + /// deferred to the iteration task; the wasm must match the VM. 
+ #[test] + fn compile_simulation_transpose_reducer_matches_vm() { + let datamodel = crate::test_common::TestProject::new("transpose") + .with_sim_time(0.0, 2.0, 1.0) + .indexed_dimension("A", 2) + .indexed_dimension("B", 3) + .array_aux("matrix[A,B]", "A * 10 + B") + .scalar_aux("summed", "SUM(matrix')") + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen (transpose)"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 1, "expected to compare summed"); + } + + // ── Phase 5 Task 4: dynamic subscripts + OOB->NaN (end-to-end) ──────── + + /// Assert every layout variable matches the VM, treating a NaN on both sides + /// as equal (the OOB-subscript result). The plain `assert_matches_vm` uses a + /// finite-difference compare that a NaN would fail, so the OOB tests use this. + fn assert_matches_vm_nan_aware(sim: CompiledSimulation, artifact: &WasmArtifact) -> usize { + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + let wasm_data = run_artifact_results(artifact); + let mut vm = Vm::new(sim).expect("vm creation"); + vm.run_to_end().expect("vm run"); + let vm_results = vm.into_results(); + assert_eq!(vm_results.step_count, n_chunks, "saved-chunk count differs"); + + let mut checked = 0usize; + for (name, wasm_off) in &artifact.layout.var_offsets { + let ident = Ident::::from_str_unchecked(name); + let Some(&vm_off) = vm_results.offsets.get(&ident) else { + continue; + }; + for c in 0..n_chunks { + let vm_val = vm_results.data[c * vm_results.step_size + vm_off]; + let wasm_val = wasm_data[c * n_slots + *wasm_off]; + if vm_val.is_nan() { + assert!( + wasm_val.is_nan(), + "{name} chunk {c}: vm=NaN but wasm={wasm_val}" + ); + } else { + let diff = (vm_val - wasm_val).abs(); + assert!(diff < 1e-9, "{name} chunk {c}: vm={vm_val} wasm={wasm_val}"); + } + } + checked += 1; + } + checked + } + + /// Legacy scalar dynamic subscript `arr[idx]` 
(`PushSubscriptIndex` / + /// `LoadSubscript`), in range: the wasm must match the VM. + #[test] + fn compile_simulation_scalar_dynamic_subscript_in_range_matches_vm() { + let datamodel = crate::test_common::TestProject::new("dyn") + .with_sim_time(0.0, 2.0, 1.0) + .indexed_dimension("A", 4) + .array_aux("arr[A]", "A * 10") + .scalar_aux("idx", "3") + .scalar_aux("picked", "arr[idx]") + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 1, "expected to compare picked"); + } + + /// Legacy scalar dynamic subscript `arr[idx]` out of range -> NaN, matching + /// the VM (`vm.rs:1343` sets the subscript invalid, `1361` pushes NaN). + #[test] + fn compile_simulation_scalar_dynamic_subscript_oob_is_nan() { + // idx = 99 is well past the 4-element dimension -> NaN on both backends. + let datamodel = crate::test_common::TestProject::new("dyn_oob") + .with_sim_time(0.0, 2.0, 1.0) + .indexed_dimension("A", 4) + .array_aux("arr[A]", "A * 10") + .scalar_aux("idx", "99") + .scalar_aux("picked", "arr[idx]") + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let checked = assert_matches_vm_nan_aware(sim, &artifact); + assert!(checked >= 1, "expected to compare picked"); + + // Pin the NaN directly: `picked` must be NaN at every step. + let n_slots = artifact.layout.n_slots; + let off = artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n == "picked") + .map(|(_, o)| *o) + .expect("picked offset"); + let data = run_artifact_results(&artifact); + for c in 0..artifact.layout.n_chunks { + assert!( + data[c * n_slots + off].is_nan(), + "out-of-bounds arr[idx] must be NaN at chunk {c}" + ); + } + } + + /// `ViewSubscriptDynamic` via `SUM(mat[row, 1])`: a dynamically-subscripted + /// view reduced to a scalar. In range, wasm matches the VM. 
+ #[test] + fn compile_simulation_view_dynamic_subscript_in_range_matches_vm() { + let datamodel = crate::test_common::TestProject::new("vdyn") + .with_sim_time(0.0, 2.0, 1.0) + .indexed_dimension("A", 3) + .indexed_dimension("B", 4) + .array_aux("mat[A,B]", "A * 10 + B") + .scalar_aux("row", "2") + .scalar_aux("picked", "SUM(mat[row, 1])") + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let checked = assert_matches_vm(sim, &artifact); + assert!(checked >= 1, "expected to compare picked"); + } + + /// `ViewSubscriptDynamic` out of range -> the view is invalid -> the reducer + /// yields NaN for *all* reducers, matching `reduce_view`'s `if !is_valid`. + #[test] + fn compile_simulation_view_dynamic_subscript_oob_is_nan() { + let datamodel = crate::test_common::TestProject::new("vdyn_oob") + .with_sim_time(0.0, 2.0, 1.0) + .indexed_dimension("A", 3) + .indexed_dimension("B", 4) + .array_aux("mat[A,B]", "A * 10 + B") + .scalar_aux("row", "99") // out of range for dim A (size 3) + .scalar_aux("picked", "SUM(mat[row, 1])") + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let checked = assert_matches_vm_nan_aware(sim, &artifact); + assert!(checked >= 1, "expected to compare picked"); + + let n_slots = artifact.layout.n_slots; + let off = artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n == "picked") + .map(|(_, o)| *o) + .expect("picked offset"); + let data = run_artifact_results(&artifact); + for c in 0..artifact.layout.n_chunks { + assert!( + data[c * n_slots + off].is_nan(), + "out-of-bounds SUM(mat[row,1]) must be NaN at chunk {c}" + ); + } + } + + /// AC4.2: a by-name series read strides the results slab using only the + /// layout's `n_slots`/`results_offset` + the variable's offset, copies exactly + /// `n_chunks` values (never the whole `n_chunks * n_slots` slab), and equals + /// 
the VM's `get_series` for that variable. This is the read pattern a host + /// performs over the blob's results region (the FFI returns the same layout). + #[test] + fn by_name_series_read_strides_slab_and_matches_vm_get_series() { + let file = std::fs::File::open(POPULATION_XMILE).expect("open population model"); + let mut reader = BufReader::new(file); + let datamodel = open_xmile(&mut reader).expect("parse population xmile"); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + let results_offset = artifact.layout.results_offset; + let pop_off = layout_offset(&artifact, "population"); + + // Run the blob and read the whole results region once (the host would map + // the module's memory; here we copy it out). + let slab = run_artifact_results(&artifact); + + // Stride out ONLY `population`'s series: exactly `n_chunks` reads at + // `results_offset/8 + c*n_slots + off` (the slab is f64-indexed here). + let _ = results_offset; // documents the byte base; `slab` already starts at it + let mut series = Vec::with_capacity(n_chunks); + for c in 0..n_chunks { + series.push(slab[c * n_slots + pop_off]); + } + assert_eq!( + series.len(), + n_chunks, + "a by-name read copies exactly n_chunks values, not the whole slab" + ); + assert!( + n_slots > 1, + "the model must have >1 slot so striding (not a full copy) is meaningful" + ); + + // It equals the VM's get_series for the same variable. 
+ let mut vm = Vm::new(sim).expect("vm"); + vm.run_to_end().expect("vm run"); + let pop = Ident::::from_str_unchecked("population"); + let vm_series = vm.get_series(&pop).expect("vm get_series(population)"); + assert_eq!( + vm_series.len(), + series.len(), + "series length matches the VM" + ); + for (c, (&w, &v)) in series.iter().zip(vm_series.iter()).enumerate() { + assert!( + (w - v).abs() < 1e-9, + "population chunk {c}: striped wasm read {w} != vm get_series {v}" + ); + } + } + + // ── set_value / reset override mechanism (Phase 7 Task 2) ───────────── + // + // An exported `set_value(offset, val) -> i32` writes the override into the + // constants region (0 ok / nonzero when `offset` is not overridable), an + // exported `reset()` resets run state without clearing the region (overrides + // persist across reset, like the VM), and the next `run` re-runs initials + + // the loop sourcing the overridable `AssignConstCurr` from the region. + // `clear_values()` restores compiled defaults. These mirror the VM's + // `set_value_by_offset`/`reset`/`clear_values` (`vm.rs:976-1062`). + + /// Instantiate `artifact.wasm`, optionally apply a list of `(offset, value)` + /// overrides via the exported `set_value`, call `reset` then `run`, and copy + /// the step-major results slab out. Each `set_value` return code is checked to + /// be 0 (the caller passes only overridable offsets). Returns the slab. 
+ fn run_artifact_with_overrides( + artifact: &WasmArtifact, + overrides: &[(usize, f64)], + ) -> Vec { + let info = validate(&artifact.wasm).expect("module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let set_value = store + .instance_export(inst, "set_value") + .expect("set_value export") + .as_func() + .expect("set_value is a function"); + for &(off, val) in overrides { + let rc: i32 = store + .invoke_simple_typed::<(i32, f64), i32>(set_value, (off as i32, val)) + .expect("set_value invoke"); + assert_eq!( + rc, 0, + "set_value({off}, {val}) should accept an overridable offset" + ); + } + let reset = store + .instance_export(inst, "reset") + .expect("reset export") + .as_func() + .expect("reset is a function"); + store + .invoke_simple_typed::<(), ()>(reset, ()) + .expect("reset invoke"); + let run = store + .instance_export(inst, "run") + .expect("run export") + .as_func() + .expect("run is a function"); + store + .invoke_simple_typed::<(), ()>(run, ()) + .expect("run invoke"); + let mem = store + .instance_export(inst, "memory") + .unwrap() + .as_mem() + .unwrap(); + let n = artifact.layout.n_chunks * artifact.layout.n_slots; + let base = artifact.layout.results_offset; + store.mem_access_mut_slice(mem, |bytes| { + (0..n) + .map(|i| { + let a = base + i * 8; + f64::from_le_bytes(bytes[a..a + 8].try_into().unwrap()) + }) + .collect() + }) + } + + /// Call the exported `set_value` once on a freshly-instantiated module and + /// return its i32 return code, without running the simulation. Used to assert + /// the validation behavior (nonzero on a non-overridable offset). 
+ fn set_value_rc(artifact: &WasmArtifact, off: i32, val: f64) -> i32 { + let info = validate(&artifact.wasm).expect("module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let set_value = store + .instance_export(inst, "set_value") + .expect("set_value export") + .as_func() + .expect("set_value is a function"); + store + .invoke_simple_typed::<(i32, f64), i32>(set_value, (off, val)) + .expect("set_value invoke") + } + + /// The absolute slab offset of `name` in the artifact's layout. + fn layout_offset(artifact: &WasmArtifact, name: &str) -> usize { + artifact + .layout + .var_offsets + .iter() + .find(|(n, _)| n == name) + .map(|(_, o)| *o) + .unwrap_or_else(|| panic!("{name} offset")) + } + + /// The canonical qualified ident for a sub-model `instance`'s sub-variable + /// `var` (`Ident::join`, the U+00B7 module-hierarchy separator), e.g. + /// `sub0·k`. Built the same way `calc_flattened_offsets_incremental` keys the + /// layout, so it stays correct if the separator ever changes. + fn qualified_ident(instance: &str, var: &str) -> Ident { + Ident::::join( + &Ident::::new(instance).as_canonical_str(), + &Ident::::new(var).as_canonical_str(), + ) + } + + /// The absolute slab offsets of the two `submodel_with_constant_project` + /// instances' own constant `k` (`sub0·k`, `sub1·k`). These are distinct + /// because `calc_flattened_offsets_incremental` advances the base offset per + /// instance, mirroring the VM's `collect_constant_info` recursion. 
+ fn instance_k_offsets(artifact: &WasmArtifact) -> (usize, usize) { + ( + layout_offset(artifact, qualified_ident("sub0", "k").as_str()), + layout_offset(artifact, qualified_ident("sub1", "k").as_str()), + ) + } + + /// A VM run of `sim` with an override applied at absolute `off` (the VM's + /// `set_value_by_offset`), returning that variable's slab so wasm overrides + /// can be compared cell-for-cell against the VM oracle. + fn vm_results_with_override( + sim: CompiledSimulation, + off: usize, + val: f64, + ) -> (Vec, usize, usize) { + let mut vm = Vm::new(sim).expect("vm creation"); + vm.set_value_by_offset(off, val) + .expect("offset must be a VM-overridable constant"); + vm.run_to_end().expect("vm run"); + let results = vm.into_results(); + (results.data.to_vec(), results.step_size, results.step_count) + } + + /// AC5.1: overriding a constant via `set_value`, then `reset`, then `run`, + /// yields the same series the VM produces under the same override. A constant + /// aux feeds a flow that integrates a stock, so the override propagates into + /// every downstream value at every step -- a wrong source (or an override that + /// did not take) would diverge from the VM immediately. + #[test] + fn compile_simulation_set_value_override_matches_vm() { + let datamodel = crate::test_common::TestProject::new("override") + .with_sim_time(0.0, 5.0, 1.0) + .aux("inflow_rate", "2", None) + .stock("level", "0", &["inflow"], &[], None) + .flow("inflow", "inflow_rate", None) + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + let rate_off = layout_offset(&artifact, "inflow_rate"); + assert!( + sim.is_constant_offset(rate_off), + "inflow_rate must be a VM-overridable constant for this test to be meaningful" + ); + + // Override the constant inflow_rate to 5 (was 2), so level integrates by + // 5/step: 0,5,10,...,25 -- visibly different from the default 0,2,...,10. 
+ let wasm_slab = run_artifact_with_overrides(&artifact, &[(rate_off, 5.0)]); + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + + let sim_vm = compile_sim(&datamodel, "main"); + let (vm_data, vm_step_size, vm_step_count) = + vm_results_with_override(sim_vm, rate_off, 5.0); + assert_eq!(vm_step_count, n_chunks, "saved-chunk count differs from VM"); + + let mut checked = 0usize; + for (name, wasm_off) in &artifact.layout.var_offsets { + let wasm_off = *wasm_off; + let ident = Ident::::from_str_unchecked(name); + // Index the VM slab with the VM's own offset for this variable. It + // equals `wasm_off` (both backends derive offsets from + // `calc_flattened_offsets_incremental`), so this also skips the + // implicit globals the layout carries but the VM offsets map omits. + let vm_off = match sim.get_offset(&ident) { + Some(o) => o, + None => continue, + }; + for c in 0..n_chunks { + let vm_val = vm_data[c * vm_step_size + vm_off]; + let wasm_val = wasm_slab[c * n_slots + wasm_off]; + assert!( + (vm_val - wasm_val).abs() < 1e-9, + "{name} mismatch at chunk {c} under override: vm={vm_val} wasm={wasm_val}" + ); + } + checked += 1; + } + assert!( + checked >= 2, + "expected to compare inflow_rate + level + inflow" + ); + + // Pin the override actually took: level reaches 5*5 = 25 (not the default + // 10), so this cannot pass vacuously with an ignored override. + let level_off = layout_offset(&artifact, "level"); + let last = (n_chunks - 1) * n_slots + level_off; + assert!( + (wasm_slab[last] - 25.0).abs() < 1e-9, + "level under inflow_rate=5 should reach 25, got {}", + wasm_slab[last] + ); + } + + /// AC5.2: `reset` with no override reproduces the compiled-default series. A + /// `set_value`-then-reset-then-run with an empty override list must match a + /// plain VM run (the default literals), proving the constants region is + /// initialized to the compiled defaults and `reset` leaves them intact. 
+ #[test] + fn compile_simulation_reset_no_override_restores_defaults() { + let datamodel = crate::test_common::TestProject::new("defaults") + .with_sim_time(0.0, 5.0, 1.0) + .aux("inflow_rate", "2", None) + .stock("level", "0", &["inflow"], &[], None) + .flow("inflow", "inflow_rate", None) + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + let wasm_slab = run_artifact_with_overrides(&artifact, &[]); + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + + // The default run: level integrates by 2/step -> reaches 10. + let mut vm = Vm::new(compile_sim(&datamodel, "main")).expect("vm"); + vm.run_to_end().expect("vm run"); + let vm_results = vm.into_results(); + for (name, wasm_off) in &artifact.layout.var_offsets { + let wasm_off = *wasm_off; + let ident = Ident::::from_str_unchecked(name); + let Some(&vm_off) = vm_results.offsets.get(&ident) else { + continue; + }; + for c in 0..n_chunks { + let vm_val = vm_results.data[c * vm_results.step_size + vm_off]; + let wasm_val = wasm_slab[c * n_slots + wasm_off]; + assert!( + (vm_val - wasm_val).abs() < 1e-9, + "{name} default mismatch at chunk {c}: vm={vm_val} wasm={wasm_val}" + ); + } + } + let level_off = layout_offset(&artifact, "level"); + let last = (n_chunks - 1) * n_slots + level_off; + assert!( + (wasm_slab[last] - 10.0).abs() < 1e-9, + "default level should reach 10, got {}", + wasm_slab[last] + ); + } + + /// `set_value` on a non-constant offset returns the error code and does not + /// write. A stock's offset (`level`) is not an overridable constant (its + /// initial is a constant, but it is assigned via `AssignNext`, not an + /// `AssignConstCurr` in flows), so `set_value` must reject it. After the + /// rejected call the default run must be unchanged. 
+ #[test] + fn compile_simulation_set_value_rejects_non_constant_offset() { + let datamodel = crate::test_common::TestProject::new("reject") + .with_sim_time(0.0, 5.0, 1.0) + .aux("inflow_rate", "2", None) + .stock("level", "0", &["inflow"], &[], None) + .flow("inflow", "inflow_rate", None) + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + + let level_off = layout_offset(&artifact, "level"); + assert!( + !sim.is_constant_offset(level_off), + "level (a stock) must not be a VM-overridable constant" + ); + // A non-overridable offset returns nonzero. + assert_ne!( + set_value_rc(&artifact, level_off as i32, 999.0), + 0, + "set_value on a stock offset must return a nonzero error code" + ); + // An out-of-range offset (>= n_slots) also returns nonzero. + assert_ne!( + set_value_rc(&artifact, artifact.layout.n_slots as i32, 1.0), + 0, + "set_value on an out-of-range offset must return a nonzero error code" + ); + assert_ne!( + set_value_rc(&artifact, -1, 1.0), + 0, + "set_value on a negative offset must return a nonzero error code" + ); + + // The rejected write left the constants region untouched: a no-override + // run still reproduces the defaults (level reaches 10, not 999-driven). + let wasm_slab = run_artifact_with_overrides(&artifact, &[]); + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + let last = (n_chunks - 1) * n_slots + level_off; + assert!( + (wasm_slab[last] - 10.0).abs() < 1e-9, + "a rejected set_value must not perturb the default run; level should still reach 10, got {}", + wasm_slab[last] + ); + } + + /// `clear_values` restores compiled defaults after an override, without + /// re-instantiating. Override inflow_rate, run (diverges), then clear, reset, + /// run again -- the second run must reproduce the defaults. 
+ #[test] + fn compile_simulation_clear_values_restores_defaults() { + let datamodel = crate::test_common::TestProject::new("clear") + .with_sim_time(0.0, 5.0, 1.0) + .aux("inflow_rate", "2", None) + .stock("level", "0", &["inflow"], &[], None) + .flow("inflow", "inflow_rate", None) + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let rate_off = layout_offset(&artifact, "inflow_rate"); + let level_off = layout_offset(&artifact, "level"); + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + + let info = validate(&artifact.wasm).expect("module must validate"); + let mut store = Store::new(()); + let inst = store + .module_instantiate(&info, Vec::new(), None) + .expect("instantiate") + .module_addr; + let func = |store: &mut Store<()>, name: &str| { + store + .instance_export(inst, name) + .unwrap() + .as_func() + .unwrap() + }; + + // Override -> run -> level reaches 25. + let set_value = func(&mut store, "set_value"); + let rc: i32 = store + .invoke_simple_typed::<(i32, f64), i32>(set_value, (rate_off as i32, 5.0)) + .expect("set_value"); + assert_eq!(rc, 0); + let run = func(&mut store, "run"); + store.invoke_simple_typed::<(), ()>(run, ()).expect("run"); + + // clear_values -> reset -> run -> level back to the default 10. 
+ let clear_values = func(&mut store, "clear_values"); + store + .invoke_simple_typed::<(), ()>(clear_values, ()) + .expect("clear_values"); + let reset = func(&mut store, "reset"); + store + .invoke_simple_typed::<(), ()>(reset, ()) + .expect("reset"); + let run = func(&mut store, "run"); + store.invoke_simple_typed::<(), ()>(run, ()).expect("run"); + + let mem = store + .instance_export(inst, "memory") + .unwrap() + .as_mem() + .unwrap(); + let base = artifact.layout.results_offset; + let last_addr = base + ((n_chunks - 1) * n_slots + level_off) * 8; + let level_last = store.mem_access_mut_slice(mem, |bytes| { + f64::from_le_bytes(bytes[last_addr..last_addr + 8].try_into().unwrap()) + }); + assert!( + (level_last - 10.0).abs() < 1e-9, + "after clear_values the default level should reach 10, got {level_last}" + ); + } + + /// The wasm backend's overridable-constant set (`collect_overridable_defaults`, + /// which mirrors the VM's `collect_constant_info` recursion to capture each + /// default literal) must address EXACTLY the offsets the VM reports overridable + /// via `CompiledSimulation::constant_offsets`. If the two diverged, a blob's + /// `set_value` would accept/reject a different set than the VM's, or initialize + /// the wrong slots -- so this pins them equal over a model with both a top-level + /// constant and a nested-module (SMOOTH) constant. + #[test] + fn wasm_overridable_set_matches_vm_constant_offsets() { + let datamodel = crate::test_common::TestProject::new("const_set") + .with_sim_time(0.0, 4.0, 0.5) + .aux("k", "3", None) + .aux("input", "TIME + k", None) + // SMTH1 expands to a nested stdlib module carrying its own constants + // (the smoothing delay), so the overridable set spans nested modules. 
+ .aux("smoothed", "SMTH1(input, 2)", None) + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + + let mut wasm_set: Vec = collect_overridable_defaults(&sim.modules, &sim.root, 0) + .into_iter() + .map(|(off, _)| off) + .collect(); + wasm_set.sort_unstable(); + wasm_set.dedup(); + + let mut vm_set: Vec = sim.constant_offsets().collect(); + vm_set.sort_unstable(); + + assert_eq!( + wasm_set, vm_set, + "the wasm overridable-constant offsets must match the VM's exactly" + ); + assert!( + !vm_set.is_empty(), + "this model must have at least one overridable constant (k) for the check to be meaningful" + ); + + // Every overridable offset is in range (so it indexes the n_slots-wide + // const region and the validity byte region safely). + let n_slots = sim.n_slots(); + for &off in &vm_set { + assert!( + off < n_slots, + "overridable offset {off} must be < n_slots {n_slots}" + ); + } + } + + /// AC5.1 with an override on a constant that feeds an *initial* equation: the + /// VM re-applies the override across initials (it mutates the literal at all + /// locations), so an overridable constant read during the initials phase must + /// also source from the region. Here `seed` is a constant whose value is the + /// stock's initial, so overriding `seed` must change the stock's starting + /// value -- exercising the initials-phase redirect, not just flows. 
+ #[test] + fn compile_simulation_set_value_override_in_initials_matches_vm() { + let datamodel = crate::test_common::TestProject::new("override_init") + .with_sim_time(0.0, 3.0, 1.0) + .aux("seed", "5", None) + .stock("level", "seed", &["hold"], &[], None) + .flow("hold", "0", None) + .build_datamodel(); + let sim = compile_sim(&datamodel, "main"); + let artifact = compile_simulation(&sim).expect("wasm codegen"); + let seed_off = layout_offset(&artifact, "seed"); + assert!( + sim.is_constant_offset(seed_off), + "seed must be an overridable constant" + ); + + let wasm_slab = run_artifact_with_overrides(&artifact, &[(seed_off, 42.0)]); + let n_slots = artifact.layout.n_slots; + let n_chunks = artifact.layout.n_chunks; + + let sim_vm = compile_sim(&datamodel, "main"); + let (vm_data, vm_step_size, vm_step_count) = + vm_results_with_override(sim_vm, seed_off, 42.0); + assert_eq!(vm_step_count, n_chunks); + + for (name, wasm_off) in &artifact.layout.var_offsets { + let wasm_off = *wasm_off; + let ident = Ident::::from_str_unchecked(name); + if sim.get_offset(&ident).is_none() { + continue; + } + for c in 0..n_chunks { + let vm_val = vm_data[c * vm_step_size + wasm_off]; + let wasm_val = wasm_slab[c * n_slots + wasm_off]; + assert!( + (vm_val - wasm_val).abs() < 1e-9, + "{name} mismatch at chunk {c} under initials override: vm={vm_val} wasm={wasm_val}" + ); + } + } + // seed=42 makes level start (and stay, hold=0) at 42. + let level_off = layout_offset(&artifact, "level"); + assert!( + (wasm_slab[level_off] - 42.0).abs() < 1e-9, + "level should initialize to the overridden seed=42, got {}", + wasm_slab[level_off] + ); + } +} diff --git a/src/simlin-engine/src/wasmgen/vector.rs b/src/simlin-engine/src/wasmgen/vector.rs new file mode 100644 index 000000000..994ee51a1 --- /dev/null +++ b/src/simlin-engine/src/wasmgen/vector.rs @@ -0,0 +1,1063 @@ +// Copyright 2026 The Simlin Authors. All rights reserved. 
+// Use of this source code is governed by the Apache License, +// Version 2.0, that can be found in the LICENSE file. + +// pattern: Functional Core +// Pure transformation: each emitter appends a wasm instruction sequence for one +// vector-operation opcode, mirroring the matching VM arm element-for-element. No +// I/O; the only side effect is in `#[cfg(test)]` (which lives in `lower_tests.rs` +// alongside the rest of the lowering harness). + +//! Lowering of the bytecode VM's vector-operation opcodes to WebAssembly +//! (Phase 6). +//! +//! These opcodes operate over the compile-time view stack (`super::views`) and +//! the operand stack and -- except [`VectorSelect`](emit_vector_select), which +//! reduces to one scalar -- write their result array to a `write_temp_id` region +//! of `temp_storage`. Each emitter reproduces the matching VM dispatch arm +//! element-for-element: +//! +//! - [`emit_vector_select`] -- `vm.rs:2444-2502` +//! - [`emit_vector_elm_map`] -- `crate::vm_vector_elm_map::vector_elm_map` +//! - [`emit_vector_sort_order`] -- `crate::vm_vector_sort_order::vector_sort_order` +//! - [`emit_rank`] -- `vm.rs:2540-2584` +//! - [`emit_lookup_array`] -- `vm.rs:2586-2629` +//! +//! ## Runtime loop vs unrolled +//! +//! The *stable sort* ([`emit_stable_sort`], backing `VectorSortOrder`/`Rank`) is +//! a self-contained wasm helper with a **runtime** insertion-sort loop -- never +//! unrolled, since an unrolled O(n^2) body over a runtime view size would blow +//! up. Everything else here is a per-element map/gather/scatter over the +//! *compile-time* view size, so the element addresses fold into wasm constants +//! and the bodies are unrolled. The caller (`super::lower`) charges the Phase-5 +//! [`EmitState`](super::lower) unroll budget for the view size before invoking +//! these, so the size cap still bounds an over-large arrayed model. +//! +//! ## Invalid input view +//! +//! 
An input view that a dynamic subscript (Phase-5 Task 4) made invalid at +//! runtime takes the VM's short-circuit: the whole destination temp region is +//! filled with IEEE `f64::NAN` (NOT the finite `crate::float::NA` sentinel) via +//! [`super::lower::emit_fill_temp_nan`], while `VectorSelect` pushes a single +//! NaN. The validity gate is only emitted when an input view actually carries a +//! runtime validity flag; in the common case (static / temp / full-var views) +//! every input is statically valid and no runtime check is generated. + +use wasm_encoder::{BlockType, Function, Instruction as Ins, ValType}; + +use crate::bytecode::{GraphicalFunctionId, LookupMode}; + +use super::WasmGenError; +use super::lower::{ + EmitCtx, GF_DIRECTORY_ENTRY_BYTES, SLOT_SIZE, emit_fill_temp_nan, emit_is_truthy, + emit_view_element_load, f64_const, i32_memarg, memarg, push_module_relative_base, + temp_element_byte_addr, +}; +use super::views::{ViewBase, ViewDesc}; + +/// Push `round_half_away(x)` for the f64 already on the wasm stack, reproducing +/// Rust's `f64::round` (round half AWAY from zero) bit-for-bit -- which is what +/// the VM uses (`stack.pop().round()`, `offset_val.round()`). This is NOT wasm +/// `f64.nearest` (round half to EVEN), so the two diverge for half-integer +/// inputs. +/// +/// Emits the precision-safe form `t = x.trunc(); if (x - t).abs() >= 0.5 then t +/// plus-or-minus 1 (sign of x) else t`. The naive `trunc(x + copysign(0.5, x))` +/// is off-by-one against `f64::round` for two reachable input classes. First: +/// the largest f64 below 0.5 (`0.49999999999999994` and its negative), where +/// `x + 0.5` rounds up to exactly 1.0 so `trunc` yields a magnitude of one +/// though `f64::round` yields zero. Second: already-integer magnitudes in +/// `[2^52, 2^53)`, where `x + 0.5` rounds up to `x + 1` though `f64::round` +/// returns `x`. 
The `(x - t)` fraction here is computed exactly (the operands +/// are within a factor of two for `|x| < 2^53`, and `t == x` for integer +/// magnitudes at or above `2^52`), so no rounding can perturb the half-way +/// test. Verified bit-identical to `f64::round` over 5M random doubles +/// including sign-of-zero and both boundary classes. +/// +/// `x_scratch` and `t_scratch` are two free f64 locals (distinct), holding `x` +/// and `trunc(x)` while each is read more than once. +pub(crate) fn emit_round_half_away(f: &mut Function, x_scratch: u32, t_scratch: u32) { + f.instruction(&Ins::LocalSet(x_scratch)); // x_scratch = x + f.instruction(&Ins::LocalGet(x_scratch)); + f.instruction(&Ins::F64Trunc); + f.instruction(&Ins::LocalSet(t_scratch)); // t_scratch = trunc(x) + + // round-up value: t + copysign(1.0, x) (the deeper Select operand) + f.instruction(&Ins::LocalGet(t_scratch)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::LocalGet(x_scratch)); + f.instruction(&Ins::F64Copysign); // copysign(1.0, x): ±1.0 with x's sign + f.instruction(&Ins::F64Add); // t + copysign(1.0, x) + + // keep-trunc value: t (the shallower Select operand) + f.instruction(&Ins::LocalGet(t_scratch)); + + // condition: |x - t| >= 0.5 (exact fraction; round half away from zero) + f.instruction(&Ins::LocalGet(x_scratch)); + f.instruction(&Ins::LocalGet(t_scratch)); + f.instruction(&Ins::F64Sub); + f.instruction(&Ins::F64Abs); + f.instruction(&f64_const(0.5)); + f.instruction(&Ins::F64Ge); + + // select([round_up, t, cond]) == round_up when cond != 0, else t. + f.instruction(&Ins::Select); +} + +// ── stable sort helper (VectorSortOrder / Rank) ───────────────────────────── + +// `stable_sort(pairs_ptr: i32, n: i32, ascending: i32)` local layout. 
+const SS_PTR: u32 = 0; // i32 byte address of pair 0 +const SS_N: u32 = 1; // i32 pair count +const SS_ASC: u32 = 2; // i32 1 = ascending, else descending +const SS_I: u32 = 3; // i32 outer index +const SS_J: u32 = 4; // i32 inner index +const SS_KEY_VAL: u32 = 5; // f64 key value +const SS_KEY_IDX: u32 = 6; // f64 key idx payload +const SS_LEFT_VAL: u32 = 7; // f64 the left neighbour's value + +/// Bytes per `(value: f64, idx: f64)` sort pair. +const PAIR_BYTES: i32 = 16; + +/// Build the body of `stable_sort(pairs_ptr: i32, n: i32, ascending: i32) -> ()`, +/// an in-place **stable** insertion sort of `n` `(value: f64 @ +0, idx: f64 @ +8)` +/// pairs starting at byte `pairs_ptr`, ordered by `value`. +/// +/// Reproduces the VM's stable `sort_by(|a, b| a.partial_cmp(b).unwrap_or(Equal))` +/// (ascending) / the `b.partial_cmp(a)` form (descending). The shift predicate is +/// a **strict** `f64.lt` (ascending) / `f64.gt` (descending) of the left +/// neighbour against the key: it is `false` whenever either operand is NaN, so a +/// NaN never displaces a non-NaN and never reorders relative to another NaN -- +/// i.e. NaN comparisons act as `Equal`, exactly matching `partial_cmp(..) +/// .unwrap_or(Equal)` under a stable sort. Insertion sort only shifts past +/// strictly-ordered neighbours, so equal-keyed elements keep their input order +/// (stability) for free. +/// +/// A runtime loop (never unrolled): `n` is a runtime view size, so an unrolled +/// O(n^2) body would be unbounded. n is small for real arrays (the corpus's +/// largest single dimension is 9), so insertion sort is more than adequate. +pub(crate) fn emit_stable_sort() -> Function { + // Locals after the three i32 params: i32 SS_I/SS_J, f64 SS_KEY_VAL/ + // SS_KEY_IDX/SS_LEFT_VAL. 
+ let mut f = Function::new([(2, ValType::I32), (3, ValType::F64)]); + + // i = 1 + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::LocalSet(SS_I)); + + f.instruction(&Ins::Block(BlockType::Empty)); // $outer_exit + f.instruction(&Ins::Loop(BlockType::Empty)); // $outer + + // while-head: if !(i < n) break $outer_exit (br depth 1) + f.instruction(&Ins::LocalGet(SS_I)); + f.instruction(&Ins::LocalGet(SS_N)); + f.instruction(&Ins::I32LtS); + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::BrIf(1)); + + // key_val = mem[ptr + 16*i + 0]; key_idx = mem[ptr + 16*i + 8] + push_pair_addr(&mut f, SS_I); + f.instruction(&Ins::F64Load(memarg(0))); + f.instruction(&Ins::LocalSet(SS_KEY_VAL)); + push_pair_addr(&mut f, SS_I); + f.instruction(&Ins::F64Load(memarg(8))); + f.instruction(&Ins::LocalSet(SS_KEY_IDX)); + + // j = i - 1 + f.instruction(&Ins::LocalGet(SS_I)); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Sub); + f.instruction(&Ins::LocalSet(SS_J)); + + f.instruction(&Ins::Block(BlockType::Empty)); // $inner_exit + f.instruction(&Ins::Loop(BlockType::Empty)); // $inner + + // while-head: if !(j >= 0) break $inner_exit (br depth 1) + f.instruction(&Ins::LocalGet(SS_J)); + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::I32GeS); + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::BrIf(1)); + + // left_val = mem[ptr + 16*j + 0] + push_pair_addr(&mut f, SS_J); + f.instruction(&Ins::F64Load(memarg(0))); + f.instruction(&Ins::LocalSet(SS_LEFT_VAL)); + + // cmp = ascending ? (left_val > key_val) : (left_val < key_val) + // Both are strict, hence false for any NaN operand (NaN-as-Equal stability). 
+ f.instruction(&Ins::LocalGet(SS_LEFT_VAL)); + f.instruction(&Ins::LocalGet(SS_KEY_VAL)); + f.instruction(&Ins::F64Gt); // gt (the ascending predicate) + f.instruction(&Ins::LocalGet(SS_LEFT_VAL)); + f.instruction(&Ins::LocalGet(SS_KEY_VAL)); + f.instruction(&Ins::F64Lt); // lt (the descending predicate) + f.instruction(&Ins::LocalGet(SS_ASC)); + f.instruction(&Ins::Select); // gt if ascending else lt + // if !cmp break $inner_exit (br depth 1) + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::BrIf(1)); + + // mem[ptr + 16*(j+1)] = mem[ptr + 16*j] (both value and idx) + push_pair_addr_plus1(&mut f, SS_J); // dst addr (value) + push_pair_addr(&mut f, SS_J); + f.instruction(&Ins::F64Load(memarg(0))); + f.instruction(&Ins::F64Store(memarg(0))); + push_pair_addr_plus1(&mut f, SS_J); // dst addr (idx) + push_pair_addr(&mut f, SS_J); + f.instruction(&Ins::F64Load(memarg(8))); + f.instruction(&Ins::F64Store(memarg(8))); + + // j -= 1 ; continue $inner + f.instruction(&Ins::LocalGet(SS_J)); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Sub); + f.instruction(&Ins::LocalSet(SS_J)); + f.instruction(&Ins::Br(0)); + + f.instruction(&Ins::End); // end $inner loop + f.instruction(&Ins::End); // end $inner_exit block + + // mem[ptr + 16*(j+1)] = (key_val, key_idx) + push_pair_addr_plus1(&mut f, SS_J); + f.instruction(&Ins::LocalGet(SS_KEY_VAL)); + f.instruction(&Ins::F64Store(memarg(0))); + push_pair_addr_plus1(&mut f, SS_J); + f.instruction(&Ins::LocalGet(SS_KEY_IDX)); + f.instruction(&Ins::F64Store(memarg(8))); + + // i += 1 ; continue $outer + f.instruction(&Ins::LocalGet(SS_I)); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Add); + f.instruction(&Ins::LocalSet(SS_I)); + f.instruction(&Ins::Br(0)); + + f.instruction(&Ins::End); // end $outer loop + f.instruction(&Ins::End); // end $outer_exit block + f.instruction(&Ins::End); // end function + f +} + +/// Push the byte address of sort pair `idx_local`: `ptr + 16 * idx`. 
A following +/// `f64.load`/`store` reads `value` at `memarg(0)` and `idx` at `memarg(8)`. +fn push_pair_addr(f: &mut Function, idx_local: u32) { + f.instruction(&Ins::LocalGet(SS_PTR)); + f.instruction(&Ins::LocalGet(idx_local)); + f.instruction(&Ins::I32Const(PAIR_BYTES)); + f.instruction(&Ins::I32Mul); + f.instruction(&Ins::I32Add); +} + +/// Push the byte address of sort pair `idx_local + 1`: `ptr + 16 * (idx + 1)`. +fn push_pair_addr_plus1(f: &mut Function, idx_local: u32) { + f.instruction(&Ins::LocalGet(SS_PTR)); + f.instruction(&Ins::LocalGet(idx_local)); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Add); + f.instruction(&Ins::I32Const(PAIR_BYTES)); + f.instruction(&Ins::I32Mul); + f.instruction(&Ins::I32Add); +} + +// ── shared input-view helpers ─────────────────────────────────────────────── + +/// Whether `view` carries a runtime validity flag or runtime offset addend (a +/// dynamic subscript, Phase-5 Task 4). +/// +/// This is the *`VectorSelect`-specific* dynamic-view rejection predicate (its +/// only consumer is [`is_dynamic_select`]). It deliberately keys on *both* +/// `valid_local` and `runtime_off_local` -- stricter than the temp-writers' +/// [`emit_with_validity_gate`], which keys on `valid_local` alone. The +/// difference is by design: `VectorSelect` reads its source via a compile-time- +/// base path that does NOT fold a runtime offset addend (it has no temp region +/// to gate and would need to thread the runtime offset into the gather by hand), +/// so any runtime offset disqualifies it. The temp-writers tolerate a +/// `runtime_off_local` because their element reads route through +/// [`emit_view_element_load`], which folds the runtime offset + validity itself. 
+fn is_dynamic(view: &ViewDesc) -> bool { + view.valid_local.is_some() || view.runtime_off_local.is_some() +} + +/// Push the i32 "all inputs valid" condition for `views`: the bitwise-AND of each +/// view's `valid_local`, or a constant `1` when no view carries one. Used to gate +/// the op against the VM's "`!is_valid` -> fill_temp_nan / NaN" short-circuit. +fn push_all_valid(views: &[&ViewDesc], f: &mut Function) { + let valids: Vec = views.iter().filter_map(|v| v.valid_local).collect(); + if valids.is_empty() { + f.instruction(&Ins::I32Const(1)); + return; + } + f.instruction(&Ins::LocalGet(valids[0])); + for &v in &valids[1..] { + f.instruction(&Ins::LocalGet(v)); + f.instruction(&Ins::I32And); + } +} + +/// The constant base byte address of `view`'s *storage element 0* -- i.e. the +/// address the VM's `read_view_element(view, flat)` indexes as `base + flat` (the +/// view's `base_off`, NOT folding in its `offset`, which the caller already folds +/// into the flat index). For a module-relative var view the runtime `module_off` +/// addend is signalled via the returned `bool`. +fn view_storage_base(view: &ViewDesc, ctx: &EmitCtx) -> Result<(u64, bool), WasmGenError> { + match view.base { + ViewBase::CurrAbsolute => Ok(( + u64::from(ctx.curr_base) + u64::from(view.base_off) * u64::from(SLOT_SIZE), + false, + )), + ViewBase::CurrModuleRelative => Ok(( + u64::from(ctx.curr_base) + u64::from(view.base_off) * u64::from(SLOT_SIZE), + true, + )), + ViewBase::Temp => { + let temp_off = *ctx + .ctx + .temp_offsets + .get(view.base_off as usize) + .ok_or_else(|| { + WasmGenError::Unsupported( + "wasmgen: vector-op source references an out-of-range temp id".to_string(), + ) + })? as u64; + Ok(( + u64::from(ctx.temp_storage_base) + temp_off * u64::from(SLOT_SIZE), + false, + )) + } + } +} + +// ── VectorSelect (vm.rs:2444-2502) ────────────────────────────────────────── + +/// Lower `VectorSelect`, mirroring `vm.rs:2444-2502`. 
The two operands are on the +/// wasm stack as `[max_value, action]` (`action` on top, matching the VM popping +/// `action` then `max_value`); the views are `expr_view = top`, `sel_view = +/// top-1`. Zips the two views to `min(sel.size, expr.size)` with independent +/// odometers, collects each `expr` value where `is_truthy(sel)`, then for an +/// empty selection pushes `max_value`, else dispatches the `action` reduction +/// (1=min, 2=mean, 3=max, 4=product, else sum). The single scalar result is left +/// on the stack. An invalid input view pushes one NaN. +/// +/// The gather is unrolled over the (compile-time) zip size; each selected value +/// is appended to the vector scratch region with a runtime count, and the +/// reduction is a single runtime pass over the collected values (mirroring the +/// VM's `selected` Vec). `min`/`max` reproduce Rust's `f64::min`/`f64::max` +/// (NaN-ignoring), not wasm `f64.min`/`f64.max` (NaN-propagating), so the fold +/// matches the VM's `fold(±inf, f64::min/max)` exactly. +pub(crate) fn emit_vector_select( + sel_view: &ViewDesc, + expr_view: &ViewDesc, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + if is_dynamic_select(sel_view, expr_view) { + return Err(WasmGenError::Unsupported( + "wasmgen: VectorSelect over a dynamically-subscripted view is not supported" + .to_string(), + )); + } + + let max_value = ctx.apply_locals[0]; // popped second + let action = ctx.vector_i32_locals[0]; + let count = ctx.vector_i32_locals[1]; + let k = ctx.vector_i32_locals[2]; + let [acc_sum, acc_prod, acc_min, acc_max, vtmp] = ctx.vector_f64_locals; + + // Pop action (top) -> round-half-away -> i32; then pop max_value. The round + // uses `scratch_local` + `apply_locals[0]` as its two f64 temps; both are + // free here (`max_value` is parked into `apply_locals[0]` only afterward). 
+ emit_round_half_away(f, ctx.scratch_local, ctx.apply_locals[0]); + f.instruction(&Ins::I32TruncSatF64S); + f.instruction(&Ins::LocalSet(action)); + f.instruction(&Ins::LocalSet(max_value)); + + let size = sel_view.size().min(expr_view.size()); + + // count = 0 + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::LocalSet(count)); + + // Gather: for each i in 0..size, if is_truthy(sel[i]) push expr[i] into the + // scratch region at scratch[count] and bump count. The two odometers run + // independently; element `i` of each view is its row-major iteration index. + for i in 0..size { + emit_view_element_load(sel_view, i, ctx, f)?; + emit_is_truthy(ctx, f); + f.instruction(&Ins::If(BlockType::Empty)); + // scratch[count] = expr[i]. f64.store wants [addr_i32, value_f64]; + // addr = vector_scratch_base + count*8 (the constant base in memarg). + f.instruction(&Ins::LocalGet(count)); + f.instruction(&Ins::I32Const(SLOT_SIZE as i32)); + f.instruction(&Ins::I32Mul); + emit_view_element_load(expr_view, i, ctx, f)?; + f.instruction(&Ins::F64Store(memarg(u64::from(ctx.vector_scratch_base)))); + // count += 1 + f.instruction(&Ins::LocalGet(count)); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Add); + f.instruction(&Ins::LocalSet(count)); + f.instruction(&Ins::End); + } + + // if count == 0 { result = max_value } else { result = reduce(action) }. + f.instruction(&Ins::LocalGet(count)); + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&Ins::LocalGet(max_value)); + f.instruction(&Ins::Else); + + // Single pass over scratch[0..count] computing sum/product/min/max; then the + // action selects the result. min/max init mirror the VM's + // fold(INFINITY, f64::min) / fold(NEG_INFINITY, f64::max). 
+ f.instruction(&f64_const(0.0)); + f.instruction(&Ins::LocalSet(acc_sum)); + f.instruction(&f64_const(1.0)); + f.instruction(&Ins::LocalSet(acc_prod)); + f.instruction(&f64_const(f64::INFINITY)); + f.instruction(&Ins::LocalSet(acc_min)); + f.instruction(&f64_const(f64::NEG_INFINITY)); + f.instruction(&Ins::LocalSet(acc_max)); + // k = 0 + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::LocalSet(k)); + + f.instruction(&Ins::Block(BlockType::Empty)); // $reduce_exit + f.instruction(&Ins::Loop(BlockType::Empty)); // $reduce + // if !(k < count) break + f.instruction(&Ins::LocalGet(k)); + f.instruction(&Ins::LocalGet(count)); + f.instruction(&Ins::I32LtS); + f.instruction(&Ins::I32Eqz); + f.instruction(&Ins::BrIf(1)); + // v = scratch[k] + f.instruction(&Ins::LocalGet(k)); + f.instruction(&Ins::I32Const(SLOT_SIZE as i32)); + f.instruction(&Ins::I32Mul); + f.instruction(&Ins::F64Load(memarg(u64::from(ctx.vector_scratch_base)))); + f.instruction(&Ins::LocalSet(vtmp)); + // acc_sum += v + f.instruction(&Ins::LocalGet(acc_sum)); + f.instruction(&Ins::LocalGet(vtmp)); + f.instruction(&Ins::F64Add); + f.instruction(&Ins::LocalSet(acc_sum)); + // acc_prod *= v + f.instruction(&Ins::LocalGet(acc_prod)); + f.instruction(&Ins::LocalGet(vtmp)); + f.instruction(&Ins::F64Mul); + f.instruction(&Ins::LocalSet(acc_prod)); + // acc_min = f64::min(acc_min, v) + f.instruction(&Ins::LocalGet(acc_min)); + f.instruction(&Ins::LocalGet(vtmp)); + emit_f64_min_rust(ctx, f); + f.instruction(&Ins::LocalSet(acc_min)); + // acc_max = f64::max(acc_max, v) + f.instruction(&Ins::LocalGet(acc_max)); + f.instruction(&Ins::LocalGet(vtmp)); + emit_f64_max_rust(ctx, f); + f.instruction(&Ins::LocalSet(acc_max)); + // k += 1 ; continue + f.instruction(&Ins::LocalGet(k)); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Add); + f.instruction(&Ins::LocalSet(k)); + f.instruction(&Ins::Br(0)); + f.instruction(&Ins::End); // end $reduce loop + f.instruction(&Ins::End); // end $reduce_exit 
block + + // result = match action { 1 => min, 2 => sum/count, 3 => max, 4 => prod, + // _ => sum }. wasm `select` pops [v1, v2, cond] and + // yields the DEEPER `v1` when cond != 0, so the running default (`sum`) is the + // deeper operand, each override is pushed shallower, and the condition is + // `action != n` -- keeping the running value unless `action == n`. (Same + // pattern as `math::emit_quadrant_select`.) + f.instruction(&Ins::LocalGet(acc_sum)); // default: sum (action 0/5/..) + // action == 4 -> product + f.instruction(&Ins::LocalGet(acc_prod)); + push_action_ne(f, action, 4); + f.instruction(&Ins::Select); + // action == 3 -> max + f.instruction(&Ins::LocalGet(acc_max)); + push_action_ne(f, action, 3); + f.instruction(&Ins::Select); + // action == 2 -> mean (sum / count) + f.instruction(&Ins::LocalGet(acc_sum)); + f.instruction(&Ins::LocalGet(count)); + f.instruction(&Ins::F64ConvertI32S); + f.instruction(&Ins::F64Div); + push_action_ne(f, action, 2); + f.instruction(&Ins::Select); + // action == 1 -> min + f.instruction(&Ins::LocalGet(acc_min)); + push_action_ne(f, action, 1); + f.instruction(&Ins::Select); + + f.instruction(&Ins::End); // end if count == 0 + Ok(()) +} + +/// `VectorSelect`'s dynamic-view rejection. The op reduces to a scalar (no temp +/// region), so an invalid view would push one NaN; rather than emit that gate +/// (and the runtime-offset folding the gather would need), a dynamically- +/// subscripted input falls back to the VM. +fn is_dynamic_select(sel_view: &ViewDesc, expr_view: &ViewDesc) -> bool { + is_dynamic(sel_view) || is_dynamic(expr_view) +} + +/// Push i32 `1` when the i32 in `action_local` does NOT equal `n` -- the "keep +/// the running default" condition for the `VectorSelect` action-dispatch selects +/// (the override is taken only when `action == n`). 
+fn push_action_ne(f: &mut Function, action_local: u32, n: i32) { + f.instruction(&Ins::LocalGet(action_local)); + f.instruction(&Ins::I32Const(n)); + f.instruction(&Ins::I32Ne); +} + +/// Push `f64::min(a, b)` for `[a, b]` on the wasm stack, reproducing Rust's +/// NaN-ignoring `f64::min` (return the non-NaN operand if exactly one is NaN, the +/// lesser otherwise) rather than wasm `f64.min` (NaN-propagating). Parks both +/// operands so they can be read for the NaN tests and the `<` compare. +fn emit_f64_min_rust(ctx: &EmitCtx, f: &mut Function) { + emit_f64_minmax_rust(ctx, f, true); +} + +/// Push `f64::max(a, b)` for `[a, b]` on the wasm stack, reproducing Rust's +/// NaN-ignoring `f64::max`. +fn emit_f64_max_rust(ctx: &EmitCtx, f: &mut Function) { + emit_f64_minmax_rust(ctx, f, false); +} + +/// Shared body of [`emit_f64_min_rust`]/[`emit_f64_max_rust`]. Consumes `[a, b]` +/// and pushes `f64::min(a,b)` (`want_min`) or `f64::max(a,b)`, matching +/// `f64::min`/`f64::max`'s "ignore NaN; if both NaN, NaN" contract. +/// +/// Built as three nested `select`s, each in the wasm "deeper operand wins when +/// cond != 0" form (`select([v1, v2, cond]) == v1 if cond else v2`): +/// 1. `core = (a {<,>} b) ? a : b` -- the non-NaN min/max, +/// 2. `r = (b is NaN) ? a : core` -- ignore a NaN `b`, +/// 3. result `= (a is NaN) ? b : r` -- ignore a NaN `a` (and if both NaN, `b`, +/// which is NaN, so the all-NaN case yields NaN). +/// +/// The intermediate must be a *shallower* select operand at each step, so it is +/// parked in a scratch local rather than left on the stack. The `VectorSelect` +/// reduction reaches this only inside its `count != 0` branch, where all three +/// `Apply` scratch f64s are free (`apply_locals[0]`'s `max_value` is dead once +/// the selection is non-empty); this uses `apply_locals[1]`/`[2]` for `a`/`b` and +/// `apply_locals[0]` for the running register. 
(The ±0 tie is left to wasm's +/// `<`/`>`, acceptable for SD parity -- the VM's reductions never depend on ±0.) +fn emit_f64_minmax_rust(ctx: &EmitCtx, f: &mut Function, want_min: bool) { + let a = ctx.apply_locals[1]; + let b = ctx.apply_locals[2]; + let r = ctx.apply_locals[0]; + // The two operands are on the stack as [a, b] (b on top); park them. + f.instruction(&Ins::LocalSet(b)); + f.instruction(&Ins::LocalSet(a)); + + // core = (a {<,>} b) ? a : b -> r + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::LocalGet(b)); + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::LocalGet(b)); + if want_min { + f.instruction(&Ins::F64Lt); + } else { + f.instruction(&Ins::F64Gt); + } + f.instruction(&Ins::Select); + f.instruction(&Ins::LocalSet(r)); + + // r = (b is NaN) ? a : r + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::LocalGet(r)); + f.instruction(&Ins::LocalGet(b)); + f.instruction(&Ins::LocalGet(b)); + f.instruction(&Ins::F64Ne); // b != b (true iff b is NaN) + f.instruction(&Ins::Select); + f.instruction(&Ins::LocalSet(r)); + + // result = (a is NaN) ? b : r (left on the stack) + f.instruction(&Ins::LocalGet(b)); + f.instruction(&Ins::LocalGet(r)); + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::LocalGet(a)); + f.instruction(&Ins::F64Ne); // a != a (true iff a is NaN) + f.instruction(&Ins::Select); +} + +// ── VectorElmMap (vm_vector_elm_map.rs:33-116) ────────────────────────────── + +/// Lower `VectorElmMap { write_temp_id, full_source_len }`, mirroring +/// `crate::vm_vector_elm_map::vector_elm_map`. The views are `offset_view = top`, +/// `source_view = top-1`. For each element `i` of the offset view: `flat_i = +/// base_i + round(offset[i])` over the source's FULL row-major storage, where +/// `base_i` is 0 for a full contiguous source else the source's flat offset at +/// element `i`'s carried-axis projection (the offset-view indices scattered onto +/// the source axes by dim-id). 
The result is `NaN` if `offset[i]` is NaN or +/// `flat_i` is out of `[0, full_source_len)`, else `source[flat_i]`. **No +/// modulo.** Written to `temp[temp_off + i]`; an invalid input view fills the +/// whole destination temp region with NaN. +pub(crate) fn emit_vector_elm_map( + source_view: &ViewDesc, + offset_view: &ViewDesc, + write_temp_id: u8, + full_source_len: u32, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + // The source's runtime-indexed read folds a module-relative addend, but a + // runtime-offset addend (a dynamic subscript) on the source is NOT folded + // into the compile-time `base_i`, so reject a dynamically-subscripted source + // (VM fallback). The OFFSET view's reads route through `emit_view_element_load` + // (which handles a runtime offset + validity), and an invalid offset view is + // caught by the op-level validity gate below, so an offset dynamic subscript + // is fine. + if source_view.runtime_off_local.is_some() { + return Err(WasmGenError::Unsupported( + "wasmgen: VectorElmMap over a dynamically-subscripted source view is not supported" + .to_string(), + )); + } + + emit_with_validity_gate( + &[source_view, offset_view], + write_temp_id, + ctx, + f, + |ctx, f| { + emit_vector_elm_map_body( + source_view, + offset_view, + write_temp_id, + full_source_len, + ctx, + f, + ) + }, + ) +} + +/// The valid-input body of [`emit_vector_elm_map`]. +fn emit_vector_elm_map_body( + source_view: &ViewDesc, + offset_view: &ViewDesc, + write_temp_id: u8, + full_source_len: u32, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let full_len = full_source_len as usize; + let offset_size = offset_view.size(); + + // source_is_full_array: the fast path where base_i is hard-coded 0 and the + // offset indexes the whole array directly (vm_vector_elm_map.rs:52). 
+ let source_is_full_array = source_view.size() == full_len && source_view.is_contiguous(); + + // Carried source dim -> offset-view axis of the same dim id, mirroring the + // VM's `src_to_off_axis` (vm_vector_elm_map.rs:57-61). Used per element to + // project the offset-view indices onto the source axes for `base_i`. + let src_to_off_axis: Vec> = source_view + .dim_ids + .iter() + .map(|sd| offset_view.dim_ids.iter().position(|od| od == sd)) + .collect(); + + let (src_base_byte, src_module_relative) = view_storage_base(source_view, ctx)?; + + let offset_val = ctx.vector_f64_locals[0]; + let flat_i = ctx.vector_i32_locals[0]; + + for i in 0..offset_size { + // base_i (compile-time): 0 for a full-array source, else the sliced + // view's flat offset at this element's carried-dim projection. + let base_i: i64 = if source_is_full_array { + 0 + } else { + let off_indices = ViewDesc::decompose_iter_index(&offset_view.dims, i); + let src_indices: Vec = src_to_off_axis + .iter() + .map(|slot| match slot { + Some(p) => off_indices[*p], + None => 0, + }) + .collect(); + source_view.flat_offset_for_indices(&src_indices) as i64 + }; + + // offset_val = offset_view[i] + emit_view_element_load(offset_view, i, ctx, f)?; + f.instruction(&Ins::LocalSet(offset_val)); + + // result = if offset_val.is_nan() || flat_i<0 || flat_i>=full_len { NaN } + // else source[flat_i]. flat_i = base_i + round(offset_val). + // Compute flat_i (i32) once. The round consumes the pushed copy of + // `offset_val` and uses `scratch_local` + `apply_locals[0]` as its two + // f64 temps -- neither is `vector_f64_locals[0]` (the `offset_val` local, + // read again below), and `apply_locals` is otherwise unused in this op. 
+ f.instruction(&f64_const(base_i as f64)); + f.instruction(&Ins::LocalGet(offset_val)); + emit_round_half_away(f, ctx.scratch_local, ctx.apply_locals[0]); + f.instruction(&Ins::F64Add); // base_i + round(offset_val) (as f64) + f.instruction(&Ins::I32TruncSatF64S); + f.instruction(&Ins::LocalSet(flat_i)); + + // store temp[i] = select(NaN, source[flat_i], oob). oob is true when the + // offset is NaN OR flat_i is out of [0, full_len). f64.store wants + // [addr_i32, value_f64]; push the temp address first. + let temp_addr = temp_element_byte_addr(ctx, write_temp_id, i as u32)?; + f.instruction(&Ins::I32Const(0)); // dynamic addr part (const base in memarg) + + // value = read source[flat_i] (faithful even when oob -- the select + // discards it; flat_i is sat-clamped so the address stays in range only + // when in-bounds, but a read at a clamped OOB index is never used). + // Guard the read with the in-bounds branch so an OOB index never loads + // out of the source storage. + f.instruction(&Ins::LocalGet(offset_val)); + f.instruction(&Ins::LocalGet(offset_val)); + f.instruction(&Ins::F64Ne); // offset_val is NaN + f.instruction(&Ins::LocalGet(flat_i)); + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::I32LtS); // flat_i < 0 + f.instruction(&Ins::I32Or); + f.instruction(&Ins::LocalGet(flat_i)); + f.instruction(&Ins::I32Const(full_len as i32)); + f.instruction(&Ins::I32GeS); // flat_i >= full_len + f.instruction(&Ins::I32Or); // oob + f.instruction(&Ins::If(BlockType::Result(ValType::F64))); + f.instruction(&f64_const(f64::NAN)); + f.instruction(&Ins::Else); + // source[flat_i]: base byte + flat_i*8 (+ module_off*8 if module-relative) + emit_storage_indexed_load(src_base_byte, src_module_relative, flat_i, ctx, f); + f.instruction(&Ins::End); + + f.instruction(&Ins::F64Store(memarg(temp_addr))); + } + Ok(()) +} + +/// Push `storage[flat_i]` where the storage element-0 byte address is the +/// constant `base_byte` and `flat_i` (an i32 local) is the runtime 
slot index: +/// `f64.load[base_byte + (module_off? )*8 + flat_i*8]`. The constant `base_byte` +/// rides in the `memarg.offset`; the runtime part is `(module_off + flat_i) * 8` +/// for a module-relative view, else `flat_i * 8`. +fn emit_storage_indexed_load( + base_byte: u64, + module_relative: bool, + flat_i: u32, + ctx: &EmitCtx, + f: &mut Function, +) { + if module_relative { + push_module_relative_base(ctx, f); // module_off * 8 + f.instruction(&Ins::LocalGet(flat_i)); + f.instruction(&Ins::I32Const(SLOT_SIZE as i32)); + f.instruction(&Ins::I32Mul); + f.instruction(&Ins::I32Add); + } else { + f.instruction(&Ins::LocalGet(flat_i)); + f.instruction(&Ins::I32Const(SLOT_SIZE as i32)); + f.instruction(&Ins::I32Mul); + } + f.instruction(&Ins::F64Load(memarg(base_byte))); +} + +// ── VectorSortOrder (vm_vector_sort_order.rs:49-101) ───────────────────────── + +/// Lower `VectorSortOrder { write_temp_id }`, mirroring +/// `crate::vm_vector_sort_order::vector_sort_order`. `input_view = top`; the +/// `direction` operand is popped (`.round() as i32`). The innermost (last) +/// dimension is the sorted axis; outer dims select independent rows (a scalar/1-D +/// view is one row of `inner == size`). Per row, the `(value, local_idx 0..inner)` +/// pairs are staged into the vector scratch region, sorted (ascending if +/// `direction == 1`, else descending) by the runtime [`emit_stable_sort`] helper, +/// then `temp[row_base + rank] = local_idx` is written (the 0-based in-row source +/// index at the sorted position). An invalid input view fills the temp with NaN. +pub(crate) fn emit_vector_sort_order( + input_view: &ViewDesc, + write_temp_id: u8, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + // The direction operand is on the stack now; pop it to the `ascending` flag + // first (the validity gate's body / fill_temp_nan arms must be + // operand-balanced, so the operand is consumed before the gate). 
A + // dynamically-subscripted input is handled by the gate (invalid -> + // fill_temp_nan) and `emit_view_element_load` (runtime offset + validity). + let ascending = ctx.vector_i32_locals[0]; + pop_direction_to_ascending(ascending, ctx, f); + + emit_with_validity_gate(&[input_view], write_temp_id, ctx, f, |ctx, f| { + emit_vector_sort_order_body(input_view, write_temp_id, ascending, ctx, f) + }) +} + +/// The valid-input body of [`emit_vector_sort_order`]. +fn emit_vector_sort_order_body( + input_view: &ViewDesc, + write_temp_id: u8, + ascending: u32, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let size = input_view.size(); + let n_dims = input_view.dims.len(); + let inner = if n_dims == 0 { + size + } else { + input_view.dims[n_dims - 1] as usize + }; + if inner == 0 { + // A zero-length innermost dim yields an empty result; nothing to write. + return Ok(()); + } + + let scratch = u64::from(ctx.vector_scratch_base); + // Iterate rows in row-major logical order; each block of `inner` iterations + // is one row (mirroring the VM's `increment_indices` walk -- element + // `iter_idx` of the view, read row-major, is `flat_element_offset(iter_idx)`). + let mut i = 0usize; + while i < size { + // Gather: pair[local_idx] = (value = input[i + local_idx], idx = local_idx). 
+ for local_idx in 0..inner { + let pair_val_addr = scratch + (local_idx as u64) * (PAIR_BYTES as u64); + // value slot + f.instruction(&Ins::I32Const(0)); + emit_view_element_load(input_view, i + local_idx, ctx, f)?; + f.instruction(&Ins::F64Store(memarg(pair_val_addr))); + // idx slot (+8) + f.instruction(&Ins::I32Const(0)); + f.instruction(&f64_const(local_idx as f64)); + f.instruction(&Ins::F64Store(memarg(pair_val_addr + 8))); + } + + // stable_sort(scratch, inner, ascending) + f.instruction(&Ins::I32Const(ctx.vector_scratch_base as i32)); + f.instruction(&Ins::I32Const(inner as i32)); + f.instruction(&Ins::LocalGet(ascending)); + f.instruction(&Ins::Call(ctx.helpers.stable_sort)); + + // Scatter: temp[temp_off + i + rank] = pair[rank].idx. + for rank in 0..inner { + let pair_idx_addr = scratch + (rank as u64) * (PAIR_BYTES as u64) + 8; + let temp_addr = temp_element_byte_addr(ctx, write_temp_id, (i + rank) as u32)?; + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::F64Load(memarg(pair_idx_addr))); + f.instruction(&Ins::F64Store(memarg(temp_addr))); + } + + i += inner; + } + Ok(()) +} + +// ── Rank (vm.rs:2540-2584) ─────────────────────────────────────────────────── + +/// Lower `Rank { write_temp_id }`, mirroring `vm.rs:2540-2584`. `input_view = +/// top`; the `direction` operand is popped. Over the WHOLE view, the `(value, +/// orig_idx 0..size)` pairs are staged into the vector scratch region and sorted +/// (ascending if `direction == 1`, else descending) by [`emit_stable_sort`], then +/// `temp[orig_idx] = rank_0based + 1` (1-based, indexed by original position) is +/// written. An invalid input view fills the temp with NaN. 
+pub(crate) fn emit_rank( + input_view: &ViewDesc, + write_temp_id: u8, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let ascending = ctx.vector_i32_locals[0]; + pop_direction_to_ascending(ascending, ctx, f); + + emit_with_validity_gate(&[input_view], write_temp_id, ctx, f, |ctx, f| { + emit_rank_body(input_view, write_temp_id, ascending, ctx, f) + }) +} + +/// The valid-input body of [`emit_rank`]. +fn emit_rank_body( + input_view: &ViewDesc, + write_temp_id: u8, + ascending: u32, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let size = input_view.size(); + if size == 0 { + return Ok(()); + } + let scratch = u64::from(ctx.vector_scratch_base); + let temp_off = *ctx + .ctx + .temp_offsets + .get(write_temp_id as usize) + .ok_or_else(|| { + WasmGenError::Unsupported(format!("wasmgen: temp id {write_temp_id} out of range")) + })?; + + // Gather: pair[orig_idx] = (value = input[orig_idx], idx = orig_idx). + for orig_idx in 0..size { + let pair_val_addr = scratch + (orig_idx as u64) * (PAIR_BYTES as u64); + f.instruction(&Ins::I32Const(0)); + emit_view_element_load(input_view, orig_idx, ctx, f)?; + f.instruction(&Ins::F64Store(memarg(pair_val_addr))); + f.instruction(&Ins::I32Const(0)); + f.instruction(&f64_const(orig_idx as f64)); + f.instruction(&Ins::F64Store(memarg(pair_val_addr + 8))); + } + + // stable_sort(scratch, size, ascending) + f.instruction(&Ins::I32Const(ctx.vector_scratch_base as i32)); + f.instruction(&Ins::I32Const(size as i32)); + f.instruction(&Ins::LocalGet(ascending)); + f.instruction(&Ins::Call(ctx.helpers.stable_sort)); + + // Scatter by ORIGINAL position: for each rank, orig_idx = pair[rank].idx + // (runtime); temp[temp_off + orig_idx] = rank + 1. The destination slot is + // runtime-indexed (it depends on the sorted permutation), so the dynamic + // address part is `orig_idx * 8` and the constant `temp_storage_base + + // temp_off*8` rides in the `memarg.offset`. 
f64.store wants + // [addr_i32, value_f64], so push the address first, then `rank + 1`. + let temp_base_byte = + u64::from(ctx.temp_storage_base) + (temp_off as u64) * u64::from(SLOT_SIZE); + for rank in 0..size { + let pair_idx_addr = scratch + (rank as u64) * (PAIR_BYTES as u64) + 8; + // dynamic addr = orig_idx * 8, where orig_idx = trunc(pair[rank].idx). + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::F64Load(memarg(pair_idx_addr))); + f.instruction(&Ins::I32TruncSatF64S); + f.instruction(&Ins::I32Const(SLOT_SIZE as i32)); + f.instruction(&Ins::I32Mul); + // value = rank + 1 (1-based) + f.instruction(&f64_const((rank + 1) as f64)); + f.instruction(&Ins::F64Store(memarg(temp_base_byte))); + } + Ok(()) +} + +/// Pop the `direction` operand off the wasm stack (the VM does `.round() as +/// i32`), compute `ascending = (round(direction) == 1) as i32`, and store it in +/// `ascending_local`. Shared by `VectorSortOrder`/`Rank`. +fn pop_direction_to_ascending(ascending_local: u32, ctx: &EmitCtx, f: &mut Function) { + // The round's two f64 temps (`scratch_local` + `apply_locals[0]`) are both + // free here -- nothing survives across this direction pop. + emit_round_half_away(f, ctx.scratch_local, ctx.apply_locals[0]); + f.instruction(&Ins::I32TruncSatF64S); + f.instruction(&Ins::I32Const(1)); + f.instruction(&Ins::I32Eq); + f.instruction(&Ins::LocalSet(ascending_local)); +} + +// ── LookupArray (vm.rs:2586-2629) ──────────────────────────────────────────── + +/// Lower `LookupArray { base_gf, table_count, mode, write_temp_id }`, mirroring +/// `vm.rs:2586-2629`. The shared `index` is popped; `input_view = top`. For each +/// element `i`, `elem_off = flat_offset(indices)` (compile-time); if `elem_off >= +/// table_count` the result is NaN, else the GF directory entry at `base_gf + +/// elem_off` is read and the mode's Phase-3 helper (`lookup_interp`/`forward`/ +/// `backward`) is `call`ed at `index`. Written to `temp[temp_off + i]` (sequential +/// index). 
An invalid input view fills the temp with NaN. +/// +/// Each element's `elem_off` is compile-time, so the bound check, the GF +/// directory entry address, and the mode dispatch all resolve at compile time; +/// only the `index` and the `lookup_*` evaluation are runtime. Unrolled over the +/// view size (the caller charges the unroll budget). +pub(crate) fn emit_lookup_array( + input_view: &ViewDesc, + base_gf: GraphicalFunctionId, + table_count: u16, + mode: LookupMode, + write_temp_id: u8, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + // Pop `index` to a scratch f64 (read once per element). Done before the gate + // so both gate arms are operand-balanced. A dynamically-subscripted input is + // handled by the gate (invalid -> fill_temp_nan) and `emit_view_element_load`. + let index = ctx.scratch_local; + f.instruction(&Ins::LocalSet(index)); + + emit_with_validity_gate(&[input_view], write_temp_id, ctx, f, |ctx, f| { + emit_lookup_array_body( + input_view, + base_gf, + table_count, + mode, + write_temp_id, + index, + ctx, + f, + ) + }) +} + +/// The valid-input body of [`emit_lookup_array`]. +#[allow(clippy::too_many_arguments)] +fn emit_lookup_array_body( + input_view: &ViewDesc, + base_gf: GraphicalFunctionId, + table_count: u16, + mode: LookupMode, + write_temp_id: u8, + index: u32, + ctx: &EmitCtx, + f: &mut Function, +) -> Result<(), WasmGenError> { + let helper_idx = match mode { + LookupMode::Interpolate => ctx.helpers.lookup_interp, + LookupMode::Forward => ctx.helpers.lookup_forward, + LookupMode::Backward => ctx.helpers.lookup_backward, + }; + let size = input_view.size(); + for i in 0..size { + // elem_off (compile-time) = flat offset of element i over the view. 
+ let elem_off = input_view.flat_element_offset(i); + let temp_addr = temp_element_byte_addr(ctx, write_temp_id, i as u32)?; + f.instruction(&Ins::I32Const(0)); // temp store dynamic addr (const base) + + if elem_off >= table_count as usize { + // Out-of-range element offset -> NaN (matching the scalar Lookup + // bound; vm.rs:2615). + f.instruction(&f64_const(f64::NAN)); + } else { + // table_idx = base_gf + elem_off (compile-time). Read (data_off, + // count) from the GF directory at gf_directory_base + table_idx*8, + // then call the mode's helper at `index`. + let dir_addr = u64::from(ctx.gf_directory_base) + + (base_gf as u64 + elem_off as u64) * (GF_DIRECTORY_ENTRY_BYTES as u64); + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::I32Load(i32_memarg(dir_addr))); // data_off + f.instruction(&Ins::I32Const(0)); + f.instruction(&Ins::I32Load(i32_memarg(dir_addr + 4))); // count + f.instruction(&Ins::LocalGet(index)); + f.instruction(&Ins::Call(helper_idx)); + } + f.instruction(&Ins::F64Store(memarg(temp_addr))); + } + Ok(()) +} + +// ── validity gate ──────────────────────────────────────────────────────────── + +/// Emit `body` for the temp-writing vector ops, gated on the VM's "`!is_valid` +/// -> fill_temp_nan" short-circuit. When no input view carries a runtime validity +/// flag (the common static/temp/full-var case), `body` is emitted directly with +/// no runtime check. Otherwise: `if all_valid { body } else { fill_temp_nan }`. 
+fn emit_with_validity_gate( + views: &[&ViewDesc], + write_temp_id: u8, + ctx: &EmitCtx, + f: &mut Function, + body: impl FnOnce(&EmitCtx, &mut Function) -> Result<(), WasmGenError>, +) -> Result<(), WasmGenError> { + let any_dynamic = views.iter().any(|v| v.valid_local.is_some()); + if !any_dynamic { + return body(ctx, f); + } + push_all_valid(views, f); + f.instruction(&Ins::If(BlockType::Empty)); + body(ctx, f)?; + f.instruction(&Ins::Else); + emit_fill_temp_nan(ctx, write_temp_id, f)?; + f.instruction(&Ins::End); + Ok(()) +} diff --git a/src/simlin-engine/src/wasmgen/views.rs b/src/simlin-engine/src/wasmgen/views.rs new file mode 100644 index 000000000..8aef2b9cd --- /dev/null +++ b/src/simlin-engine/src/wasmgen/views.rs @@ -0,0 +1,811 @@ +// Copyright 2026 The Simlin Authors. All rights reserved. +// Use of this source code is governed by the Apache License, +// Version 2.0, that can be found in the LICENSE file. + +// pattern: Functional Core +// Pure compile-time model of the VM's runtime `view_stack`. No I/O; the only +// state is the `Vec` the emitter threads through `emit_bytecode`. + +//! Compile-time view descriptors -- the wasm backend's analogue of the VM's +//! runtime `view_stack` (`crate::vm`). +//! +//! The VM resolves every array access through a runtime stack of [`RuntimeView`]s +//! built and transformed by the `Push*View` / `View*` opcodes. Because every +//! static view's geometry (base offset, dims, strides, offset, sparsity, +//! is_temp) is known at compile time, the wasm emitter maintains a *compile-time* +//! stack of [`ViewDesc`]s instead, mirroring the static parts of `RuntimeView` +//! field-for-field and reproducing the `RuntimeView::apply_*` transforms in +//! `apply_*` here. Element addressing then routes through a single source of +//! truth -- [`ViewDesc::element_addr`] -- so Tasks 2-4 and Phase 6 all address +//! elements identically to the VM's `flat_offset` / `offset_for_iter_index`. +//! +//! 
[`RuntimeView`]: crate::bytecode::RuntimeView + +use crate::bytecode::{ByteCodeContext, StaticArrayView}; + +/// Where a view's base address lives, mirroring how the VM resolves the base of +/// a `RuntimeView` element read (`reduce_view` in `vm.rs`). +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub(crate) enum ViewBase { + /// `curr[base_off + ..]` at an *absolute* slot base. This is what + /// `PushStaticView` produces: `StaticArrayView::to_runtime_view` copies + /// `base_off` verbatim (no `module_off` added), so the byte address is + /// `curr_base + (base_off + flat) * 8` with no runtime addend. + CurrAbsolute, + /// `curr[module_off + base_off + ..]`. `PushVarView` / `PushVarViewDirect` + /// fold the runtime `module_off` into the base (`vm.rs:1749` / `1784`), so a + /// read adds `module_off * 8` to the constant address. In the current + /// single-root scope `module_off == 0`, but the distinction is preserved so + /// Phase 7 can thread a real `module_off` without changing addressing. + CurrModuleRelative, + /// `temp_storage[temp_offsets[base_off] + ..]` (`is_temp`): the base is a + /// temp id, resolved against the `temp_storage` region via `temp_offsets`. + Temp, +} + +/// A single sparse-dimension mapping, mirroring +/// [`crate::bytecode::RuntimeSparseMapping`]: the view's index along +/// `dim_index` is remapped through `parent_offsets` before being multiplied by +/// the stride (`RuntimeView::flat_offset`). +#[derive(Clone, PartialEq, Eq, Debug)] +pub(crate) struct SparseDim { + pub dim_index: usize, + pub parent_offsets: Vec<u16>, +} + +/// Compile-time mirror of the static parts of [`crate::bytecode::RuntimeView`]. +/// +/// Holds exactly the geometry needed to compute an element's byte address: +/// `base` (where the storage lives), `dims`/`strides`/`offset`/`sparse` (the +/// flat-offset arithmetic), and `dim_ids` (broadcast matching, used by Phase 5 +/// Task 3's iteration).
`runtime_off_local` / `valid_local` are `None` for every +/// static view; Task 4's dynamic subscripts set them to wasm locals carrying a +/// runtime offset addend and a validity flag. +#[derive(Clone, PartialEq, Debug)] +pub(crate) struct ViewDesc { + /// Base slot offset (in `curr`) or temp id (when `base == Temp`). + pub base_off: u32, + pub base: ViewBase, + /// Dimension sizes (`size() == product`). + pub dims: Vec<u16>, + /// Per-dimension strides (signed: a transposed view has non-row-major, + /// still-positive strides; the sign supports future reversed views). + pub strides: Vec<i32>, + /// Starting flat offset within the base array (folds in collapsed subscripts + /// and range starts). + pub offset: u32, + /// Sparse dimension mappings (empty unless a star-range was applied). + pub sparse: Vec<SparseDim>, + /// Dimension IDs, for broadcast matching during iteration (Task 3). + pub dim_ids: Vec<u16>, + /// wasm i32 local holding a runtime offset addend (dynamic subscript, Task + /// 4). `None` for static views. + pub runtime_off_local: Option<u32>, + /// wasm i32 local that is 0 when the view is invalid (out-of-bounds dynamic + /// subscript, Task 4). `None` for static views (always valid). + pub valid_local: Option<u32>, +} + +impl ViewDesc { + /// Build a `ViewDesc` from a baked [`StaticArrayView`] (`PushStaticView`). + /// + /// `StaticArrayView::to_runtime_view` copies `base_off` verbatim with no + /// `module_off`, so the base is [`ViewBase::CurrAbsolute`] for a variable + /// view and [`ViewBase::Temp`] when `is_temp`.
+ pub fn from_static(view: &StaticArrayView) -> Self { + ViewDesc { + base_off: view.base_off, + base: if view.is_temp { + ViewBase::Temp + } else { + ViewBase::CurrAbsolute + }, + dims: view.dims.to_vec(), + strides: view.strides.to_vec(), + offset: view.offset, + sparse: view + .sparse + .iter() + .map(|s| SparseDim { + dim_index: s.dim_index as usize, + parent_offsets: s.parent_offsets.to_vec(), + }) + .collect(), + dim_ids: view.dim_ids.to_vec(), + runtime_off_local: None, + valid_local: None, + } + } + + /// Build a contiguous view over a full variable/temp array from a dim-list + /// (the `(n_dims, sizes)` for `PushVarViewDirect`, or dim sizes resolved + /// from `ctx.dimensions` for `PushVarView`/`PushTempView`). Strides are + /// row-major, built right-to-left, exactly as `RuntimeView::for_var`. + pub fn contiguous(base_off: u32, base: ViewBase, dims: Vec<u16>, dim_ids: Vec<u16>) -> Self { + let mut strides = Vec::with_capacity(dims.len()); + let mut stride = 1i32; + for &d in dims.iter().rev() { + strides.push(stride); + stride *= d as i32; + } + strides.reverse(); + ViewDesc { + base_off, + base, + dims, + strides, + offset: 0, + sparse: Vec::new(), + dim_ids, + runtime_off_local: None, + valid_local: None, + } + } + + /// `size() == product of dims` (`RuntimeView::size`). A scalar view (no + /// dims) has size 1. The array reducer (Task 2) bounds its unrolled fold by + /// this. + pub fn size(&self) -> usize { + self.dims.iter().map(|&d| d as usize).product() + } + + /// Whether the view is contiguous: offset 0, no sparse mappings, and + /// row-major strides (`RuntimeView::is_contiguous`).
+ pub fn is_contiguous(&self) -> bool { + if self.offset != 0 || !self.sparse.is_empty() { + return false; + } + let mut expected = 1i32; + for i in (0..self.dims.len()).rev() { + if self.strides[i] != expected { + return false; + } + expected *= self.dims[i] as i32; + } + true + } + + /// Apply a single-element subscript at `dim_idx` (0-based index), dropping + /// that dimension. Exactly mirrors `RuntimeView::apply_single_subscript`: + /// a sparse dim's index is first remapped through `parent_offsets` (and the + /// mapping removed), the resolved index is folded into `offset`, the + /// dimension is removed, and later sparse mappings shift down by one. + pub fn apply_single_subscript(&mut self, dim_idx: usize, index: u16) { + let actual_index = + if let Some(pos) = self.sparse.iter().position(|s| s.dim_index == dim_idx) { + let parent_idx = self.sparse[pos].parent_offsets[index as usize]; + self.sparse.remove(pos); + parent_idx + } else { + index + }; + + self.offset += actual_index as u32 * self.strides[dim_idx] as u32; + + self.dims.remove(dim_idx); + self.strides.remove(dim_idx); + self.dim_ids.remove(dim_idx); + + for s in &mut self.sparse { + if s.dim_index > dim_idx { + s.dim_index -= 1; + } + } + } + + /// Remove `dim_idx` for a *dynamic* single subscript (Task 4): drop the + /// dimension/stride/dim_id and return that dimension's stride, leaving the + /// (runtime) offset contribution to the caller's `runtime_off_local` rather + /// than the compile-time `offset`. This is the runtime-index analogue of + /// `apply_single_subscript`: the *shape* change (which dim is collapsed) is + /// compile-time, only the offset addend is runtime. + /// + /// Returns `None` if the dim is out of range or sparse. 
A sparse dynamic + /// subscript would need a runtime `parent_offsets` table lookup, but the + /// dynamic-subscript base (`PushVarViewDirect`) is always dense, so this + /// never arises in practice; rejecting it keeps a wrong module from being + /// emitted if it ever did. + pub fn apply_single_subscript_dynamic(&mut self, dim_idx: usize) -> Option<i32> { + if dim_idx >= self.dims.len() { + return None; + } + if self.sparse.iter().any(|s| s.dim_index == dim_idx) { + return None; + } + let stride = self.strides[dim_idx]; + self.dims.remove(dim_idx); + self.strides.remove(dim_idx); + self.dim_ids.remove(dim_idx); + for s in &mut self.sparse { + if s.dim_index > dim_idx { + s.dim_index -= 1; + } + } + Some(stride) + } + + /// The stride of `dim_idx` (for a dynamic subscript's runtime offset + /// computation), or `None` if out of range. + pub fn stride_at(&self, dim_idx: usize) -> Option<i32> { + self.strides.get(dim_idx).copied() + } + + /// The size of `dim_idx` (the bound a dynamic subscript range-checks + /// against), or `None` if out of range. + pub fn dim_at(&self, dim_idx: usize) -> Option<u16> { + self.dims.get(dim_idx).copied() + } + + /// Apply a `[start:end)` range (0-based) to `dim_idx` + /// (`RuntimeView::apply_range`): fold the start into `offset` and shrink the + /// dimension to `end - start`. + pub fn apply_range(&mut self, dim_idx: usize, start: u16, end: u16) { + self.offset += start as u32 * self.strides[dim_idx] as u32; + self.dims[dim_idx] = end - start; + } + + /// Apply a star-range (sparse) at `dim_idx` + /// (`RuntimeView::apply_sparse_with_dim_id`): the dimension's size becomes + /// the number of parent offsets, a sparse mapping is recorded, and the + /// dim id is relabeled to the subdimension for broadcast matching.
+ pub fn apply_sparse(&mut self, dim_idx: usize, parent_offsets: Vec<u16>, new_dim_id: u16) { + self.dims[dim_idx] = parent_offsets.len() as u16; + self.sparse.push(SparseDim { + dim_index: dim_idx, + parent_offsets, + }); + self.dim_ids[dim_idx] = new_dim_id; + } + + /// Transpose the view (`RuntimeView::transpose`): reverse dims/strides/ + /// dim_ids and renumber the sparse `dim_index`es to `n-1-dim_index`. + pub fn transpose(&mut self) { + self.dims.reverse(); + self.strides.reverse(); + self.dim_ids.reverse(); + let n = self.dims.len(); + for s in &mut self.sparse { + s.dim_index = n - 1 - s.dim_index; + } + } + + /// The flat element offset (within the base array, in slots) for a flat + /// iteration index `iter_idx in 0..size()`. Mirrors + /// `RuntimeView::offset_for_iter_index` + `flat_offset`: contiguous views + /// short-circuit to `offset + iter_idx`; otherwise the flat index is + /// decomposed into row-major multi-dim indices and each (sparse-remapped) + /// index multiplied by its stride. + pub fn flat_element_offset(&self, iter_idx: usize) -> usize { + if self.dims.is_empty() { + return self.offset as usize; + } + if self.is_contiguous() { + return self.offset as usize + iter_idx; + } + + // Decompose iter_idx into per-dimension indices (last dim varies fastest).
+ let n = self.dims.len(); + let mut indices = vec![0u16; n]; + let mut remaining = iter_idx; + for d in (0..n).rev() { + let dim = self.dims[d] as usize; + indices[d] = (remaining % dim) as u16; + remaining /= dim; + } + + let mut flat = self.offset as usize; + for (i, &idx) in indices.iter().enumerate() { + let actual = if let Some(s) = self.sparse.iter().find(|s| s.dim_index == i) { + s.parent_offsets[idx as usize] as usize + } else { + idx as usize + }; + flat += actual * self.strides[i] as usize; + } + flat + } + + /// The flat element offset (in slots) for an explicit multi-dimensional + /// index, mirroring `RuntimeView::flat_offset`: `offset + Σ idx_k * + /// strides[k]`, with a sparse dimension's index first remapped through its + /// `parent_offsets`. The broadcast paths below build the multi-dim index + /// themselves (rather than from a flat iteration index), so they route + /// through this rather than [`flat_element_offset`](Self::flat_element_offset). + pub fn flat_offset_for_indices(&self, indices: &[u16]) -> usize { + let mut flat = self.offset as usize; + for (i, &idx) in indices.iter().enumerate() { + let actual = if let Some(s) = self.sparse.iter().find(|s| s.dim_index == i) { + s.parent_offsets[idx as usize] as usize + } else { + idx as usize + }; + flat += actual * self.strides[i] as usize; + } + flat + } + + /// Decompose a flat iteration index into per-dimension indices in row-major + /// order (last dim varies fastest), mirroring the VM's iteration-index + /// decomposition in `LoadIterViewTop` / `reduce_view` / `increment_indices`. + /// + /// Shared with `vector.rs` (VectorElmMap's sliced-source projection walks the + /// same row-major order), so it is `pub(crate)` rather than private. 
+ pub(crate) fn decompose_iter_index(dims: &[u16], iter_idx: usize) -> Vec<u16> { + let n = dims.len(); + let mut indices = vec![0u16; n]; + let mut remaining = iter_idx; + for d in (0..n).rev() { + let dim = dims[d] as usize; + indices[d] = (remaining % dim) as u16; + remaining /= dim; + } + indices + } + + /// The flat element offset (in slots) for reading `self` as the *source* of + /// an iteration whose output geometry is `iter` at flat index `current`, + /// reproducing the VM's `LoadIterViewTop` / `LoadIterViewAt` broadcast + /// (`vm.rs:1946-2182`). Returns `None` when the VM would push NaN: a smaller + /// source than the iteration, or a dimension that does not match. + /// + /// Fast path (source dims/dim_ids equal the iteration's): the simple + /// `offset_for_iter_index(current)` read, bounds-checked against the source + /// size. Otherwise the broadcast path decomposes `current` into the + /// iteration's multi-dim indices, matches dimensions through + /// [`crate::dimensions::match_dimensions_two_pass`] (exact dim-id match, then + /// the indexed size-fallback), and rebuilds the source indices (bounds-checked + /// per dimension). `is_indexed` for each dim comes from `ctx.dimensions`, + /// exactly as the VM resolves it. + pub fn iter_broadcast_offset( + &self, + iter: &ViewDesc, + current: usize, + ctx: &ByteCodeContext, + ) -> Option<usize> { + // Fast path: dims and dim_ids match exactly -> direct iteration-index read + // (with the VM's "source smaller than iteration -> NaN" bounds check). + if self.dims == iter.dims && self.dim_ids == iter.dim_ids { + if current >= self.size() { + return None; + } + return Some(self.flat_element_offset(current)); + } + + // Broadcast path: decompose `current` into the iteration's indices, then + // map each source dimension to an iteration dimension.
+ let iter_indices = Self::decompose_iter_index(&iter.dims, current); + + let dim_indexed = |dim_ids: &[u16]| -> Vec<bool> { + dim_ids + .iter() + .map(|&dim_id| { + ctx.dimensions + .get(dim_id as usize) + .is_some_and(|d| d.is_indexed) + }) + .collect() + }; + let source_is_indexed = dim_indexed(&self.dim_ids); + let iter_is_indexed = dim_indexed(&iter.dim_ids); + + let source_to_iter = crate::dimensions::match_dimensions_two_pass( + &self.dim_ids, + &self.dims, + &source_is_indexed, + &iter.dim_ids, + &iter.dims, + &iter_is_indexed, + ); + + let mut source_indices: Vec<u16> = Vec::with_capacity(self.dims.len()); + for (src_dim_pos, mapped_iter_pos) in source_to_iter.iter().enumerate() { + let iter_pos = (*mapped_iter_pos)?; + let idx = iter_indices[iter_pos]; + if idx >= self.dims[src_dim_pos] { + return None; + } + source_indices.push(idx); + } + Some(self.flat_offset_for_indices(&source_indices)) + } + + /// The byte address of view element `iter_idx`, decomposed into the constant + /// part (which rides in a `memarg.offset`) and whether a runtime `module_off` + /// addend is still required. This is the single source of truth for element + /// addressing -- the unrolled reducer (Task 2), the iteration loop (Task 3), + /// and Phase 6 all route through it. + /// + /// - `CurrAbsolute`: `const = curr_base + (base_off + flat) * 8`, + /// `module_relative = false` (static views bake `module_off` in already). + /// - `Temp`: `const = temp_storage_base + (temp_offsets[base_off] + flat)*8`, + /// `module_relative = false`. + /// - `CurrModuleRelative`: `const = curr_base + (base_off + flat) * 8`, + /// `module_relative = true` (the caller adds `module_off * 8`). The VM + /// folds `module_off` into the base at `PushVarView` time (`vm.rs:1749`); + /// in the single-root scope `module_off == 0`, so the read is the same as + /// `CurrAbsolute` today, but the flag keeps Phase 7 correct.
+ /// + /// A dynamically-subscripted view (`runtime_off_local` set, Task 4) carries + /// the runtime addend + validity flag in the returned [`ElementAddr`]; static + /// views leave both `None`, so the address is fully constant. + pub fn element_addr( + &self, + iter_idx: usize, + curr_base: u32, + temp_storage_base: u32, + ctx: &ByteCodeContext, + ) -> Option<ElementAddr> { + let flat = self.flat_element_offset(iter_idx); + self.element_addr_for_flat(flat, curr_base, temp_storage_base, ctx) + } + + /// Like [`element_addr`](Self::element_addr) but for an *already-computed* + /// flat slot offset (the broadcast paths build the flat offset themselves via + /// [`flat_offset_for_indices`](Self::flat_offset_for_indices), rather than + /// from an iteration index). Static-view behaviour is byte-identical to + /// `element_addr` for the same flat offset (both `runtime_off_local` / + /// `valid_local` are `None`). + pub fn element_addr_for_flat( + &self, + flat: usize, + curr_base: u32, + temp_storage_base: u32, + ctx: &ByteCodeContext, + ) -> Option<ElementAddr> { + let flat = flat as u64; + let (const_byte_offset, module_relative) = match self.base { + ViewBase::CurrAbsolute => ( + u64::from(curr_base) + (u64::from(self.base_off) + flat) * 8, + false, + ), + ViewBase::CurrModuleRelative => ( + u64::from(curr_base) + (u64::from(self.base_off) + flat) * 8, + true, + ), + ViewBase::Temp => { + let temp_off = *ctx.temp_offsets.get(self.base_off as usize)? as u64; + (u64::from(temp_storage_base) + (temp_off + flat) * 8, false) + } + }; + Some(ElementAddr { + const_byte_offset, + module_relative, + runtime_off_local: self.runtime_off_local, + valid_local: self.valid_local, + }) + } +} + +/// The byte address of a view element, split into the compile-time-constant part +/// (a `memarg.offset`) and the runtime addends a dynamic subscript (Task 4) +/// requires. Returned by [`ViewDesc::element_addr`]. +/// +/// `module_relative` adds `module_off * 8` (var views; 0 in the single-root +/// scope).
`runtime_off_local` (when `Some`) adds that i32 local's slot offset +/// times 8 (a dynamic subscript's accumulated `(index-1)*stride`). +/// `valid_local` (when `Some`) gates the load: 0 means out of bounds, so the +/// read yields NaN rather than touching memory. Both are `None` for a static +/// view, leaving the address fully constant. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub(crate) struct ElementAddr { + pub const_byte_offset: u64, + pub module_relative: bool, + pub runtime_off_local: Option<u32>, + pub valid_local: Option<u32>, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::bytecode::{RuntimeSparseMapping, RuntimeView}; + use smallvec::SmallVec; + + /// Build the VM `RuntimeView` equivalent of a `ViewDesc` so the two + /// addressing implementations can be cross-checked. Validity/runtime locals + /// are not part of the geometry, so a static-shaped `ViewDesc` maps directly. + fn to_runtime_view(d: &ViewDesc) -> RuntimeView { + RuntimeView { + base_off: d.base_off, + is_temp: matches!(d.base, ViewBase::Temp), + dims: SmallVec::from_slice(&d.dims), + strides: SmallVec::from_slice(&d.strides), + offset: d.offset, + sparse: d + .sparse + .iter() + .map(|s| RuntimeSparseMapping { + dim_index: s.dim_index as u8, + parent_offsets: SmallVec::from_slice(&s.parent_offsets), + }) + .collect(), + dim_ids: SmallVec::from_slice(&d.dim_ids), + is_valid: true, + } + } + + /// Assert `ViewDesc::flat_element_offset` agrees with the VM's + /// `RuntimeView::offset_for_iter_index` for every element of the view -- the + /// addressing oracle Task 1 must match.
+ fn assert_flat_matches_vm(d: &ViewDesc) { + let rv = to_runtime_view(d); + assert_eq!(d.size(), rv.size(), "size mismatch"); + assert_eq!(d.is_contiguous(), rv.is_contiguous(), "contiguity mismatch"); + for i in 0..d.size() { + assert_eq!( + d.flat_element_offset(i), + rv.offset_for_iter_index(i), + "flat offset mismatch at element {i}" + ); + } + } + + fn dense(base_off: u32, dims: &[u16]) -> ViewDesc { + ViewDesc::contiguous( + base_off, + ViewBase::CurrAbsolute, + dims.to_vec(), + vec![0u16; dims.len()], + ) + } + + #[test] + fn contiguous_1d_addresses_match_vm() { + assert_flat_matches_vm(&dense(0, &[5])); + assert_flat_matches_vm(&dense(7, &[5])); + } + + #[test] + fn contiguous_2d_addresses_match_vm() { + assert_flat_matches_vm(&dense(0, &[2, 3])); + assert_flat_matches_vm(&dense(0, &[3, 4])); + } + + #[test] + fn subscript_const_drops_dim_like_vm() { + // 2x3 matrix; subscript dim 0 to index 1 -> a 1-D row at offset 3. + let mut d = dense(0, &[2, 3]); + let mut rv = to_runtime_view(&d); + d.apply_single_subscript(0, 1); + rv.apply_single_subscript(0, 1); + assert_eq!(d.offset, rv.offset); + assert_eq!(d.dims.as_slice(), rv.dims.as_slice()); + assert_eq!(d.strides.as_slice(), rv.strides.as_slice()); + assert_flat_matches_vm(&d); + } + + #[test] + fn range_matches_vm() { + // [1:4) of a 5-element dim: offset 1, dim 3. + let mut d = dense(0, &[5]); + d.apply_range(0, 1, 4); + assert_eq!(d.offset, 1); + assert_eq!(d.dims, vec![3]); + assert_flat_matches_vm(&d); + } + + #[test] + fn transpose_matches_vm() { + let mut d = dense(0, &[2, 3]); + let mut rv = to_runtime_view(&d); + d.transpose(); + rv.transpose(); + assert_eq!(d.dims.as_slice(), rv.dims.as_slice()); + assert_eq!(d.strides.as_slice(), rv.strides.as_slice()); + assert!( + !d.is_contiguous(), + "a transposed 2x3 view is non-contiguous" + ); + assert_flat_matches_vm(&d); + } + + #[test] + fn star_range_sparse_matches_vm() { + // A 1-D dim of 4, star-ranged to parent offsets [1, 3]. 
+ let mut d = dense(0, &[4]); + let mut rv = to_runtime_view(&d); + d.apply_sparse(0, vec![1, 3], 1); + rv.apply_sparse_with_dim_id(0, SmallVec::from_slice(&[1, 3]), 1); + assert_eq!(d.dims, vec![2]); + assert_flat_matches_vm(&d); + // The two selected elements map to parent flat offsets 1 and 3. + assert_eq!(d.flat_element_offset(0), 1); + assert_eq!(d.flat_element_offset(1), 3); + } + + #[test] + fn subscript_then_renumbers_sparse_like_vm() { + // A 2-D view [3,4] with a sparse mapping on dim 1; subscript dim 0 must + // shift the sparse dim_index down to 0, matching the VM. + let mut d = dense(0, &[3, 4]); + d.apply_sparse(1, vec![0, 2], 5); // sparse on dim 1 -> dim 1 size 2 + let mut rv = to_runtime_view(&d); + d.apply_single_subscript(0, 1); + rv.apply_single_subscript(0, 1); + assert_eq!(d.sparse.len(), 1); + assert_eq!(d.sparse[0].dim_index, rv.sparse[0].dim_index as usize); + assert_flat_matches_vm(&d); + } + + #[test] + fn element_addr_curr_absolute_const() { + let d = dense(2, &[3]); + let ctx = ByteCodeContext::default(); + // element 1 at curr_base=0: (base_off 2 + flat 1) * 8 = 24. + let a = d.element_addr(1, 0, 0, &ctx).unwrap(); + assert_eq!(a.const_byte_offset, 24); + assert!(!a.module_relative); + // A static view carries no runtime addend or validity gate. 
+ assert_eq!(a.runtime_off_local, None); + assert_eq!(a.valid_local, None); + } + + #[test] + fn element_addr_curr_module_relative_flag() { + let d = ViewDesc::contiguous(2, ViewBase::CurrModuleRelative, vec![3], vec![0]); + let ctx = ByteCodeContext::default(); + let a = d.element_addr(1, 0, 0, &ctx).unwrap(); + assert_eq!(a.const_byte_offset, 24); + assert!( + a.module_relative, + "var views carry a runtime module_off addend" + ); + } + + #[test] + fn element_addr_temp_uses_offset_table() { + let mut ctx = ByteCodeContext::default(); + ctx.set_temp_info(vec![0, 4], 8); + let d = ViewDesc::contiguous(1, ViewBase::Temp, vec![2], vec![0]); + // temp_storage_base = 1000; temp 1 offset = 4; element 1 -> (4+1)*8 = 40. + let a = d.element_addr(1, 0, 1000, &ctx).unwrap(); + assert_eq!(a.const_byte_offset, 1000 + 40); + assert!(!a.module_relative); + } + + #[test] + fn element_addr_dynamic_view_carries_runtime_locals() { + // A view with a runtime offset addend + validity flag (Task 4) returns + // the constant base plus the locals the caller must add/guard. + let mut d = dense(0, &[3]); + d.runtime_off_local = Some(9); + d.valid_local = Some(7); + let ctx = ByteCodeContext::default(); + let a = d.element_addr(0, 0, 0, &ctx).unwrap(); + // Element 0: const base is just curr_base + base_off*8 = 0; the runtime + // index offset rides in local 9, the validity in local 7. + assert_eq!(a.const_byte_offset, 0); + assert_eq!(a.runtime_off_local, Some(9)); + assert_eq!(a.valid_local, Some(7)); + } + + // ── iter_broadcast_offset (Task 3): cross-check against the VM ───────── + + /// A `ByteCodeContext` whose dimension table makes the dims with the given + /// ids indexed (so `match_dimensions_two_pass`'s size-fallback can fire), all + /// of `size`. Used only so `iter_broadcast_offset` can resolve `is_indexed`. 
+ fn ctx_indexed_dims(n: usize, size: u16) -> ByteCodeContext { + let mut ctx = ByteCodeContext::default(); + for _ in 0..n { + let nid = ctx.intern_name("D"); + ctx.add_dimension(crate::bytecode::DimensionInfo::indexed(nid, size)); + } + ctx + } + + /// Build a `ViewDesc` with explicit dims/dim_ids (row-major contiguous). + fn view_with_dim_ids(dims: &[u16], dim_ids: &[u16]) -> ViewDesc { + ViewDesc::contiguous(0, ViewBase::CurrAbsolute, dims.to_vec(), dim_ids.to_vec()) + } + + #[test] + fn iter_broadcast_offset_matches_fast_path() { + // Source dims == iter dims: every element reads its own offset. + let ctx = ctx_indexed_dims(2, 3); + let iter = view_with_dim_ids(&[2, 3], &[0, 1]); + let src = view_with_dim_ids(&[2, 3], &[0, 1]); + for current in 0..iter.size() { + assert_eq!( + src.iter_broadcast_offset(&iter, current, &ctx), + Some(current), + "fast-path element {current}" + ); + } + } + + #[test] + fn iter_broadcast_offset_broadcasts_smaller_source() { + // iter is 2-D [DimA(2), DimB(3)]; source is 1-D [DimA(2)] (dim_id 0). The + // VM broadcasts the source along the missing DimB, so result element + // (a, b) reads source[a]. dim_ids: iter [0,1], source [0]. + let ctx = ctx_indexed_dims(2, 3); + let iter = view_with_dim_ids(&[2, 3], &[0, 1]); + let src = view_with_dim_ids(&[2], &[0]); + for a in 0..2u16 { + for b in 0..3u16 { + let current = (a as usize) * 3 + b as usize; + // Result element (a,b) -> source index [a] -> flat offset a. + assert_eq!( + src.iter_broadcast_offset(&iter, current, &ctx), + Some(a as usize), + "broadcast element ({a},{b})" + ); + } + } + } + + #[test] + fn iter_broadcast_offset_smaller_source_same_shape_is_nan() { + // Same dims/dim_ids fast path, but the source is genuinely shorter than + // the iteration: the VM returns NaN past the source size. 
+ let ctx = ctx_indexed_dims(1, 5); + let iter = view_with_dim_ids(&[5], &[0]); + let src = view_with_dim_ids(&[3], &[0]); + assert_eq!(src.iter_broadcast_offset(&iter, 2, &ctx), Some(2)); + assert_eq!( + src.iter_broadcast_offset(&iter, 3, &ctx), + None, + "element past the source size must be NaN" + ); + } + + #[test] + fn iter_broadcast_offset_unmatched_dim_is_nan() { + // Source dim_id 7 has no counterpart in the iteration (dim_ids [0,1]) and + // is named (not indexed), so the size-fallback cannot match it either: + // the VM returns NaN. + let mut ctx = ByteCodeContext::default(); + let n0 = ctx.intern_name("A"); + ctx.add_dimension(crate::bytecode::DimensionInfo::indexed(n0, 2)); // id 0 + let n1 = ctx.intern_name("B"); + ctx.add_dimension(crate::bytecode::DimensionInfo::indexed(n1, 3)); // id 1 + // A named (non-indexed) dim id 2 used only by the source. + let n2 = ctx.intern_name("C"); + ctx.add_dimension(crate::bytecode::DimensionInfo::named( + n2, + SmallVec::from_slice(&[n0, n1]), + )); // id 2, size 2, named + let iter = view_with_dim_ids(&[2, 3], &[0, 1]); + let src = view_with_dim_ids(&[2], &[2]); + assert_eq!(src.iter_broadcast_offset(&iter, 0, &ctx), None); + } + + /// Cross-check `iter_broadcast_offset` against a from-scratch reimplementation + /// of the VM's `LoadIterViewTop` broadcast over a `RuntimeView`, for a + /// transpose-broadcast case (iter [DimA,DimB], source [DimB] -- the source's + /// single dim matches the iteration's *second* axis by dim-id). + #[test] + fn iter_broadcast_offset_matches_vm_loaditerviewtop() { + let ctx = ctx_indexed_dims(2, 0); // sizes overwritten below + // Rebuild with distinct sizes: DimA=2 (id 0), DimB=4 (id 1). 
+ let mut ctx2 = ByteCodeContext::default(); + let na = ctx2.intern_name("A"); + ctx2.add_dimension(crate::bytecode::DimensionInfo::indexed(na, 2)); + let nb = ctx2.intern_name("B"); + ctx2.add_dimension(crate::bytecode::DimensionInfo::indexed(nb, 4)); + let _ = ctx; + + let iter = view_with_dim_ids(&[2, 4], &[0, 1]); + let src = view_with_dim_ids(&[4], &[1]); // only DimB + let iter_rv = to_runtime_view(&iter); + let src_rv = to_runtime_view(&src); + + for current in 0..iter.size() { + // VM reference: decompose current into iter indices, match dims by id + // (DimB is id 1 in both), read source[that DimB index]. + let n = iter_rv.dims.len(); + let mut idx: SmallVec<[u16; 4]> = smallvec::smallvec![0; n]; + let mut rem = current; + for d in (0..n).rev() { + idx[d] = (rem % iter_rv.dims[d] as usize) as u16; + rem /= iter_rv.dims[d] as usize; + } + // DimB is iteration axis 1. + let want = src_rv.flat_offset(&[idx[1]]); + assert_eq!( + src.iter_broadcast_offset(&iter, current, &ctx2), + Some(want), + "element {current}" + ); + } + } +} diff --git a/src/simlin-engine/tests/simulate.rs b/src/simlin-engine/tests/simulate.rs index 2d9ada4ff..958d17dc8 100644 --- a/src/simlin-engine/tests/simulate.rs +++ b/src/simlin-engine/tests/simulate.rs @@ -15,7 +15,9 @@ use simlin_engine::serde::{deserialize, serialize}; use simlin_engine::{Method, Results, SimSpecs as Specs, Vm, project_io}; use simlin_engine::{load_csv, load_dat, open_vensim, open_vensim_with_data, xmile}; -use test_helpers::{ensure_results, ensure_results_excluding}; +use test_helpers::{ + WasmRunOutcome, ensure_results, ensure_results_excluding, ensure_wasm_matches, wasm_results_for, +}; const OUTPUT_FILES: &[(&str, u8)] = &[("output.csv", b','), ("output.tab", b'\t')]; @@ -100,6 +102,86 @@ static TEST_MODELS: &[&str] = &[ "test/test-models/tests/unicode_characters/unicode_test_model.xmile", ]; +/// End-state wasm parity gate (wasm-backend AC3.2 / AC3.3): EVERY corpus model +/// in `TEST_MODELS` must run through 
the wasm backend to VM parity -- zero may +/// return `WasmGenError::Unsupported`. `expected` is the VM's own output (the +/// parse + `compile_vm` + run path), so this is a direct wasm-vs-VM check +/// independent of the on-disk reference files; the per-model inline hook +/// (`wasm_parity_hook`) separately checks every model against its on-disk +/// `expected` and likewise hard-fails on `Unsupported`. +/// +/// This replaces the Phase 1-7 monotonic floor (a `ran >= FLOOR` count). The +/// backend now covers the full core-simulation surface -- scalar + every +/// `Apply` builtin + arrays/reducers/iteration + vector ops + allocation + +/// scalar/array lookups + Euler/RK2/RK4 + PREVIOUS/INIT + nested modules -- so +/// the end state is total coverage, and any regression that makes a previously +/// supported model `Unsupported` fails here (AC3.3) with the offending model and +/// reason. The genuinely out-of-scope constructs (a runtime view range +/// `arr[lo:hi]` with non-literal bounds -> `ViewRangeDynamic`, or array +/// unrolling past the per-function budget) are not reached by any `TEST_MODELS` +/// member; they are pinned by the inline `wasmgen` unit tests and +/// `ensure_wasm_matches_skips_unsupported_model`. The heavy `#[ignore]`-class +/// models (C-LEARN) have their own `#[ignore]`d wasm twins so this gate stays +/// within the default suite's 3-minute wall-clock cap. +/// +/// Iterating the full `TEST_MODELS` list under the un-JITed DLR-FT interpreter +/// stays well within that cap (the corpus is small/medium scalar/arrayed +/// models), so the gate covers the whole list rather than a subset. 
+#[test]
+fn wasm_parity_floor() {
+    // Record every model that did not run (with its reason) rather than
+    // stopping at the first, so the assertion below can list all of them.
+    let mut unsupported: Vec<(String, String)> = Vec::new();
+    for &path in TEST_MODELS {
+        let file_path = format!("../../{path}");
+        if let WasmRunOutcome::Skipped(msg) = wasm_parity_outcome_for_path(&file_path) {
+            unsupported.push((path.to_string(), msg));
+        }
+    }
+    eprintln!(
+        "wasm_parity_floor: {} of {} corpus models ran to VM parity ({} unsupported)",
+        TEST_MODELS.len() - unsupported.len(),
+        TEST_MODELS.len(),
+        unsupported.len()
+    );
+    assert!(
+        unsupported.is_empty(),
+        "wasm parity gate (AC3.2/AC3.3): every corpus model must run through the \
+         wasm backend, but {} of {} returned Unsupported -- a regression that \
+         dropped a previously-supported model, or a new feature whose lowering is \
+         missing:\n{}",
+        unsupported.len(),
+        TEST_MODELS.len(),
+        unsupported
+            .iter()
+            .map(|(p, m)| format!(" {p}: {m}"))
+            .collect::<Vec<_>>()
+            .join("\n")
+    );
+}
+
+/// Parse the XMILE/STMX model at `path`, run it through the VM for an `expected`
+/// baseline, and return whether the wasm backend reproduces it (`Ran`) or
+/// returns `Unsupported` (`Skipped`). Used only by `wasm_parity_floor`, which
+/// turns any `Skipped` into a hard failure. A file-open or parse failure is
+/// surfaced as `Skipped`, while a VM failure panics inside `vm_results` (it
+/// unwraps `Vm::new` and `run_to_end`); the VM corpus tests gate those paths
+/// directly, and this gate only checks wasm-vs-VM parity, never re-litigates VM
+/// correctness. Either way an upstream parse/VM break also trips the gate --
+/// intended, since a model that no longer VM-simulates can't establish wasm
+/// parity either.
+fn wasm_parity_outcome_for_path(path: &str) -> WasmRunOutcome {
+    // Open + parse in one scoped block; either failure becomes a `Skipped`
+    // carrying the reason, which the caller reports as a gate failure.
+    let datamodel = {
+        let Ok(f) = File::open(path) else {
+            return WasmRunOutcome::Skipped(format!("could not open {path}"));
+        };
+        let mut f = BufReader::new(f);
+        match xmile::project_from_reader(&mut f) {
+            Ok(p) => p,
+            Err(e) => return WasmRunOutcome::Skipped(format!("parse failed: {e}")),
+        }
+    };
+
+    // `vm_results` unwraps `Vm::new` and `run_to_end`, so a VM failure panics
+    // here instead of being reported as `Skipped`.
+    let expected = vm_results(&datamodel);
+    ensure_wasm_matches(&datamodel, "main", &expected, &[])
+}
+
 /// Compile a datamodel project to a VM simulation using the incremental
 /// salsa-backed path.
 fn compile_vm(
@@ -821,6 +903,80 @@ fn run_vacuous_comparison_scenarios() {
     asserts_panic("near-zero but meaningful divergence", &expected_nz, &sim_nz);
 }
 
+/// Run the named model of `datamodel` through the VM and return its
+/// `Results`, used as the `expected` baseline both the focused
+/// `ensure_wasm_matches` tests and `wasm_parity_floor` compare wasm output
+/// against. Mirrors the corpus VM path (`compile_vm` -> `Vm::new` ->
+/// `run_to_end`).
+///
+/// Panics (via `unwrap`) if the VM cannot be created or the run fails.
+fn vm_results(datamodel: &simlin_engine::datamodel::Project) -> Results {
+    let compiled = compile_vm(datamodel);
+    let mut vm = Vm::new(compiled).unwrap();
+    vm.run_to_end().unwrap();
+    vm.into_results()
+}
+
+/// AC1.1: a scalar Euler model the wasm backend supports runs through
+/// `ensure_wasm_matches` and clears the same `ensure_results` comparator the VM
+/// clears (the helper panics internally on any divergence), so the outcome is
+/// `Ran`.
+#[test]
+fn ensure_wasm_matches_runs_supported_scalar_model() {
+    // A single stock (`level`) fed by one flow whose rate is a constant aux.
+    let datamodel = simlin_engine::test_common::TestProject::new("simple")
+        .with_sim_time(0.0, 10.0, 1.0)
+        .aux("inflow_rate", "2", None)
+        .stock("level", "0", &["inflow"], &[], None)
+        .flow("inflow", "inflow_rate", None)
+        .build_datamodel();
+
+    // The VM's own output is the baseline the wasm run is compared against.
+    let expected = vm_results(&datamodel);
+    let outcome = ensure_wasm_matches(&datamodel, "main", &expected, &[]);
+    assert!(
+        matches!(outcome, WasmRunOutcome::Ran),
+        "a supported scalar model must run through the wasm backend, got {outcome:?}"
+    );
+}
+
+/// AC3.1: a model using a not-yet-supported construct is SKIPPED, not failed --
+/// `compile_simulation` returns `WasmGenError::Unsupported` and the helper
+/// surfaces it as `Skipped(msg)` carrying that message.
+///
+/// The example construct has migrated as the backend's coverage grew: `^`
+/// (`Op2::Exp`) became supported in Phase 2 Task 3, RK4 in Phase 4, and *modules*
+/// (so `SMTH1`/`DELAY3` stdlib expansions) in Phase 7. The stable still-
+/// unsupported construct is now a *true runtime range* `arr[lo:hi]` with
+/// non-literal bounds, which lowers to `Opcode::ViewRangeDynamic` -- a runtime
+/// view *size* the fully-unrolled emitter cannot express (`wasmgen.rs`'s
+/// `ViewRangeDynamic` arm returns `Unsupported`). A literal range is
+/// constant-folded into a static view, so the bounds must be variables.
+#[test]
+fn ensure_wasm_matches_skips_unsupported_model() {
+    let datamodel = simlin_engine::test_common::TestProject::new("unsupported")
+        .with_sim_time(0.0, 5.0, 1.0)
+        .indexed_dimension("A", 5)
+        .array_aux("source[A]", "A")
+        .scalar_aux("lo", "2")
+        .scalar_aux("hi", "4")
+        // SUM over a runtime range (variable bounds) -> ViewRangeDynamic, which
+        // the wasm backend cannot express (a runtime view size in a fully-
+        // unrolled emitter), so the whole model is Skipped.
+        .scalar_aux("total", "SUM(source[lo:hi])")
+        .build_datamodel();
+
+    // The VM must still simulate this model: `vm_results` unwraps, so a VM
+    // failure would panic here before the wasm outcome is even inspected.
+    let expected = vm_results(&datamodel);
+    let outcome = ensure_wasm_matches(&datamodel, "main", &expected, &[]);
+    match outcome {
+        WasmRunOutcome::Skipped(msg) => {
+            // Pin the reason, not just the variant, so an unrelated
+            // `Unsupported` cannot satisfy the test by accident.
+            assert!(
+                msg.contains("ViewRangeDynamic"),
+                "expected the runtime-range rejection message, got: {msg}"
+            );
+        }
+        WasmRunOutcome::Ran => {
+            panic!("a model using a runtime-range construct must be Skipped, not Ran")
+        }
+    }
+}
+
 type CompileFn = fn(&simlin_engine::datamodel::Project) -> simlin_engine::CompiledSimulation;
 
 fn simulate_path(xmile_path: &str) {
@@ -912,6 +1068,39 @@ fn simulate_path_with_excluding(xmile_path: &str, compile: CompileFn, excluded:
     // byte-for-byte identical (we aren't losing any information)
     let serialized_xmile2 = xmile::project_to_xmile(&roundtripped_project).unwrap();
     assert_eq!(&serialized_xmile, &serialized_xmile2);
+
+    // wasm-backend parity: after the VM comparisons pass, run the model through
+    // the wasm backend once and assert it clears the SAME comparator against the
+    // same `expected`. A supported model that diverges panics inside the helper;
+    // an `Unsupported` outcome for this VM-simulated model is now a HARD FAILURE
+    // (the corpus gate, AC3.2). See AC1.1 / AC3.2.
+    wasm_parity_hook(&datamodel_project, &expected, excluded);
+}
+
+/// Run one already-parsed, VM-simulated model through the wasm backend and
+/// assert parity. This is reached only from the `simulate_path`/`simulate_mdl`
+/// helpers, i.e. AFTER the VM has simulated the model, so a `Skipped`
+/// (`WasmGenError::Unsupported`) here means a model the VM handles is NOT
+/// covered by the wasm backend -- a hard failure (AC3.2: every core-simulation
+/// model runs through both backends). A model the VM itself cannot simulate
+/// (DELAY FIXED, GET DATA) is `#[ignore]`d and never reaches this hook, so it
+/// stays out of scope. A supported-but-divergent model panics inside
+/// `ensure_wasm_matches`.
+fn wasm_parity_hook(
+    datamodel: &simlin_engine::datamodel::Project,
+    expected: &Results,
+    excluded: &[&str],
+) {
+    // `ensure_wasm_matches` panics by itself on a divergent run, so the only
+    // outcome left to reject here is `Skipped`.
+    if let WasmRunOutcome::Skipped(msg) = ensure_wasm_matches(datamodel, "main", expected, excluded)
+    {
+        panic!(
+            "wasm parity gate: a VM-simulated model returned Unsupported from the \
+             wasm backend -- every core-simulation model must run through both \
+             backends (AC3.2). Close the lowering gap or, if this is a genuinely \
+             VM-unsupported feature, the test should be #[ignore]d so it never \
+             reaches this hook. Reason: {msg}"
+        );
+    }
+}
 
 fn load_expected_results_for_mdl(mdl_path: &str) -> Option<Results> {
@@ -957,6 +1146,8 @@ fn simulate_mdl_path(mdl_path: &str) {
     let expected = load_expected_results_for_mdl(mdl_path)
         .unwrap_or_else(|| panic!("no reference data found for {mdl_path}"));
     ensure_results(&expected, &results);
+
+    wasm_parity_hook(&datamodel_project, &expected, &[]);
 }
 
 /// Simulate a Vensim MDL file that references external data files.
@@ -987,6 +1178,8 @@ fn simulate_mdl_path_with_data(mdl_path: &str) {
     let expected = load_expected_results_for_mdl(mdl_path)
         .unwrap_or_else(|| panic!("no reference data found for {mdl_path}"));
     ensure_results(&expected, &results);
+
+    wasm_parity_hook(&datamodel_project, &expected, &[]);
 }
 
 #[test]
@@ -1714,6 +1907,48 @@ fn simulates_wrld3_03() {
     assert_eq!(vdf_results.step_count, results.step_count);
 }
 
+/// WORLD3 wasm parity twin (wasm-backend.AC1.1, heavy-model scale check): WORLD3
+/// is a large model, so its wasm blob exercises the backend well beyond the
+/// small/medium default corpus. The VM test above only smoke-checks the VDF
+/// decoder (no series comparison), so this twin asserts the wasm output matches
+/// the VM output element-for-element via `ensure_results` -- the strongest
+/// available parity check for this model (both backends consume the same
+/// `CompiledSimulation`, so any divergence is a wasm lowering bug).
A +/// `WasmGenError::Unsupported` would be a hard failure: WORLD3 is a +/// core-simulation model the VM handles. `#[ignore]`d for runtime class, like +/// the other heavy models. +/// +/// Run with: cargo test --release -- --ignored simulates_wrld3_03_wasm +#[test] +#[ignore] +fn simulates_wrld3_03_wasm() { + let mdl_path = "../../test/metasd/WRLD3-03/wrld3-03.mdl"; + + eprintln!("model (vensim mdl): {mdl_path}"); + + let contents = std::fs::read_to_string(mdl_path) + .unwrap_or_else(|e| panic!("failed to read {mdl_path}: {e}")); + + let datamodel_project = + open_vensim(&contents).unwrap_or_else(|e| panic!("failed to parse {mdl_path}: {e}")); + + // VM reference run. + let compiled = compile_vm(&datamodel_project); + let mut vm = + Vm::new(compiled).unwrap_or_else(|e| panic!("VM creation failed for {mdl_path}: {e}")); + vm.run_to_end() + .unwrap_or_else(|e| panic!("VM run failed for {mdl_path}: {e}")); + let vm_results = vm.into_results(); + + // wasm twin: compile through the backend, run under the interpreter, and + // match the VM element-for-element. + let wasm_results = wasm_results_for(&datamodel_project, "main").unwrap_or_else(|msg| { + panic!("WORLD3 must compile to wasm (a core-simulation model the VM handles): {msg}") + }); + + ensure_results(&vm_results, &wasm_results); +} + /// Known-residual C-LEARN base-variable names excluded from the /// `simulates_clearn` VDF gate. C-LEARN compiles via the incremental path, /// runs to FINAL TIME, and matches `Ref.vdf` within the 1% cross-simulator @@ -1838,13 +2073,11 @@ fn simulates_clearn() { ensure_vdf_results_excluding(&vdf_results, &results, EXPECTED_VDF_RESIDUAL); } -/// Compile and run C-LEARN end-to-end and parse `Ref.vdf`, returning -/// `(vdf_results, results)`. 
Shared by `simulates_clearn` (the 1% gate) and -/// `clearn_residual_exactness` (the exclusion-exactness guard) so both exercise -/// the byte-identical `open_vensim` -> `compile_vm` -> `run_to_end` -> parse-VDF -/// path and compare the same data. Heavy (C-LEARN is ~53k lines / 1.4 MB, -/// ~5s just to parse on release), so every caller is `#[ignore]`d. -fn run_clearn_vs_vdf() -> (Results, Results) { +/// Read and parse the C-LEARN `.mdl` into a datamodel project. Shared by the VM +/// path ([`run_clearn_vs_vdf`]) and the wasm twin ([`simulates_clearn_wasm`]) so +/// both compile the byte-identical model. Heavy (C-LEARN is ~53k lines / 1.4 MB, +/// ~5s just to parse on release). +fn clearn_datamodel() -> simlin_engine::datamodel::Project { let mdl_path = "../../test/xmutil_test_models/C-LEARN v77 for Vensim.mdl"; eprintln!("model (vensim mdl): {mdl_path}"); @@ -1852,28 +2085,72 @@ fn run_clearn_vs_vdf() -> (Results, Results) { let contents = std::fs::read_to_string(mdl_path) .unwrap_or_else(|e| panic!("failed to read {mdl_path}: {e}")); - let datamodel_project = - open_vensim(&contents).unwrap_or_else(|e| panic!("failed to parse {mdl_path}: {e}")); - - let compiled = compile_vm(&datamodel_project); - let mut vm = - Vm::new(compiled).unwrap_or_else(|e| panic!("VM creation failed for {mdl_path}: {e}")); - vm.run_to_end() - .unwrap_or_else(|e| panic!("VM run failed for {mdl_path}: {e}")); - let results = vm.into_results(); + open_vensim(&contents).unwrap_or_else(|e| panic!("failed to parse {mdl_path}: {e}")) +} +/// Parse the C-LEARN `Ref.vdf` genuine-Vensim reference output into `Results`. +/// Shared by every C-LEARN comparison path so they assert against identical data. 
+fn clearn_vdf_results() -> Results { let vdf_path = "../../test/xmutil_test_models/Ref.vdf"; let vdf_data_bytes = std::fs::read(vdf_path).unwrap_or_else(|e| panic!("failed to read {vdf_path}: {e}")); let vdf_file = simlin_engine::vdf::VdfFile::parse(vdf_data_bytes) .unwrap_or_else(|e| panic!("failed to parse VDF {vdf_path}: {e}")); - let vdf_results = vdf_file + vdf_file .to_results_via_records() - .unwrap_or_else(|e| panic!("VDF to_results_via_records failed: {e}")); + .unwrap_or_else(|e| panic!("VDF to_results_via_records failed: {e}")) +} + +/// Compile and run C-LEARN end-to-end through the VM and parse `Ref.vdf`, +/// returning `(vdf_results, results)`. Shared by `simulates_clearn` (the 1% gate) +/// and `clearn_residual_exactness` (the exclusion-exactness guard) so both +/// exercise the byte-identical `open_vensim` -> `compile_vm` -> `run_to_end` -> +/// parse-VDF path and compare the same data. Heavy, so every caller is +/// `#[ignore]`d. +fn run_clearn_vs_vdf() -> (Results, Results) { + let datamodel_project = clearn_datamodel(); + + let compiled = compile_vm(&datamodel_project); + let mut vm = + Vm::new(compiled).unwrap_or_else(|e| panic!("VM creation failed for C-LEARN: {e}")); + vm.run_to_end() + .unwrap_or_else(|e| panic!("VM run failed for C-LEARN: {e}")); + let results = vm.into_results(); + + let vdf_results = clearn_vdf_results(); (vdf_results, results) } +/// C-LEARN wasm parity twin (wasm-backend.AC1.3): compile C-LEARN through the +/// wasm backend, run it under the DLR-FT interpreter, and assert its output +/// clears the SAME hard 1% VDF gate + `EXPECTED_VDF_RESIDUAL` carve-out that +/// `simulates_clearn` applies to the VM. 
Both backends consume the same +/// `CompiledSimulation` produced by `compile_project_incremental`, so the wasm +/// output must clear the gate exactly as the VM does (a divergence is a wasm +/// lowering bug); the residual carve-out is identical because it is a property +/// of the model + reference data, not the execution engine. `#[ignore]`d for +/// runtime class -- C-LEARN under the non-JIT interpreter is slow -- exactly +/// like `simulates_clearn`. +/// +/// A `WasmGenError::Unsupported` here would be a hard failure: C-LEARN is a +/// core-simulation model the VM handles, so the wasm backend must too. +/// +/// Run with: cargo test --release -- --ignored simulates_clearn_wasm +#[test] +#[ignore] +fn simulates_clearn_wasm() { + let datamodel_project = clearn_datamodel(); + + let wasm_results = wasm_results_for(&datamodel_project, "main").unwrap_or_else(|msg| { + panic!("C-LEARN must compile to wasm (a core-simulation model the VM handles): {msg}") + }); + + let vdf_results = clearn_vdf_results(); + + ensure_vdf_results_excluding(&vdf_results, &wasm_results, EXPECTED_VDF_RESIDUAL); +} + /// Committed regression guard that `EXPECTED_VDF_RESIDUAL` stays EXACT: it is /// the precise set of C-LEARN base variables that the live `classify_vdf_ident` /// comparator flags, neither over- nor under-broad. Runs C-LEARN through the diff --git a/src/simlin-engine/tests/simulate_systems.rs b/src/simlin-engine/tests/simulate_systems.rs index 25cb432c3..26722c444 100644 --- a/src/simlin-engine/tests/simulate_systems.rs +++ b/src/simlin-engine/tests/simulate_systems.rs @@ -16,7 +16,7 @@ use simlin_engine::db::{ }; use simlin_engine::load_csv; -use test_helpers::ensure_results; +use test_helpers::{WasmRunOutcome, ensure_results, ensure_wasm_matches}; /// All valid systems format test models. 
const ALL_VALID_MODELS: &[&str] = &[ @@ -61,6 +61,98 @@ fn simulate_systems_file(txt_path: &str, csv_path: &str, rounds: u64) { .unwrap_or_else(|e| panic!("VM execution failed for {txt_path}: {e}")); let results = vm.into_results(); ensure_results(&expected, &results); + + // wasm-backend parity (AC3.2): a systems-format model translates to + // stdlib-module instances (`systems_rate`/`systems_leak`/`systems_conversion`), + // so this exercises the wasm backend's module path end-to-end. Every + // VM-simulated systems model must run through the wasm backend and clear the + // SAME comparator against `expected`; an `Unsupported` outcome here is a hard + // failure (this model VM-simulated, so the wasm backend must cover it). + // `ensure_wasm_matches` panics internally on a supported-but-wrong model. + if let WasmRunOutcome::Skipped(msg) = + ensure_wasm_matches(&datamodel_project, "main", &expected, &[]) + { + panic!( + "wasm parity gate: systems model {txt_path} VM-simulated but the wasm \ + backend returned Unsupported (AC3.2 -- every core-simulation model \ + must run through both backends): {msg}" + ); + } +} + +/// Parse + translate the systems model at `path` (a fixed `rounds`), run it +/// through the VM for an `expected` baseline, and return whether the wasm backend +/// reproduces it (`Ran`) or returns `Unsupported` (`Skipped`). A parse/ +/// translate/VM failure is surfaced as `Skipped` (those paths are gated by the +/// per-model simulation tests; this gate only checks wasm-vs-VM parity). 
+fn wasm_systems_outcome_for_path(path: &str, rounds: u64) -> WasmRunOutcome {
+    // Every stage ahead of the wasm comparison reports failure as a `Skipped`
+    // reason, so run them in one fallible closure and fold the error at the end.
+    let attempt = || -> Result<WasmRunOutcome, String> {
+        let text =
+            std::fs::read_to_string(path).map_err(|_| format!("could not read {path}"))?;
+        let parsed =
+            simlin_engine::systems::parse(&text).map_err(|e| format!("parse failed: {e}"))?;
+        let project = simlin_engine::systems::translate::translate(&parsed, rounds)
+            .map_err(|e| format!("translate failed: {e}"))?;
+
+        // VM baseline: compile incrementally, then run to the end.
+        let mut db = SimlinDb::default();
+        let sync = sync_from_datamodel_incremental(&mut db, &project, None);
+        let compiled = compile_project_incremental(&db, sync.project, "main")
+            .map_err(|e| format!("VM compile failed: {e:?}"))?;
+        let mut vm = Vm::new(compiled).map_err(|e| format!("VM creation failed: {e}"))?;
+        vm.run_to_end()
+            .map_err(|e| format!("VM run failed: {e}"))?;
+        let baseline = vm.into_results();
+
+        // The wasm run is compared against the VM's own output.
+        Ok(ensure_wasm_matches(&project, "main", &baseline, &[]))
+    };
+
+    attempt().unwrap_or_else(WasmRunOutcome::Skipped)
+}
+
+/// End-state wasm parity gate (AC3.2 / AC3.3): EVERY systems-format model must
+/// run through the wasm backend to VM parity -- zero may return
+/// `WasmGenError::Unsupported`. Systems-format models translate to stdlib-module
+/// instances (`systems_rate`/`systems_leak`/`systems_conversion`), so they
+/// exercise the wasm backend's `EvalModule`/`LoadModuleInput` path end-to-end.
+/// This is a direct wasm-vs-VM check (the VM's own output is the baseline),
+/// independent of the on-disk CSV fixtures. The per-model simulation tests
+/// additionally run the inline `ensure_wasm_matches` hook against their
+/// CSV-cleared `expected` and likewise hard-fail on `Unsupported`. A regression
+/// that drops a previously-supported systems model fails here with the offender.
+#[test]
+fn wasm_systems_parity_floor() {
+    // Record every model that did not run (with its reason) so the assertion
+    // below can list all of them at once.
+    let mut unsupported: Vec<(String, String)> = Vec::new();
+    for &path in ALL_VALID_MODELS {
+        // A fixed `rounds` like the compile-only gate; the wasm-vs-VM parity does
+        // not depend on the exact horizon, only that both backends agree on it.
+        if let WasmRunOutcome::Skipped(msg) = wasm_systems_outcome_for_path(path, 5) {
+            unsupported.push((path.to_string(), msg));
+        }
+    }
+    eprintln!(
+        "wasm_systems_parity_floor: {} of {} systems models ran to VM parity ({} unsupported)",
+        ALL_VALID_MODELS.len() - unsupported.len(),
+        ALL_VALID_MODELS.len(),
+        unsupported.len()
+    );
+    assert!(
+        unsupported.is_empty(),
+        "wasm systems parity gate (AC3.2/AC3.3): every systems model must run \
+         through the wasm backend, but {} of {} returned Unsupported:\n{}",
+        unsupported.len(),
+        ALL_VALID_MODELS.len(),
+        unsupported
+            .iter()
+            .map(|(p, m)| format!(" {p}: {m}"))
+            .collect::<Vec<_>>()
+            .join("\n")
+    );
 }
 
 #[test]
diff --git a/src/simlin-engine/tests/test_helpers.rs b/src/simlin-engine/tests/test_helpers.rs
index bd6bb9da8..e4537bc8d 100644
--- a/src/simlin-engine/tests/test_helpers.rs
+++ b/src/simlin-engine/tests/test_helpers.rs
@@ -6,10 +6,23 @@
 //!
 //! Extracted from `simulate.rs` so that multiple integration test files
 //! (simulate.rs, simulate_systems.rs, etc.) can share the comparison logic.
+//!
+//! pattern: Mixed (unavoidable)
+//! Reason: `ensure_results*` is a pure comparator (Functional Core), while
+//! `ensure_wasm_matches` is an Imperative Shell (it drives the salsa compile
+//! pipeline and executes the emitted wasm under the DLR-FT interpreter). They
+//! live together because this is the single shared test-helper module the
+//! implementation plan centralizes comparison logic in, and the wasm shell's
+//! only job is to feed the pure comparator. The slab -> `Results` conversion is
+//! extracted as a pure function (`wasm_results_from_slab`) to keep the I/O
+//! boundary explicit.
+use checked::Store;
 use float_cmp::approx_eq;
-use simlin_engine::Results;
 use simlin_engine::common::{Canonical, Ident};
+use simlin_engine::wasmgen::{WasmGenError, WasmLayout, compile_simulation};
+use simlin_engine::{Results, SimSpecs};
+use wasm::validate;
 
 /// Columns that are vendor-specific or otherwise not important for
 /// simulation correctness.
@@ -128,3 +141,176 @@ pub fn ensure_results_excluding(expected: &Results, results: &Results, excluded:
             .contains_key(&Ident::<Canonical>::from_str_unchecked("UNKNOWN"))
     );
 }
+
+// The wasm-parity helpers below are consumed by the `simulate` corpus binary
+// and by `simulate_systems`; the other test binaries that share this module
+// (`systems_roundtrip`, `metasd_macros`) include the file but do not run wasm
+// parity, so each item is `#[allow(dead_code)]` to stay clean under
+// `cargo clippy --all-targets -- -D warnings` (the same shared-helper idiom as
+// `SimTier` in `metasd_macros.rs`).
+
+/// Outcome of running a model through the wasm backend via
+/// [`ensure_wasm_matches`].
+///
+/// `Ran` means the model was within the wasm backend's supported feature set,
+/// executed under the interpreter, and CLEARED the parity comparator (the
+/// helper panics internally on any divergence -- a supported-but-wrong model is
+/// a hard failure, never a `Ran`). `Skipped` means `compile_simulation`
+/// returned [`WasmGenError::Unsupported`] (an out-of-scope construct); the
+/// message is carried so the caller decides whether that is a failure.
+///
+/// Phase 8 closed the corpus gate: for a model the VM SIMULATED in the default
+/// suite, a `Skipped` outcome is now a HARD FAILURE -- the corpus callers
+/// (`wasm_parity_hook`, the parity-floor gates, the systems harness) panic on
+/// it (wasm-backend AC3.2: every core-simulation model runs through both
+/// backends). The variant survives only so the `ensure_wasm_matches_skips_*`
+/// unit test can still observe a *genuinely* out-of-scope construct returning a
+/// clean `Unsupported` (AC1.4) -- never a panic or a silently wrong result --
+/// rather than reaching the hook.
+#[allow(dead_code)]
+#[derive(Debug)]
+pub enum WasmRunOutcome {
+    Ran,
+    Skipped(String),
+}
+
+/// Build a `Results` from a wasm backend's step-major results slab.
+///
+/// The slab is `layout.n_chunks * layout.n_slots` f64 laid out row-major by
+/// saved step (the same step-major order the bytecode VM's `Results` uses), so
+/// `step_size = n_slots` and `step_count = n_chunks` make `Results::iter` yield
+/// one chunk per saved step. Each canonical variable name in `layout` maps back
+/// to its slot offset within a chunk. `is_vensim = false`: a wasm-emitted run is
+/// a Simlin computation, so it takes the absolute-tolerance branch of the
+/// comparator (never the Vensim relative-tolerance branch).
+///
+/// Pure: no I/O, no global state -- it only reshapes already-read data, so it is
+/// the Functional Core boundary of [`ensure_wasm_matches`].
+#[allow(dead_code)]
+fn wasm_results_from_slab(layout: &WasmLayout, slab: Vec<f64>, specs: SimSpecs) -> Results {
+    let offsets = layout
+        .var_offsets
+        .iter()
+        // The names came from `CompiledSimulation::offsets`, whose keys are
+        // already `Ident<Canonical>`, so they round-trip without re-canonicalizing.
+        .map(|(name, off)| (Ident::<Canonical>::from_str_unchecked(name), *off))
+        .collect();
+
+    Results {
+        offsets,
+        data: slab.into_boxed_slice(),
+        step_size: layout.n_slots,
+        step_count: layout.n_chunks,
+        specs,
+        is_vensim: false,
+    }
+}
+
+/// Compile `model_name` of `datamodel` to wasm, run it under the DLR-FT
+/// interpreter, and reshape the results slab into a [`Results`] — or return the
+/// `Unsupported` message if the model is outside the wasm backend's feature set.
+///
+/// Builds the `CompiledSimulation` exactly as the corpus VM path does
+/// (simulate.rs `compile_vm`), so the wasm blob is the twin of the VM's run. An
+/// incremental-compile error (a VM-side issue gated elsewhere) and an
+/// `Unsupported` codegen result both return `Err(msg)`; the caller decides
+/// whether that is a skip or a hard failure.
+///
+/// Imperative Shell: drives the salsa compile pipeline and the wasm interpreter
+/// (side effects), delegating the reshape to the pure [`wasm_results_from_slab`].
+/// Shared by [`ensure_wasm_matches`] (the corpus `.dat`/CSV comparator) and the
+/// heavy-model wasm twins (WORLD3, which compares against the VM's output, and
+/// C-LEARN, which compares against `Ref.vdf` instead).
+#[allow(dead_code)]
+pub fn wasm_results_for(
+    datamodel: &simlin_engine::datamodel::Project,
+    model_name: &str,
+) -> Result<Results, String> {
+    use simlin_engine::db::{
+        SimlinDb, compile_project_incremental, sync_from_datamodel_incremental,
+    };
+
+    let mut db = SimlinDb::default();
+    let sync = sync_from_datamodel_incremental(&mut db, datamodel, None);
+    let sim = compile_project_incremental(&db, sync.project, model_name)
+        .map_err(|e| format!("incremental compile failed: {e:?}"))?;
+
+    // Spelled as an explicit match (no catch-all arm) so every codegen error
+    // variant has to be handled by name here.
+    let artifact = match compile_simulation(&sim) {
+        Ok(artifact) => artifact,
+        Err(WasmGenError::Unsupported(msg)) => return Err(msg),
+    };
+
+    let slab = run_wasm_results(&artifact.wasm, &artifact.layout);
+    let specs = SimSpecs::from(&datamodel.sim_specs);
+    Ok(wasm_results_from_slab(&artifact.layout, slab, specs))
+}
+
+/// Compile `model_name` of `datamodel` to wasm, run it under the DLR-FT
+/// interpreter, and assert its results clear the SAME `ensure_results_excluding`
+/// comparator the VM clears against `expected`.
+///
+/// There is no separate, tighter wasm-vs-VM threshold (per the design's
+/// validation bar): "wasm-vs-VM parity" is established because both backends
+/// clear the identical comparator against the identical expected outputs. A
+/// model outside the wasm backend's supported feature set returns
+/// [`WasmRunOutcome::Skipped`] (never a failure), as does an incremental-compile
+/// error, which [`wasm_results_for`] reports through the same `Err(msg)`; a
+/// supported model whose wasm output diverges panics inside
+/// `ensure_results_excluding`.
+///
+/// Imperative Shell: it drives the salsa compile pipeline and the wasm
+/// interpreter (side effects), delegating the reshape to the pure
+/// [`wasm_results_from_slab`] and the comparison to the pure
+/// [`ensure_results_excluding`].
+#[allow(dead_code)]
+pub fn ensure_wasm_matches(
+    datamodel: &simlin_engine::datamodel::Project,
+    model_name: &str,
+    expected: &Results,
+    excluded: &[&str],
+) -> WasmRunOutcome {
+    let wasm_results = match wasm_results_for(datamodel, model_name) {
+        Ok(results) => results,
+        Err(msg) => return WasmRunOutcome::Skipped(msg),
+    };
+
+    // The same comparator the VM clears: panics loudly on any divergence, so a
+    // supported-but-wrong wasm module fails here rather than reporting Ran.
+    ensure_results_excluding(expected, &wasm_results, excluded);
+    WasmRunOutcome::Ran
+}
+
+/// Instantiate `wasm` under the DLR-FT `checked::Store`, invoke the exported
+/// `run`, and copy `n_chunks * n_slots` f64 out of the results region (located
+/// via `layout.results_offset`). This is the wasm-execution side effect of
+/// [`wasm_results_for`] (and so of [`ensure_wasm_matches`]); the bytes it
+/// returns are consumed purely afterward.
+#[allow(dead_code)]
+fn run_wasm_results(wasm: &[u8], layout: &WasmLayout) -> Vec<f64> {
+    let info = validate(wasm).expect("generated wasm module must validate");
+    let mut store = Store::new(());
+    let inst = store
+        .module_instantiate(&info, Vec::new(), None)
+        .expect("instantiate wasm module")
+        .module_addr;
+
+    // Run the simulation: the exported `run` takes no arguments and returns
+    // nothing; its output is left in the module's linear memory.
+    let run = store
+        .instance_export(inst, "run")
+        .expect("run export must exist")
+        .as_func()
+        .expect("run export must be a function");
+    store
+        .invoke_simple_typed::<(), ()>(run, ())
+        .expect("run wasm");
+
+    let mem = store
+        .instance_export(inst, "memory")
+        .expect("memory export must exist")
+        .as_mem()
+        .expect("memory export must be a memory");
+
+    // The results region is `n` little-endian f64 cells starting at `base`.
+    // Slicing it once bounds-checks the whole region up front (a too-small
+    // memory panics here), and `chunks_exact(8)` then yields each 8-byte cell.
+    let n = layout.n_chunks * layout.n_slots;
+    let base = layout.results_offset;
+    store.mem_access_mut_slice(mem, |bytes| {
+        bytes[base..base + n * 8]
+            .chunks_exact(8)
+            .map(|cell| {
+                f64::from_le_bytes(cell.try_into().expect("chunks_exact(8) yields 8-byte cells"))
+            })
+            .collect()
+    })
+}