diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 000000000..a3f04c60a --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,17 @@ +# Force the WASM bundle back to size-optimized codegen. +# +# `[profile.release]` (in the workspace Cargo.toml) is `opt-level = 3` so native +# binaries (CLI, server, MCP, pysimlin's libsimlin, C FFI) optimize for speed. +# The browser WASM bundle is dominated by download size, not CPU, so we override +# the opt-level for the wasm32 target here. Keying this on the *target* (rather +# than passing a flag in each build script) means every wasm build path -- the +# `cargo build --target wasm32-unknown-unknown` in src/engine/build.sh, any +# future wasm-pack invocation, etc. -- stays size-optimized automatically. +# +# Caveat: a `RUSTFLAGS` *environment variable* takes precedence over and replaces +# these target rustflags (Cargo does not merge the two). Today only the asan test +# scripts set RUSTFLAGS, and they build for the host, so the wasm bundle is +# unaffected. Do not set RUSTFLAGS during a wasm release build or it will pick up +# opt-level 3. 
+[target.wasm32-unknown-unknown] +rustflags = ["-C", "opt-level=z"] diff --git a/Cargo.lock b/Cargo.lock index 41cce9ee3..ee1694b26 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2076,6 +2076,15 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "libmimalloc-sys" +version = "0.1.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2892ae4ea6fa2cb7acb0e236a6880d39523239cd9089de71d220910ccc806790" +dependencies = [ + "cc", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -2316,6 +2325,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mimalloc" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebca48a43116bc25f18a61360f1be98412f50cc218f5e52c823086b999a4a21a" +dependencies = [ + "libmimalloc-sys", +] + [[package]] name = "mime" version = "0.3.17" @@ -3768,6 +3786,7 @@ name = "simlin" version = "0.1.0" dependencies = [ "anyhow", + "mimalloc", "prost", "serde", "serde_json", @@ -3824,6 +3843,7 @@ name = "simlin-mcp" version = "0.1.6" dependencies = [ "anyhow", + "mimalloc", "rmcp", "serde_json", "serde_yaml", @@ -3862,6 +3882,7 @@ dependencies = [ "hyper-util", "ignore", "loro", + "mimalloc", "mime_guess", "notify-debouncer-full", "open", diff --git a/Cargo.toml b/Cargo.toml index aa66574c1..c2a44e799 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,8 +11,14 @@ members = [ "src/xmutil", ] +# Native release builds optimize for speed (opt-level = 3). The WASM bundle, +# where download size dominates, is forced back to opt-level = "z" via +# `.cargo/config.toml` ([target.wasm32-unknown-unknown] rustflags), which is +# keyed on the target rather than the build invocation so every wasm build path +# stays size-optimized. Measured on C-LEARN: opt-level 3 vs "z" is ~-30% compile +# and ~-41% simulate on native (see docs/design/engine-performance.md). 
[profile.release] -opt-level = "z" +opt-level = 3 lto = true panic = "abort" strip = true diff --git a/docs/README.md b/docs/README.md index a6954f1fc..ea9715bb8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,6 +4,7 @@ - [architecture.md](architecture.md) -- Component descriptions, dependency graph, project structure - [design/2026-02-21-incremental-compilation.md](design/2026-02-21-incremental-compilation.md) -- Incremental compilation via salsa: symbolic bytecode, per-variable tracking, LTM integration +- [design/engine-performance.md](design/engine-performance.md) -- Engine compile/simulate profile (C-LEARN), implemented optimizations, and remaining proposals - [design/ltm--loops-that-matter.md](design/ltm--loops-that-matter.md) -- LTM implementation design: data structures, synthetic variables, module handling - [design/mdl-parser.md](design/mdl-parser.md) -- Vensim MDL parser design history and implementation notes - [design/vdf.md](design/vdf.md) -- VDF binary format specification and parser design diff --git a/docs/design/engine-performance.md b/docs/design/engine-performance.md new file mode 100644 index 000000000..7d13a15e4 --- /dev/null +++ b/docs/design/engine-performance.md @@ -0,0 +1,314 @@ +# Engine performance: profile and optimization opportunities + +Status: analysis + first clear wins landed. 2026-05-19. + +This documents an empirical CPU/memory profile of **compiling and simulating the +C-LEARN hero model** (the largest model we have: ~53k MDL lines / 1.4 MB, 934 +datamodel variables, 5726 root slots, 162 graphical functions, 1000 Euler +timesteps), the clear-win optimizations already implemented on top of it, and a +set of larger proposals grounded in the measured data. 
+ +## Methodology + +- Harness: `src/simlin-engine/examples/clearn_profile.rs` — times each pipeline + stage (parse → compile-via-salsa → `Vm::new` → `run_to_end`) and, with + `CLEARN_COUNT_ALLOCS=1`, reports allocation counts / peak live bytes per stage + via a gated counting global allocator. With high `CLEARN_COMPILE_ITERS` / + `CLEARN_RUN_ITERS` it is a focused `perf record` / `callgrind` target. +- `CompiledSimulation::bytecode_profile()` — opcode histogram + table sizes. +- CPU: `perf record -g --call-graph dwarf` and `valgrind --tool=callgrind` + (exact call counts). Memory: the counting allocator. Machine: Ryzen 9950X. +- Numbers below were measured on the pre-change `[profile.release]` + (`opt-level="z"`, LTO; now `opt-level = 3`) unless noted. Profile builds add + `CARGO_PROFILE_RELEASE_DEBUG=1 CARGO_PROFILE_RELEASE_STRIP=false`. + +## Measured baseline (before this work) + +| Phase | Wall (per iter) | Allocations | Dominant costs | +|---|---|---|---| +| parse (`open_vensim`) | ~69 ms | 0.82M | MDL lexer/parser/convert | +| **compile (salsa)** | **~3574 ms** | **73M (8.9 GiB churned, 3.3 MiB retained)** | ~30% raw `malloc`/`free`; `reconstruct_variable` 6.4%; `canonicalize`+`to_lowercase` ~3.8% (6.1M `to_lowercase` calls); parse front-end ~4% (3.86M `parse_app`) | +| `Vm::new` | ~0.6 ms | 7.8k | buffer allocation | +| **run (`run_to_end`)** | **~342 ms** | **2.9M (~2944/timestep)** | `eval_bytecode` 35%; **~15% `make_module_key` clone + `HashMap` SipHash inside `EvalModule`**; `RuntimeView` machinery ~9% | + +Two structural facts dominate: + +1. **Compile is ~10× the run and is allocation-bound.** ~30% of compile + instructions are in glibc `malloc`/`free`, churning millions of tiny, + short-lived allocations (AST `Box` nodes, `canonicalize` `String`s, repeated + `datamodel::Variable` reconstruction). The front-end node count is amplified + because arrayed equations are parsed per declared element. +2. 
**The run's entire per-timestep allocation churn was one thing:** the + `EvalModule` opcode rebuilt a `(String, BTreeSet)` module key and + SipHashed it for a `HashMap` lookup on every module evaluation, every step + (~1344 `EvalModule` × 1000 steps ≈ 1.34M key constructions, each ≥2 heap + allocations). + +Bytecode shape (unchanged by this work): 64420 opcodes (8 B each = 503 KiB); +34673 are flow (the hot per-step program = 277 KiB). Flow histogram: `LoadVar` +32.8%, `Op2` 18.9%, `LoadConstant` 12.1%, `AssignCurr` 6.8%, `If`/`SetCond` 4.7% +each. So ~70% of executed opcodes are load / store / binary-op. + +## Clear wins implemented + +All three are behavior-preserving: the 3530 engine lib tests, 91 `simulate` +integration tests, and the `clearn_residual_exactness` guard (C-LEARN matches +Vensim's `Ref.vdf` byte-for-byte) all pass, and the compiled bytecode is +byte-identical (64420 opcodes). + +### 1. `EvalModule` index dispatch (run −17%, run allocations → 0) + +`make_module_key` cloned a `String` + `BTreeSet` and the `EvalModule` +opcode SipHashed it for a `HashMap` lookup, every module-eval every +timestep. Replaced the three keyed maps (`flow_modules` / `stock_modules` / +`initial_modules`) with a single `Vec` indexed by integer, plus a +`child_targets: Vec` per module resolving each `EvalModule` declaration to +its child's index **once** at `Vm::new`. The eval recursion threads a +`module_idx` and array-indexes; the `ModuleKey` map survives only for the cold +`set_value` / `clear_values` literal-override paths. + +- **run 342 → 283 ms (−17%)**; `run_to_end` allocations **2.94M → 0**. +- Post-change profile: `eval_bytecode` 35% → 46% (now the real work), the ~15% + SipHash cost gone entirely. + +### 2. Allocation-free 0-arity-builtin check (compile −3%, −1.45M allocs) + +`Expr0::reify_0_arity_builtins` called `id.as_str().to_lowercase()` (a heap +allocation) on **every** variable reference just to test membership in a +9-element ASCII set. 
Added `builtins::is_0_arity_builtin_fn_ci` (ASCII +case-insensitive, allocation-free) and only materialize the lowercased name in +the rare case a genuine `pi`/`time`/etc. reference is reified. + +- **compile 3574 → 3458 ms (−3.2%)**, −1.45M allocations. + +### 3. Cached project dims in `compile_var_fragment` (−130k allocs) + +`compile_var_fragment` (salsa-tracked, once per variable) rebuilt the full +datamodel dimension `Vec` via `source_dims_to_datamodel(project.dimensions(db))` +per variable; switched to the already-cached `project_datamodel_dims` query +(`returns(ref)`). Provably equivalent (the cached query is defined as exactly +that call). Marginal on C-LEARN (only 18 dims) but strictly correct and removes a +redundant per-variable rebuild. + +## Build-level levers (measured, near-free, the biggest wins) — IMPLEMENTED + +These need no engine-code changes and dwarf the code-level compile work. Both are +**native-only**: the WASM bundle (built via `cargo build --target +wasm32-unknown-unknown --release`) keeps `opt-level="z"` for download size and +never links mimalloc. + +### A. `opt-level = 3` for native (compile −30%, run −41%) + +`[profile.release]` is now `opt-level = 3`. The WASM bundle is forced back to +`opt-level=z` by `.cargo/config.toml` (`[target.wasm32-unknown-unknown] rustflags += ["-C", "opt-level=z"]`) — keyed on the target, so every wasm build path stays +size-optimized regardless of invocation (verified: wasm bundle 7.19 MB at z vs +9.75 MB at 3). Measured on C-LEARN (with the code wins in): + +| | opt="z" | opt=3 | delta | +|---|---|---|---| +| compile | 3485 ms | 2450 ms | **−30%** | +| run | 283 ms | 168 ms | **−41%** | + +Caveat documented in `.cargo/config.toml`: a `RUSTFLAGS` *env var* replaces the +target rustflags, so don't set `RUSTFLAGS` during a wasm release build. + +### B. 
mimalloc for native (compile −40% on top of opt=3) + +Compile is allocation-bound, so a faster allocator pays off directly: + +| | system malloc | mimalloc | delta | +|---|---|---|---| +| compile | 2450 ms | 1459 ms | **−40%** | +| run | 168 ms | 167 ms | none (run is allocation-free post-win #1) | + +Wiring: the standalone binaries (`simlin-serve`, `simlin-mcp`) set +`#[global_allocator] mimalloc::MiMalloc` in their `main.rs` (native binaries, +never wasm). `libsimlin` (the cdylib used by pysimlin via cffi and by C/C++ FFI, +*and* the wasm crate) gates it behind an opt-in `mimalloc` feature that is +additionally `cfg(not(target_arch = "wasm32"))`; pysimlin's build +(`Makefile`, `scripts/build_wheels.py`) enables `--features mimalloc`. `simlin-cli` +links libsimlin, so it enables that feature on its `simlin` dependency rather than +declaring a second `#[global_allocator]` (two in one artifact fail to compile). + +**Cumulative compile: 3574 → 1459 ms (−59%)** via code wins + opt=3 + mimalloc. +**Cumulative run: 342 → 168 ms (−51%)** via code win + opt=3. + +## Run-side proposals (post-win hot path: `eval_bytecode` 46%, `RuntimeView` ~20%) + +### R1. Bounds-check elimination on `curr`/`next` indexing — INVESTIGATED, not worth it + +The hot opcodes index `curr[module_off + off]`, `next[...]`, +`bytecode.literals[id]`, and `context.graphical_functions[gf]`. Disassembly +confirms `eval_bytecode` carries 127 `panic_bounds_check` sites, so LLVM is not +eliding them. An earlier draft of this doc proposed `get_unchecked` here as "the +biggest code-level run win" — direct measurement disproves that. + +**Measured ceiling: ~0.** Replacing the bounds checks on the hottest scalar arms +(`LoadVar`, `LoadConstant`, `LoadGlobalVar`, `AssignCurr`/`Next`, +`AssignConstCurr`, `BinOpAssignCurr`/`Next`) *and* the dispatch `code[pc]` access +with `get_unchecked` moved the C-LEARN run by less than run-to-run noise (165–172 +ms across runs, vs ~167 ms checked). 
On a modern out-of-order core at +`opt-level=3` an always-in-bounds check is a perfectly-predicted, never-taken +branch with an out-of-line cold panic path — effectively free. (The ~10% in +`RuntimeView::flat_offset` is a per-element `SmallVec` rebuild + linear sparse +search, *not* a bounds check — see R4.) + +**Can safe code eliminate them (the optimizer-coaxing question)?** +- The dispatch index is *already* check-free in safe code: `while pc < + code.len() { match &code[pc] }` — the loop guard dominates the access with the + identical bound, so LLVM proves it in range. This is the canonical safe-BCE + pattern (the Go equivalent is the elision after `for i := 0; i < len(s); + i++`). Confirmed: `get_unchecked` on `code[pc]` made no difference. +- The data-driven indices cannot be made check-free in safe code. `off` is `u16` + opcode data and `module_off` is a runtime module base; the in-range invariant + is established by a separate validation pass and is not re-derivable at the + access site from types or local control flow. The safe idioms that *do* elide + don't fit: sequential iteration / `chunks`/`windows` (this is random access); + fixed-size `[T; N]` (n_slots is runtime); power-of-two masking `i & (len-1)` + (needs a compile-time-constant power-of-two length); a hoisted `assert!(i < + len)` (that *is* the check, relocated — `i` is per-opcode so it can't hoist out + of the loop). Removing them would require `unsafe` `get_unchecked` + a static + validation pass (the `Stack` pattern), verifiable under miri — and miri detects + OOB at runtime, it does not remove checks. + +**Decision: do not implement.** `unsafe` in a `#![deny(unsafe_code)]` crate, plus +a validation pass and a miri burden, is not justified for a sub-noise gain. The +run's *instruction count*, not its bounds checks, is the lever — that is R2. The +"bytecode density / dcache" intuition is also a non-issue: the program streams +linearly (prefetcher-friendly) and is already 8 B/opcode. + +### R2. 
3-address binop fusion — IMPLEMENTED (run −6.8% on C-LEARN) + +~70% of executed opcodes are load/store/binop. A stack VM evaluates `a op b` as +`LoadX; LoadY; Op2` (3 dispatches); folding the leaf operand loads into the op +makes it 1. Crucially **the `curr[]` slot array is already the register file** — +variables live at fixed offsets — so the fused ops read operands straight from +`curr[]`/`literals` (or pop one from the stack), and the stack carries only +nested subexpression results. + +**Opcode budget forced a 2-operand design.** A full 3-operand `dst = a op b` +(3×u16 + Op2 = 7-byte payload → 10-byte enum) blows the asserted 8-byte `Opcode`. +So the fused ops are 2-operand *pushing* forms (≤6 bytes): `BinVarVar`, +`BinVarConst`, `BinConstVar` (both operands are leaves; fuse `Load; Load; Op2`, +3→1) and `BinStackVar`, `BinStackConst` (lhs already on the stack; fuse `Load; +Op2`, 2→1). A leaf *assignment* `dst = a op b` keeps the existing +`BinOpAssignCurr` for the store (so it stays 3 ops, not 1) — those are a minority +(`BinOpAssignCurr` ≪ `Op2`). + +**Where it runs.** A late `ByteCode::fuse_three_address` pass applied to the Vm's +flow/stock execution bytecode at `Vm::new`, reusing `peephole_optimize`'s +jump-target guard + old→new PC remap and preserving `max_stack_depth`. It runs at +`Vm::new` rather than compile time deliberately: the `CompiledSimulation` stays a +pure, *symbolizable*, salsa-cached artifact (the symbolic roundtrip tests +symbolize it; the fused opcodes have no symbolic form), and the `Vm`'s execution +copy is where the optimization lives. Per-`Vm` fusion is a linear scan, negligible +vs a run. Initials are left unfused (run once; `extract_assign_curr_offsets` reads +their `AssignCurr` targets). + +**Result.** Flow opcodes 34673 → 26539 on C-LEARN (−23.5%); run 166.8 → 155.4 ms +(−6.8%). 
The opcode reduction outweighs the runtime gain because the f64 +arithmetic, stock phase, save/copy, and array machinery (`flat_offset`, R4) are +untouched — only the scalar *dispatch* shrinks. Scalar-heavy models benefit more +than array-heavy C-LEARN. Behavior-preserving: full suite + +`clearn_residual_exactness` pass, with dedicated fusion-pass and operand-order unit tests. + +A true register VM with a scratch-register file and a 3-operand instruction set +(register allocation over each expression DAG) would cut more, but is a large +codegen rewrite touching the symbolic/incremental layer; the 2-operand fusion +captures most of the dispatch win at a fraction of the risk. + +### R3. Faster dispatch + +The dispatch is `while pc < len { match &code[pc] { … } }`, which LLVM lowers to a +jump table (one indirect branch whose target is data-dependent → BTB-unfriendly). +Classic threaded dispatch (computed-goto / guaranteed tail calls) would spread the +indirect branch across handlers for better prediction, but **stable Rust offers +neither computed-goto nor guaranteed TCO** (the `become` keyword is unstable). +Portable options: + +- **More superinstructions** for the top remaining opcode bigrams/trigrams (R2 + already fuses `LoadVar; LoadVar; Op2` and `LoadConstant; Op2`). Each fused opcode + removes a dispatch; incremental and low-risk. This is the portable lever today. +- Revisit explicit tail-call dispatch if/when `become` stabilizes. +- A full register VM (the larger follow-on sketched under R2) would reduce dispatch count more than any dispatch-mechanism change. + +### R4. `RuntimeView` allocation + `flat_offset` (~20% of post-win run) + +`PushVarView`/`PushTempView` rebuild `SmallVec`s (dims, strides, dim_ids) on every +execution; `flat_offset` (10.3%) recomputes row-major offsets per element. For +arrayed models this is now the #2 run cost. 
+ +Proposal: (a) push more views through the compile-time `PushStaticView` path +(precomputed `StaticArrayView`) and store dynamic view descriptors in the +`ByteCodeContext` referenced by id (as `dim_lists` already does for dim ids), +eliminating per-op `SmallVec` construction; (b) ensure the `is_contiguous` fast +path in iteration/reduction is taken for the common dense case so `flat_offset`'s +general strided arithmetic only runs for transposed/sparse views. + +- Effort: medium. Risk: low–medium (array semantics are well-tested by + `array_tests`). + +## Compile-side proposals (the bigger pie — but build levers A+B capture most of it) + +After opt=3 + mimalloc the compile is ~1.46 s (from 3.57 s) with **no code +changes**. The following are second-order and worth it only if compile latency +remains a UX problem after the build levers (it matters for the salsa +*incremental* edit loop more than cold compile). + +### C1. Arena-allocate the transient parse AST + +The equation parser builds `Expr0` with `Box` children + `Vec` args — 3.86M+ +transient heap allocations, all lowered to `VariableStage0` and dropped. +`bumpalo` is already a dependency. Allocating the AST in a per-parse arena turns +these into pointer bumps. The constraint: the salsa-cached result +(`ParsedVariableResult`) must be owned/`'static`, so the arena can only back the +transient parse→lower step, with the cached value being the owned lowered form. +Much of this benefit is captured more cheaply by mimalloc (B); pursue the arena +only if profiling after B still shows the parser as a hotspot. + +- Effort: large (thread an arena through the parser; verify nothing cached + retains an arena reference). Risk: medium. + +### C2. 
Halve `reconstruct_variable` (6.4% of compile) + +`reconstruct_variable` rebuilds a full `datamodel::Variable` (ident/equation/ +inflows/outflows/compat clones) and is called ~2× per variable: once in the +per-variable parse, and once in `module_ident_context_for_model` → +`collect_module_idents`. The latter only needs each variable's `(ident, kind, +is-module-call)` — a lighter projection straight from `SourceVariable` would +avoid ~half the full reconstructions (and their clones). + +- Effort: medium. Risk: low–medium (changes the `collect_module_idents` input + type; behavior must stay identical). + +### C3. `canonicalize` ASCII fast-path + ident interning + +6.1M `to_lowercase` calls; ~4.6M are the `canonicalize` slow path (Vensim names +have spaces/capitals so they don't hit the alloc-free fast path). Two levers: +(a) lowercase ASCII in place into the output buffer instead of allocating a +per-part intermediate `String` (careful: keep Unicode correctness — the function +has extensive idempotence tests, #559); (b) **intern** canonical idents so +repeated canonicalization of the same name is a hashmap hit rather than a +re-derivation. (b) is broader but touches many call sites. + +- Effort: (a) small/careful, (b) medium–large. Risk: (a) medium (correctness- + critical function), (b) medium. + +## Suggested ordering + +1. ~~**Build levers A (opt=3 native) + B (mimalloc native)**~~ — DONE. Measured + −59% compile / −41% run for ~no engine code and near-zero risk + (`[profile.release] opt-level=3` + `.cargo/config.toml` wasm override; + `mimalloc` global allocator on the native binaries + libsimlin's opt-in + feature). WASM stays on `z` and links no mimalloc. +2. ~~**R1 (bounds-check elimination)**~~ — INVESTIGATED, dropped: measured + sub-noise (~0) ceiling; bounds checks are effectively free at opt-level=3. +3. ~~**R2 (3-address binop fusion)**~~ — DONE. 
Flow opcodes −23.5%, run −6.8% on + C-LEARN; a late `fuse_three_address` pass at Vm::new (the `CompiledSimulation` + stays symbolizable). A full register VM would cut more but is a large rewrite. +4. **R4 (RuntimeView)** — now the largest remaining run lever for arrayed models; + the ~10% `flat_offset` cost is a per-element `SmallVec` rebuild + sparse + search, not bounds checks. +5. **R3 superinstructions** — incremental dispatch wins, low risk. +6. **C2 / C3** — only if incremental-compile latency still bites after A+B. diff --git a/src/libsimlin/Cargo.toml b/src/libsimlin/Cargo.toml index 9f27ea2ff..1585091dd 100644 --- a/src/libsimlin/Cargo.toml +++ b/src/libsimlin/Cargo.toml @@ -15,6 +15,12 @@ default = [] debug-derive = ["simlin-engine/debug-derive"] file_io = ["simlin-engine/file_io"] ext_data = ["simlin-engine/ext_data"] +# Use mimalloc as the global allocator. Off by default; enabled by native +# consumers of the cdylib/staticlib (pysimlin via cffi, C/C++ FFI) where the +# allocation-heavy engine compile path benefits. Never enabled for the wasm32 +# bundle (the wasm build uses --no-default-features), and the global_allocator +# is additionally cfg'd off for wasm32 in lib.rs as belt-and-suspenders. +mimalloc = ["dep:mimalloc"] [dependencies] simlin-engine = { version = "0.1", path = "../simlin-engine", default-features = false, features = ["png_render"] } @@ -22,6 +28,7 @@ prost = "0.14" serde_json = "1.0" serde = { version = "1.0", features = ["derive"] } anyhow = "1.0" +mimalloc = { version = "0.1", optional = true } [dev-dependencies] diff --git a/src/libsimlin/src/lib.rs b/src/libsimlin/src/lib.rs index f9e45b1e1..11eaa6516 100644 --- a/src/libsimlin/src/lib.rs +++ b/src/libsimlin/src/lib.rs @@ -22,6 +22,17 @@ //! Shared types (enums, structs, helpers) live here in `lib.rs` and are //! imported by the modules via `crate::`. 
+// Native consumers of this cdylib/staticlib (pysimlin via cffi, C/C++ FFI) opt +// into mimalloc with the `mimalloc` feature: the engine compile path is +// allocation-heavy (millions of small, short-lived allocations) and mimalloc +// roughly halves allocator time vs the system malloc. Never enabled for the +// wasm32 bundle. See docs/design/engine-performance.md. This is the Rust global +// allocator and is independent of the `simlin_malloc`/`simlin_free` +// cross-boundary helpers in `memory`. +#[cfg(all(feature = "mimalloc", not(target_arch = "wasm32")))] +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + use anyhow::{Error as AnyError, Result}; use simlin_engine::{self as engine}; use std::collections::HashMap; diff --git a/src/pysimlin/Makefile b/src/pysimlin/Makefile index a23818997..311e02b74 100644 --- a/src/pysimlin/Makefile +++ b/src/pysimlin/Makefile @@ -1,8 +1,10 @@ .PHONY: all build test clean install dev lint type-check -# Build the libsimlin library +# Build the libsimlin library. mimalloc: the engine compile path is +# allocation-heavy and mimalloc roughly halves allocator time on native builds +# (see docs/design/engine-performance.md). build-lib: - cd ../libsimlin && cargo build --release + cd ../libsimlin && cargo build --release --features mimalloc # Build the Python package build: build-lib diff --git a/src/pysimlin/scripts/build_wheels.py b/src/pysimlin/scripts/build_wheels.py index 89e1f115a..83c0d2edd 100644 --- a/src/pysimlin/scripts/build_wheels.py +++ b/src/pysimlin/scripts/build_wheels.py @@ -37,9 +37,11 @@ def build_libsimlin() -> Path: project_root = Path(__file__).parent.parent.parent.parent libsimlin_dir = project_root / "libsimlin" - # Build the library + # Build the library. The mimalloc feature swaps in mimalloc as the global + # allocator: the engine compile path is allocation-heavy and mimalloc roughly + # halves allocator time on native builds (docs/design/engine-performance.md). 
subprocess.run( - ["cargo", "build", "--release"], + ["cargo", "build", "--release", "--features", "mimalloc"], cwd=libsimlin_dir, check=True ) diff --git a/src/simlin-cli/Cargo.toml b/src/simlin-cli/Cargo.toml index 22bfe7ed0..5216d9118 100644 --- a/src/simlin-cli/Cargo.toml +++ b/src/simlin-cli/Cargo.toml @@ -14,4 +14,8 @@ clap = { version = "4", features = ["derive"] } stringreader = "0.1" sha2 = "0.10" simlin-engine = { version = "0.1", path = "../simlin-engine", features = ["file_io"] } -simlin = { version = "0.1", path = "../libsimlin" } +# `mimalloc` installs mimalloc as the process global allocator (this binary runs +# the allocation-heavy engine compile path). Routed through libsimlin rather than +# a direct dep + local `#[global_allocator]` so there is exactly one global +# allocator in this artifact even under `cargo clippy --all-features`. +simlin = { version = "0.1", path = "../libsimlin", features = ["mimalloc"] } diff --git a/src/simlin-cli/src/main.rs b/src/simlin-cli/src/main.rs index 5eb52e924..de9f88260 100644 --- a/src/simlin-cli/src/main.rs +++ b/src/simlin-cli/src/main.rs @@ -2,6 +2,14 @@ // Use of this source code is governed by the Apache License, // Version 2.0, that can be found in the LICENSE file. +// mimalloc on native builds (the engine compile path is allocation-heavy; +// mimalloc roughly halves allocator time -- see docs/design/engine-performance.md) +// comes via the `simlin/mimalloc` feature on the libsimlin dependency, which +// installs the global allocator. We deliberately do NOT declare a second +// `#[global_allocator]` here: this binary links libsimlin, and two global +// allocators in one artifact is a compile error (notably under +// `cargo clippy --all-features`, which enables libsimlin's feature). 
+ use std::fs::File; use std::io::{BufRead, BufReader, Write}; use std::path::PathBuf; diff --git a/src/simlin-engine/examples/clearn_profile.rs b/src/simlin-engine/examples/clearn_profile.rs new file mode 100644 index 000000000..7cc9a774b --- /dev/null +++ b/src/simlin-engine/examples/clearn_profile.rs @@ -0,0 +1,264 @@ +// Copyright 2026 The Simlin Authors. All rights reserved. +// Use of this source code is governed by the Apache License, +// Version 2.0, that can be found in the LICENSE file. + +//! Standalone profiling harness for the C-LEARN hero model. +//! +//! Times each pipeline stage (parse, compile-via-salsa, VM construction, run) +//! and reports allocation counts / peak live bytes per stage via a counting +//! global allocator. Designed as a focused `perf record` / heaptrack target: +//! set `CLEARN_PROFILE=compile` or `CLEARN_PROFILE=run` and a high iteration +//! count to give an external sampler sustained signal on one stage. +//! +//! Usage: +//! cargo run --release -p simlin-engine --example clearn_profile +//! CLEARN_COMPILE_ITERS=20 CLEARN_PROFILE=compile \ +//! perf record -g -- target/release/examples/clearn_profile +//! CLEARN_RUN_ITERS=200 CLEARN_PROFILE=run \ +//! perf record -g -- target/release/examples/clearn_profile +//! +//! Environment: +//! CLEARN_MODEL override the .mdl path +//! CLEARN_COMPILE_ITERS extra compile-only iterations (default 0) +//! CLEARN_RUN_ITERS extra run-only iterations (default 0) +//! CLEARN_PROFILE "compile" | "run" | "both" (default both) -- which +//! 
extra-iteration loop(s) to execute + +use std::alloc::{GlobalAlloc, Layout, System as Backing}; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::time::Instant; + +use simlin_engine::db::{SimlinDb, compile_project_incremental, sync_from_datamodel_incremental}; +use simlin_engine::{CompiledSimulation, Vm, open_vensim}; + +// --- Counting allocator ----------------------------------------------------- +// +// Tracks cumulative allocation calls/bytes plus live bytes and a high-water +// mark. compile_project_incremental can fan out across rayon threads, so all +// counters are atomic and the peak is maintained with a CAS loop. The default +// GlobalAlloc::realloc routes through our alloc/dealloc, so realloc is counted +// without an explicit override. + +struct Counting; + +static COUNTING_ON: AtomicBool = AtomicBool::new(false); +static ALLOC_CALLS: AtomicUsize = AtomicUsize::new(0); +static ALLOC_BYTES: AtomicUsize = AtomicUsize::new(0); +static LIVE_BYTES: AtomicUsize = AtomicUsize::new(0); +static PEAK_BYTES: AtomicUsize = AtomicUsize::new(0); + +unsafe impl GlobalAlloc for Counting { + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + let p = unsafe { Backing.alloc(layout) }; + // Counting is gated so the default run measures true wall-clock without + // per-allocation atomic overhead. Enable with CLEARN_COUNT_ALLOCS=1 to + // get allocation counts (at the cost of distorted timing). 
+ if !p.is_null() && COUNTING_ON.load(Ordering::Relaxed) { + ALLOC_CALLS.fetch_add(1, Ordering::Relaxed); + ALLOC_BYTES.fetch_add(layout.size(), Ordering::Relaxed); + let live = LIVE_BYTES.fetch_add(layout.size(), Ordering::Relaxed) + layout.size(); + let mut peak = PEAK_BYTES.load(Ordering::Relaxed); + while live > peak { + match PEAK_BYTES.compare_exchange_weak( + peak, + live, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => break, + Err(observed) => peak = observed, + } + } + } + p + } + + unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { + unsafe { Backing.dealloc(ptr, layout) }; + if COUNTING_ON.load(Ordering::Relaxed) { + LIVE_BYTES.fetch_sub(layout.size(), Ordering::Relaxed); + } + } +} + +#[global_allocator] +static GLOBAL: Counting = Counting; + +#[derive(Clone, Copy)] +struct Snap { + calls: usize, + bytes: usize, + live: usize, +} + +fn snap() -> Snap { + Snap { + calls: ALLOC_CALLS.load(Ordering::Relaxed), + bytes: ALLOC_BYTES.load(Ordering::Relaxed), + live: LIVE_BYTES.load(Ordering::Relaxed), + } +} + +/// Reset the peak high-water mark to the current live bytes so the next phase's +/// peak is measured relative to its own starting point. +fn reset_peak() { + PEAK_BYTES.store(LIVE_BYTES.load(Ordering::Relaxed), Ordering::Relaxed); +} + +fn mib(bytes: usize) -> f64 { + bytes as f64 / (1024.0 * 1024.0) +} + +/// Run `f` as a measured phase: report wall time, allocation calls/bytes during +/// the phase, net retained (live) bytes, and peak live bytes reached. 
+fn phase(name: &str, f: impl FnOnce() -> T) -> T { + reset_peak(); + let before = snap(); + let t0 = Instant::now(); + let out = f(); + let elapsed = t0.elapsed(); + let after = snap(); + let peak = PEAK_BYTES.load(Ordering::Relaxed); + + let calls = after.calls - before.calls; + let bytes = after.bytes - before.bytes; + let retained = after.live as i64 - before.live as i64; + + println!( + "{name:<22} {:>9.2} ms | allocs {:>10} | alloc'd {:>9.1} MiB | retained {:>+8.1} MiB | peak {:>8.1} MiB", + elapsed.as_secs_f64() * 1000.0, + calls, + mib(bytes), + retained as f64 / (1024.0 * 1024.0), + mib(peak), + ); + out +} + +fn model_path() -> String { + if let Ok(p) = std::env::var("CLEARN_MODEL") { + return p; + } + format!( + "{}/../../test/xmutil_test_models/C-LEARN v77 for Vensim.mdl", + env!("CARGO_MANIFEST_DIR") + ) +} + +fn compile_once(datamodel: &simlin_engine::datamodel::Project) -> CompiledSimulation { + let mut db = SimlinDb::default(); + let sync = sync_from_datamodel_incremental(&mut db, datamodel, None); + compile_project_incremental(&db, sync.project, "main").unwrap() +} + +fn env_usize(key: &str, default: usize) -> usize { + std::env::var(key) + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(default) +} + +fn main() { + let path = model_path(); + let compile_iters = env_usize("CLEARN_COMPILE_ITERS", 0); + let run_iters = env_usize("CLEARN_RUN_ITERS", 0); + let which = std::env::var("CLEARN_PROFILE").unwrap_or_else(|_| "both".to_string()); + if std::env::var("CLEARN_COUNT_ALLOCS").is_ok_and(|v| v != "0") { + COUNTING_ON.store(true, Ordering::Relaxed); + } + + println!("model: {path}"); + + let contents = phase("read_file", || { + std::fs::read_to_string(&path).unwrap_or_else(|e| panic!("failed to read {path}: {e}")) + }); + println!( + " source: {} bytes, {} lines", + contents.len(), + contents.lines().count() + ); + + let datamodel = phase("parse (open_vensim)", || open_vensim(&contents).unwrap()); + let n_models = datamodel.models.len(); + let 
n_vars: usize = datamodel.models.iter().map(|m| m.variables.len()).sum(); + println!( + " models: {n_models}, datamodel variables: {n_vars}, dims: {}", + datamodel.dimensions.len() + ); + + let compiled = phase("compile (salsa)", || compile_once(&datamodel)); + println!(" n_slots (root): {}", compiled.n_slots()); + + let prof = compiled.bytecode_profile(); + println!( + " bytecode: {} opcodes ({:.1} KiB @ 8B) = {} flow + {} stock + {} initial ({} initials)", + prof.total_opcodes, + (prof.total_opcodes * 8) as f64 / 1024.0, + prof.flow_opcodes, + prof.stock_opcodes, + prof.initial_opcodes, + prof.n_initials, + ); + println!( + " flow opcodes after 3-address fusion (est): {} -> {} ({:.1}% reduction)", + prof.flow_opcodes, + prof.flow_opcodes_after_fusion, + 100.0 * (prof.flow_opcodes - prof.flow_opcodes_after_fusion) as f64 + / prof.flow_opcodes as f64, + ); + println!( + " tables: {} literals, {} GFs / {} points, {} temp slots, {} dims, {} static_views, {} dim_lists, {} names, {} modules", + prof.total_literals, + prof.graphical_functions, + prof.graphical_function_points, + prof.temp_storage_slots, + prof.dimensions, + prof.static_views, + prof.dim_lists, + prof.names, + prof.n_modules, + ); + let mut hist: Vec<_> = prof.histogram.iter().collect(); + hist.sort_by(|a, b| b.1.cmp(a.1)); + println!(" opcode histogram (top 25 of {}):", prof.histogram.len()); + for (name, count) in hist.iter().take(25) { + let pct = **count as f64 / prof.total_opcodes as f64 * 100.0; + println!(" {name:<22} {count:>9} {pct:>5.1}%"); + } + + let mut vm = phase("Vm::new", || Vm::new(compiled.clone()).unwrap()); + println!(" variables (offsets): {}", vm.names_as_strs().len()); + + phase("run_to_end", || vm.run_to_end().unwrap()); + let results = vm.into_results(); + println!( + " result slots/step: {}, saved steps: {}", + results.step_size, results.step_count + ); + + // Extra-iteration loops for external samplers (perf/heaptrack). 
Kept out of + // the per-phase accounting above; these print only aggregate timing. + let do_compile = which == "both" || which == "compile"; + let do_run = which == "both" || which == "run"; + + if compile_iters > 0 && do_compile { + let t0 = Instant::now(); + for _ in 0..compile_iters { + std::hint::black_box(compile_once(&datamodel)); + } + let per = t0.elapsed().as_secs_f64() * 1000.0 / compile_iters as f64; + println!("compile x{compile_iters}: {per:.2} ms/iter"); + } + + if run_iters > 0 && do_run { + let compiled = compile_once(&datamodel); + let t0 = Instant::now(); + for _ in 0..run_iters { + let mut vm = Vm::new(compiled.clone()).unwrap(); + vm.run_to_end().unwrap(); + std::hint::black_box(&vm); + } + let per = t0.elapsed().as_secs_f64() * 1000.0 / run_iters as f64; + println!("run x{run_iters}: {per:.2} ms/iter (incl. Vm::new + clone)"); + } +} diff --git a/src/simlin-engine/src/ast/expr0.rs b/src/simlin-engine/src/ast/expr0.rs index 901469fa7..96c2cc295 100644 --- a/src/simlin-engine/src/ast/expr0.rs +++ b/src/simlin-engine/src/ast/expr0.rs @@ -2,7 +2,7 @@ // Use of this source code is governed by the Apache License, // Version 2.0, that can be found in the LICENSE file. 
-use crate::builtins::{Loc, UntypedBuiltinFn, is_0_arity_builtin_fn}; +use crate::builtins::{Loc, UntypedBuiltinFn, is_0_arity_builtin_fn_ci}; use crate::common::{EquationError, RawIdent}; use crate::lexer::LexerType; use std::result::Result as StdResult; @@ -180,9 +180,13 @@ impl Expr0 { fn reify_0_arity_builtins(self) -> Self { match self { Expr0::Var(ref id, loc) => { - // Check for 0-arity builtins using lowercase version - let lowercase_id = id.as_str().to_lowercase(); - if is_0_arity_builtin_fn(&lowercase_id) { + // Allocation-free membership test first: the vast majority of + // variable references are not 0-arity builtins, so we avoid the + // per-reference to_lowercase() heap allocation on the hot parse + // path and only materialize the lowercased name in the rare case + // a genuine `pi`/`time`/etc. reference must be reified. + if is_0_arity_builtin_fn_ci(id.as_str()) { + let lowercase_id = id.as_str().to_lowercase(); Expr0::App(UntypedBuiltinFn(lowercase_id, vec![]), loc) } else { self diff --git a/src/simlin-engine/src/builtins.rs b/src/simlin-engine/src/builtins.rs index 3edcd630f..4b5e4e4bc 100644 --- a/src/simlin-engine/src/builtins.rs +++ b/src/simlin-engine/src/builtins.rs @@ -366,6 +366,30 @@ pub fn is_0_arity_builtin_fn(name: &str) -> bool { ) } +/// ASCII case-insensitive, allocation-free variant of [`is_0_arity_builtin_fn`]. +/// +/// The 0-arity builtin names are all ASCII, so a name containing any non-ASCII +/// byte cannot match, and ASCII case-folding yields the same membership verdict +/// as Unicode lowercasing for this fixed ASCII set. Used on the hot parse path +/// (`Expr0::reify_0_arity_builtins`), which previously allocated a `String` via +/// `to_lowercase()` for *every* variable reference just to test membership. 
+pub fn is_0_arity_builtin_fn_ci(name: &str) -> bool { + const NAMES: [&str; 9] = [ + "inf", + "pi", + "time", + "time_step", + "dt", + "initial_time", + "starttime", + "final_time", + "stoptime", + ]; + NAMES + .iter() + .any(|candidate| name.eq_ignore_ascii_case(candidate)) +} + /// Returns true if `func_name` (already lowercased) names a function that /// expands to a stdlib module: the canonical names in `MODEL_NAMES` plus /// the alias forms `delay`, `delayn`, and `smthn`. @@ -546,6 +570,54 @@ fn test_is_0_arity_builtin_fn() { assert!(is_0_arity_builtin_fn("time")); } +#[test] +fn test_is_0_arity_builtin_fn_ci() { + const NAMES: [&str; 9] = [ + "inf", + "pi", + "time", + "time_step", + "dt", + "initial_time", + "starttime", + "final_time", + "stoptime", + ]; + for name in NAMES { + assert!(is_0_arity_builtin_fn_ci(name), "lowercase {name}"); + assert!( + is_0_arity_builtin_fn_ci(&name.to_uppercase()), + "uppercase {name}" + ); + } + assert!(is_0_arity_builtin_fn_ci("Time")); + assert!(is_0_arity_builtin_fn_ci("Final_Time")); + assert!(!is_0_arity_builtin_fn_ci("lookup")); + assert!(!is_0_arity_builtin_fn_ci("times")); + assert!(!is_0_arity_builtin_fn_ci("")); + // A non-ASCII name can never match (every builtin name is ASCII). + assert!(!is_0_arity_builtin_fn_ci("pï")); + + // Equivalent to to_lowercase() + is_0_arity_builtin_fn for any ASCII input, + // which is the behavior the hot-path caller relies on. 
+ for s in [ + "TIME", + "Pi", + "Dt", + "Final_Time", + "STOPTIME", + "foo", + "lookuptable", + "timestep", + ] { + assert_eq!( + is_0_arity_builtin_fn_ci(s), + is_0_arity_builtin_fn(&s.to_lowercase()), + "ci/lowercase mismatch for {s}" + ); + } +} + #[test] fn test_name() { enum TestExpr {} diff --git a/src/simlin-engine/src/bytecode.rs b/src/simlin-engine/src/bytecode.rs index 77bf10ea9..71d157041 100644 --- a/src/simlin-engine/src/bytecode.rs +++ b/src/simlin-engine/src/bytecode.rs @@ -645,6 +645,44 @@ pub(crate) enum Opcode { off: VariableOffset, }, + // === 3-ADDRESS BINARY OPS (R2) === + // Fold the leaf operand load(s) of a binary op into the op itself, so a + // subexpression `a op b` dispatches once instead of 3 (two loads + Op2) or + // once instead of 2 (one load + Op2). Each pushes its result. `curr[]` is + // effectively the register file: these read operands straight from it (or + // from `literals`) with no intervening stack push/pop. Created only by the + // late `fuse_three_address` pass on final concrete bytecode -- they never + // enter the symbolic/incremental layer. A 3-operand `dst = a op b` would + // exceed the 8-byte Opcode budget, so the assign stays a separate op. + /// Push `curr[module_off + l] op curr[module_off + r]`. + BinVarVar { + l: VariableOffset, + r: VariableOffset, + op: Op2, + }, + /// Push `curr[module_off + l] op literals[r]`. + BinVarConst { + l: VariableOffset, + r: LiteralId, + op: Op2, + }, + /// Push `literals[l] op curr[module_off + r]`. + BinConstVar { + l: LiteralId, + r: VariableOffset, + op: Op2, + }, + /// Pop `lhs`; push `lhs op curr[module_off + r]`. + BinStackVar { + r: VariableOffset, + op: Op2, + }, + /// Pop `lhs`; push `lhs op literals[r]`. 
+ BinStackConst { + r: LiteralId, + op: Op2, + }, + // ========================================================================= // ARRAY SUPPORT (new) // ========================================================================= @@ -979,6 +1017,14 @@ impl Opcode { Opcode::BinOpAssignCurr { .. } => (2, 0), // pops 2, assigns directly Opcode::BinOpAssignNext { .. } => (2, 0), // pops 2, assigns directly + // 3-address binops: the *Var/*Const forms read both operands from + // curr/literals and push (0 pops, 1 push); the Stack* forms pop the + // lhs and push the result (1 pop, 1 push). + Opcode::BinVarVar { .. } | Opcode::BinVarConst { .. } | Opcode::BinConstVar { .. } => { + (0, 1) + } + Opcode::BinStackVar { .. } | Opcode::BinStackConst { .. } => (1, 1), + // View stack ops don't touch arithmetic stack Opcode::PushVarView { .. } | Opcode::PushTempView { .. } @@ -1042,6 +1088,81 @@ impl Opcode { Opcode::NextBroadcastOrJump { .. } => (0, 0), } } + + /// Static variant name, independent of payload. Used for bytecode-composition + /// profiling (opcode histograms) and human-readable diagnostics without + /// depending on the optional `debug-derive` Debug impl. + pub(crate) fn name(&self) -> &'static str { + match self { + Opcode::Op2 { .. } => "Op2", + Opcode::Not {} => "Not", + Opcode::LoadConstant { .. } => "LoadConstant", + Opcode::LoadVar { .. } => "LoadVar", + Opcode::LoadGlobalVar { .. } => "LoadGlobalVar", + Opcode::LoadPrev { .. } => "LoadPrev", + Opcode::LoadInitial { .. } => "LoadInitial", + Opcode::PushSubscriptIndex { .. } => "PushSubscriptIndex", + Opcode::LoadSubscript { .. } => "LoadSubscript", + Opcode::SetCond {} => "SetCond", + Opcode::If {} => "If", + Opcode::Ret => "Ret", + Opcode::LoadModuleInput { .. } => "LoadModuleInput", + Opcode::EvalModule { .. } => "EvalModule", + Opcode::AssignCurr { .. } => "AssignCurr", + Opcode::AssignNext { .. } => "AssignNext", + Opcode::Apply { .. } => "Apply", + Opcode::Lookup { .. 
} => "Lookup", + Opcode::AssignConstCurr { .. } => "AssignConstCurr", + Opcode::BinVarVar { .. } => "BinVarVar", + Opcode::BinVarConst { .. } => "BinVarConst", + Opcode::BinConstVar { .. } => "BinConstVar", + Opcode::BinStackVar { .. } => "BinStackVar", + Opcode::BinStackConst { .. } => "BinStackConst", + Opcode::BinOpAssignCurr { .. } => "BinOpAssignCurr", + Opcode::BinOpAssignNext { .. } => "BinOpAssignNext", + Opcode::PushVarView { .. } => "PushVarView", + Opcode::PushTempView { .. } => "PushTempView", + Opcode::PushStaticView { .. } => "PushStaticView", + Opcode::PushVarViewDirect { .. } => "PushVarViewDirect", + Opcode::ViewSubscriptConst { .. } => "ViewSubscriptConst", + Opcode::ViewSubscriptDynamic { .. } => "ViewSubscriptDynamic", + Opcode::ViewRange { .. } => "ViewRange", + Opcode::ViewRangeDynamic { .. } => "ViewRangeDynamic", + Opcode::ViewStarRange { .. } => "ViewStarRange", + Opcode::ViewWildcard { .. } => "ViewWildcard", + Opcode::ViewTranspose {} => "ViewTranspose", + Opcode::PopView {} => "PopView", + Opcode::DupView {} => "DupView", + Opcode::LoadTempConst { .. } => "LoadTempConst", + Opcode::LoadTempDynamic { .. } => "LoadTempDynamic", + Opcode::BeginIter { .. } => "BeginIter", + Opcode::LoadIterElement {} => "LoadIterElement", + Opcode::LoadIterTempElement { .. } => "LoadIterTempElement", + Opcode::LoadIterViewTop {} => "LoadIterViewTop", + Opcode::LoadIterViewAt { .. } => "LoadIterViewAt", + Opcode::StoreIterElement {} => "StoreIterElement", + Opcode::NextIterOrJump { .. } => "NextIterOrJump", + Opcode::EndIter {} => "EndIter", + Opcode::ArraySum {} => "ArraySum", + Opcode::ArrayMax {} => "ArrayMax", + Opcode::ArrayMin {} => "ArrayMin", + Opcode::ArrayMean {} => "ArrayMean", + Opcode::ArrayStddev {} => "ArrayStddev", + Opcode::ArraySize {} => "ArraySize", + Opcode::VectorSelect {} => "VectorSelect", + Opcode::VectorElmMap { .. } => "VectorElmMap", + Opcode::VectorSortOrder { .. } => "VectorSortOrder", + Opcode::Rank { .. 
} => "Rank", + Opcode::LookupArray { .. } => "LookupArray", + Opcode::AllocateAvailable { .. } => "AllocateAvailable", + Opcode::AllocateByPriority { .. } => "AllocateByPriority", + Opcode::BeginBroadcastIter { .. } => "BeginBroadcastIter", + Opcode::LoadBroadcastElement { .. } => "LoadBroadcastElement", + Opcode::StoreBroadcastElement {} => "StoreBroadcastElement", + Opcode::NextBroadcastOrJump { .. } => "NextBroadcastOrJump", + Opcode::EndBroadcastIter {} => "EndBroadcastIter", + } + } } // ============================================================================ @@ -1256,6 +1377,45 @@ impl ByteCode { } max_depth } + + /// Estimate the opcode count after a 3-address fusion pass that folds leaf + /// operand loads into the binary op that consumes them (R2). Greedy, + /// post-peephole semantics: `LoadX; LoadY; Op2` fuses 3->1 (both operands + /// are leaf loads) and `LoadX; Op2` fuses 2->1 (one operand is a leaf load, + /// the other is already on the stack), where a leaf load is `LoadVar`, + /// `LoadGlobalVar`, or `LoadConstant`. Used only to size the win before + /// implementing the pass; the real pass must additionally fix up jumps. + pub(crate) fn estimate_fused_len(&self) -> usize { + fn is_leaf_load(op: &Opcode) -> bool { + matches!( + op, + Opcode::LoadVar { .. } | Opcode::LoadGlobalVar { .. } | Opcode::LoadConstant { .. } + ) + } + let code = &self.code; + let mut i = 0; + let mut emitted = 0; + while i < code.len() { + if i + 2 < code.len() + && is_leaf_load(&code[i]) + && is_leaf_load(&code[i + 1]) + && matches!(code[i + 2], Opcode::Op2 { .. }) + { + emitted += 1; + i += 3; + } else if i + 1 < code.len() + && is_leaf_load(&code[i]) + && matches!(code[i + 1], Opcode::Op2 { .. 
}) + { + emitted += 1; + i += 2; + } else { + emitted += 1; + i += 1; + } + } + emitted + } } #[cfg_attr(feature = "debug-derive", derive(Debug))] @@ -1403,6 +1563,145 @@ impl ByteCode { self.code = optimized; } + + /// Late 3-address fusion pass (R2): fold the leaf operand load(s) of a + /// binary op into the op itself, so `LoadX; LoadY; Op2` becomes one + /// `BinXY` (3->1) and `LoadX; Op2` (lhs already on the stack) becomes one + /// `BinStackX` (2->1). + /// + /// MUST run only on FINAL concrete bytecode -- after `peephole_optimize` + /// and, for the incremental path, after `resolve` -- because the fused + /// opcodes deliberately do not exist in the symbolic/incremental layer. + /// Greedy, longest-match-first. Reuses the same jump-target guard and + /// old->new PC remap as `peephole_optimize`: a run is only fused when the + /// instructions it *absorbs* (the second, and for a triple the third) are + /// not jump targets, so no jump can land mid-fusion; a jump to the first + /// instruction still lands on the fused opcode at the same new PC. + /// Stack-effect-preserving (the fused ops carry the net effect of the + /// sequence they replace), so the `max_stack_depth` safety proof is + /// unchanged. + pub(crate) fn fuse_three_address(&mut self) { + if self.code.is_empty() { + return; + } + + // 1. Build set of PCs that are jump targets. + let mut jump_targets = vec![false; self.code.len()]; + for (pc, op) in self.code.iter().enumerate() { + if let Some(offset) = op.jump_offset() { + let target = (pc as isize + offset as isize) as usize; + assert!( + target < jump_targets.len(), + "jump at pc {pc} targets {target}, out of bounds (len {})", + self.code.len() + ); + jump_targets[target] = true; + } + } + + // 2. Greedy fuse, building an old_pc -> new_pc map (one entry per + // original instruction) for jump fixup. 
+ let mut optimized: Vec = Vec::with_capacity(self.code.len()); + let mut pc_map: Vec = Vec::with_capacity(self.code.len() + 1); + let mut i = 0; + while i < self.code.len() { + let new_pc = optimized.len(); + + // 3-window: [leaf load, leaf load, Op2]. Both absorbed instructions + // (i+1, i+2) must not be jump targets. + let three = i + 2 < self.code.len() + && !jump_targets[i + 1] + && !jump_targets[i + 2] + && matches!(self.code[i + 2], Opcode::Op2 { .. }); + let fused3 = if three { + match (&self.code[i], &self.code[i + 1], &self.code[i + 2]) { + ( + Opcode::LoadVar { off: l }, + Opcode::LoadVar { off: r }, + Opcode::Op2 { op }, + ) => Some(Opcode::BinVarVar { + l: *l, + r: *r, + op: *op, + }), + ( + Opcode::LoadVar { off: l }, + Opcode::LoadConstant { id: r }, + Opcode::Op2 { op }, + ) => Some(Opcode::BinVarConst { + l: *l, + r: *r, + op: *op, + }), + ( + Opcode::LoadConstant { id: l }, + Opcode::LoadVar { off: r }, + Opcode::Op2 { op }, + ) => Some(Opcode::BinConstVar { + l: *l, + r: *r, + op: *op, + }), + _ => None, + } + } else { + None + }; + if let Some(op) = fused3 { + optimized.push(op); + pc_map.push(new_pc); // old i + pc_map.push(new_pc); // old i+1 + pc_map.push(new_pc); // old i+2 + i += 3; + continue; + } + + // 2-window: [leaf load, Op2] with the lhs already on the stack. + let two = i + 1 < self.code.len() + && !jump_targets[i + 1] + && matches!(self.code[i + 1], Opcode::Op2 { .. }); + let fused2 = if two { + match (&self.code[i], &self.code[i + 1]) { + (Opcode::LoadVar { off: r }, Opcode::Op2 { op }) => { + Some(Opcode::BinStackVar { r: *r, op: *op }) + } + (Opcode::LoadConstant { id: r }, Opcode::Op2 { op }) => { + Some(Opcode::BinStackConst { r: *r, op: *op }) + } + _ => None, + } + } else { + None + }; + if let Some(op) = fused2 { + optimized.push(op); + pc_map.push(new_pc); // old i + pc_map.push(new_pc); // old i+1 + i += 2; + continue; + } + + // No fusion: copy as-is. 
+ pc_map.push(new_pc); + optimized.push(self.code[i]); + i += 1; + } + pc_map.push(optimized.len()); + + // 3. Fix up jump offsets via the old_pc -> new_pc map. + for (old_pc, op) in self.code.iter().enumerate() { + let Some(jump_back) = op.jump_offset() else { + continue; + }; + let new_pc = pc_map[old_pc]; + let old_target = (old_pc as isize + jump_back as isize) as usize; + let new_target = pc_map[old_target]; + let new_jump_back = (new_target as isize - new_pc as isize) as PcOffset; + *optimized[new_pc].jump_offset_mut().unwrap() = new_jump_back; + } + + self.code = optimized; + } } #[cfg(test)] @@ -3099,6 +3398,224 @@ mod tests { assert_eq!(ids[0], i); } } + + // === 3-address fusion (R2) === + + #[test] + fn test_fuse_var_var() { + // a + b -> BinVarVar; the trailing assign is left to the existing + // BinOpAssignCurr fusion (a 3-operand op would exceed the 8-byte budget). + let mut bc = ByteCode { + literals: vec![], + code: vec![ + Opcode::LoadVar { off: 0 }, + Opcode::LoadVar { off: 1 }, + Opcode::Op2 { op: Op2::Add }, + Opcode::AssignCurr { off: 2 }, + ], + }; + bc.fuse_three_address(); + assert_eq!(bc.code.len(), 2); + assert!(matches!( + bc.code[0], + Opcode::BinVarVar { + l: 0, + r: 1, + op: Op2::Add + } + )); + assert!(matches!(bc.code[1], Opcode::AssignCurr { off: 2 })); + } + + #[test] + fn test_fuse_var_const_preserves_operand_order() { + // `a - 5`: the var is the lhs, the const the rhs. Sub is non-commutative, + // so a swapped encoding would be a silent miscompile. + let mut bc = ByteCode { + literals: vec![5.0], + code: vec![ + Opcode::LoadVar { off: 7 }, + Opcode::LoadConstant { id: 0 }, + Opcode::Op2 { op: Op2::Sub }, + ], + }; + bc.fuse_three_address(); + assert_eq!(bc.code.len(), 1); + assert!(matches!( + bc.code[0], + Opcode::BinVarConst { + l: 7, + r: 0, + op: Op2::Sub + } + )); + } + + #[test] + fn test_fuse_const_var_preserves_operand_order() { + // `5 - a`: the const is the lhs, the var the rhs. 
+ let mut bc = ByteCode { + literals: vec![5.0], + code: vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::LoadVar { off: 7 }, + Opcode::Op2 { op: Op2::Sub }, + ], + }; + bc.fuse_three_address(); + assert_eq!(bc.code.len(), 1); + assert!(matches!( + bc.code[0], + Opcode::BinConstVar { + l: 0, + r: 7, + op: Op2::Sub + } + )); + } + + #[test] + fn test_fuse_greedy_triple_then_stack_var() { + // ((a + b) + c): the leaf triple fuses to BinVarVar (greedy prefers the + // 3-window), then the outer `+ c` -- whose lhs is on the stack -- fuses + // the load of c into a BinStackVar. + let mut bc = ByteCode { + literals: vec![], + code: vec![ + Opcode::LoadVar { off: 0 }, + Opcode::LoadVar { off: 1 }, + Opcode::Op2 { op: Op2::Add }, + Opcode::LoadVar { off: 2 }, + Opcode::Op2 { op: Op2::Add }, + ], + }; + bc.fuse_three_address(); + assert_eq!(bc.code.len(), 2); + assert!(matches!( + bc.code[0], + Opcode::BinVarVar { + l: 0, + r: 1, + op: Op2::Add + } + )); + assert!(matches!( + bc.code[1], + Opcode::BinStackVar { r: 2, op: Op2::Add } + )); + } + + #[test] + fn test_fuse_stack_const() { + // (a + b) * 2: leaf triple -> BinVarVar; the outer `* 2` (lhs on stack) + // -> BinStackConst. + let mut bc = ByteCode { + literals: vec![2.0], + code: vec![ + Opcode::LoadVar { off: 0 }, + Opcode::LoadVar { off: 1 }, + Opcode::Op2 { op: Op2::Add }, + Opcode::LoadConstant { id: 0 }, + Opcode::Op2 { op: Op2::Mul }, + ], + }; + bc.fuse_three_address(); + assert_eq!(bc.code.len(), 2); + assert!(matches!(bc.code[0], Opcode::BinVarVar { .. })); + assert!(matches!( + bc.code[1], + Opcode::BinStackConst { r: 0, op: Op2::Mul } + )); + } + + #[test] + fn test_fuse_noop_without_op2_and_empty() { + // Nothing to fold into (no Op2): unchanged. 
+ let mut bc = ByteCode { + literals: vec![1.0], + code: vec![ + Opcode::LoadConstant { id: 0 }, + Opcode::AssignCurr { off: 0 }, + ], + }; + bc.fuse_three_address(); + assert_eq!(bc.code.len(), 2); + assert!(matches!(bc.code[0], Opcode::LoadConstant { id: 0 })); + + let mut empty = ByteCode::default(); + empty.fuse_three_address(); + assert!(empty.code.is_empty()); + } + + #[test] + fn test_fuse_preserves_max_stack_depth() { + // x = (a + b) * (c + d): peak depth 3. Fusion folds loads into ops, so + // depth can only stay the same or shrink -- never grow (the Stack-safety + // proof must survive fusion). + let mut bc = ByteCode { + literals: vec![], + code: vec![ + Opcode::LoadVar { off: 0 }, + Opcode::LoadVar { off: 1 }, + Opcode::Op2 { op: Op2::Add }, + Opcode::LoadVar { off: 2 }, + Opcode::LoadVar { off: 3 }, + Opcode::Op2 { op: Op2::Add }, + Opcode::Op2 { op: Op2::Mul }, + Opcode::AssignCurr { off: 4 }, + ], + }; + let before = bc.max_stack_depth(); + bc.fuse_three_address(); + assert!(bc.max_stack_depth() <= before); + } + + #[test] + fn test_fuse_triple_with_jump_target_at_first_instruction() { + // A backward jump targets the first instruction of a fusable triple. The + // triple still fuses (the fused op replaces the first instruction at the + // same PC) and the jump offset is rewritten to land on it. + let mut bc = ByteCode { + literals: vec![], + code: vec![ + Opcode::LoadVar { off: 0 }, // [0] <- jump target + Opcode::LoadVar { off: 1 }, // [1] + Opcode::Op2 { op: Op2::Add }, // [2] + Opcode::NextIterOrJump { jump_back: -3 }, // [3] -> [0] + ], + }; + bc.fuse_three_address(); + assert_eq!(bc.code.len(), 2); + assert!(matches!(bc.code[0], Opcode::BinVarVar { .. })); + assert!(matches!( + bc.code[1], + Opcode::NextIterOrJump { jump_back: -1 } + )); + } + + #[test] + fn test_fuse_blocked_when_absorbed_instruction_is_jump_target() { + // A jump targets the Op2 (the instruction a triple would absorb). 
Fusing + // would make the jump land mid-fusion, so the pass must leave it alone. + let mut bc = ByteCode { + literals: vec![], + code: vec![ + Opcode::LoadVar { off: 0 }, // [0] + Opcode::LoadVar { off: 1 }, // [1] + Opcode::Op2 { op: Op2::Add }, // [2] <- jump target + Opcode::NextIterOrJump { jump_back: -1 }, // [3] -> [2] + ], + }; + bc.fuse_three_address(); + assert_eq!(bc.code.len(), 4); + assert!(matches!(bc.code[0], Opcode::LoadVar { off: 0 })); + assert!(matches!(bc.code[1], Opcode::LoadVar { off: 1 })); + assert!(matches!(bc.code[2], Opcode::Op2 { op: Op2::Add })); + assert!(matches!( + bc.code[3], + Opcode::NextIterOrJump { jump_back: -1 } + )); + } } /// A single variable's compiled initial-value bytecode, along with the diff --git a/src/simlin-engine/src/compiler/symbolic.rs b/src/simlin-engine/src/compiler/symbolic.rs index a2a9c6edf..66361f621 100644 --- a/src/simlin-engine/src/compiler/symbolic.rs +++ b/src/simlin-engine/src/compiler/symbolic.rs @@ -518,6 +518,17 @@ pub(crate) fn symbolize_opcode( op: *op, var: rmap.lookup(u32::from(*off))?, }), + // The 3-address fused binops are created by `ByteCode::fuse_three_address`, + // which runs only on FINAL concrete bytecode (after `resolve`), strictly + // after symbolization. They therefore never reach this function; seeing + // one means the fusion ran before symbolize, which is a compiler bug. + Opcode::BinVarVar { .. } + | Opcode::BinVarConst { .. } + | Opcode::BinConstVar { .. } + | Opcode::BinStackVar { .. } + | Opcode::BinStackConst { .. 
} => { + unreachable!("3-address fused binop reached symbolize_opcode") + } Opcode::PushVarView { base_off, dim_list_id, @@ -1168,6 +1179,10 @@ pub(crate) fn resolve_module( }) .collect::, String>>()?; + // `resolve_module` is a pure symbolic<->concrete primitive (the roundtrip + // tests symbolize its output again), so the 3-address fusion (R2) is NOT + // applied here -- it is applied later, at `Vm::new`, to the execution copy + // of the bytecode, where the result is never re-symbolized. let compiled_flows = resolve_bytecode(&sym.compiled_flows, layout)?; let compiled_stocks = resolve_bytecode(&sym.compiled_stocks, layout)?; diff --git a/src/simlin-engine/src/db.rs b/src/simlin-engine/src/db.rs index af019760a..74eef73da 100644 --- a/src/simlin-engine/src/db.rs +++ b/src/simlin-engine/src/db.rs @@ -3734,8 +3734,11 @@ pub fn compile_var_fragment( let var_ident_canonical: Ident = Ident::new(&var_ident); // Caller-owned, lowering-independent context (built only from - // project/variable data, never from the lowered equation). - let dm_dims = source_dims_to_datamodel(project.dimensions(db)); + // project/variable data, never from the lowered equation). Use the + // salsa-cached project dims (returns(ref)) rather than re-running + // source_dims_to_datamodel on every variable -- this fragment compiler is + // invoked once per variable, and the datamodel dims are project-global. + let dm_dims = project_datamodel_dims(db, project); let dim_context = crate::dimensions::DimensionsContext::from(dm_dims.as_slice()); let converted_dims: Vec = dm_dims .iter() @@ -5337,7 +5340,10 @@ pub fn assemble_module( dim_lists: merged.dim_lists, }; - // Resolve symbolic -> concrete offsets + // Resolve symbolic -> concrete offsets. The CompiledModule stays a pure, + // symbolizable artifact (the symbolic roundtrip tests symbolize it again, + // and salsa caches it); the 3-address fusion (R2) is applied later, at + // Vm::new, to the execution copy of the bytecode. 
resolve_module(&sym_module, layout).inspect_err(|msg| { try_accumulate_diagnostic( db, diff --git a/src/simlin-engine/src/lib.rs b/src/simlin-engine/src/lib.rs index 571d80c29..25478b323 100644 --- a/src/simlin-engine/src/lib.rs +++ b/src/simlin-engine/src/lib.rs @@ -108,6 +108,9 @@ mod units_infer; mod variable; pub mod vdf; mod vm; +// Bytecode-composition profiling for CompiledSimulation; a diagnostics-only +// sibling of `vm` kept separate purely for the per-file line cap. +mod vm_profile; mod vm_vector_elm_map; mod vm_vector_sort_order; pub mod xmile; @@ -123,6 +126,7 @@ pub use self::variable::{ DepClassification, Variable, classify_dependencies, identifier_set, previous_referenced_idents, }; pub use self::vm::{CompiledSimulation, Vm}; +pub use self::vm_profile::BytecodeProfile; // Re-export compat functions at the crate root for convenience #[cfg(feature = "xmutil")] diff --git a/src/simlin-engine/src/vm.rs b/src/simlin-engine/src/vm.rs index e7d29cb16..58bb75386 100644 --- a/src/simlin-engine/src/vm.rs +++ b/src/simlin-engine/src/vm.rs @@ -9,8 +9,8 @@ use smallvec::SmallVec; use crate::alloc::allocate_available; use crate::bytecode::{ - BuiltinId, ByteCode, ByteCodeContext, CompiledInitial, CompiledModule, DimId, LookupMode, - ModuleId, Op2, Opcode, RuntimeView, STACK_CAPACITY, TempId, + BuiltinId, ByteCode, ByteCodeContext, CompiledInitial, CompiledModule, DimId, LookupMode, Op2, + Opcode, RuntimeView, STACK_CAPACITY, TempId, }; use crate::common::{Canonical, Error, ErrorCode, ErrorKind, Ident, Result}; use crate::dimensions::match_dimensions_two_pass; @@ -169,22 +169,38 @@ impl CompiledSimulation { } } -/// Per-module compiled initials with the shared ByteCodeContext needed to eval them. +/// One unique compiled module (a distinct `(model_name, input_set)`), holding +/// its three phase programs plus the resolved child-module indices for its +/// `EvalModule` opcodes. 
+/// +/// `child_targets[decl_id]` is the index into `CompiledSlicedSimulation.modules` +/// of the module that `context.modules[decl_id]` instantiates. Resolving these +/// once at `Vm::new` lets the `EvalModule` opcode do a plain array index in the +/// hot loop instead of cloning a `(String, BTreeSet)` key and SipHashing +/// it for a `HashMap` lookup on every module evaluation, every timestep. #[cfg_attr(feature = "debug-derive", derive(Debug))] #[derive(Clone)] -struct CompiledModuleInitials { +struct ResolvedModule { #[allow(dead_code)] ident: Ident, context: Arc, initials: Arc>, + flows: Arc, + stocks: Arc, + child_targets: Vec, } #[cfg_attr(feature = "debug-derive", derive(Debug))] #[derive(Clone)] struct CompiledSlicedSimulation { - initial_modules: HashMap, - flow_modules: HashMap, - stock_modules: HashMap, + /// All unique compiled modules, indexed by the integer ids stored in + /// `child_targets` (and `root_idx`). + modules: Vec, + root_idx: usize, + /// `ModuleKey` -> module index. Used only by the cold `set_value` / + /// `clear_values` literal-override paths (which still address modules by + /// key via `BytecodeLocation`); never consulted in the hot eval loop. 
+ key_to_idx: HashMap, } #[cfg_attr(feature = "debug-derive", derive(Debug))] @@ -209,7 +225,6 @@ fn borrow_two(buf: &mut [f64], n_slots: usize, a: usize, b: usize) -> (&mut [f64 #[derive(Clone)] pub struct Vm { specs: Specs, - root: ModuleKey, offsets: HashMap, usize>, sliced_sim: CompiledSlicedSimulation, n_slots: usize, @@ -338,27 +353,65 @@ struct EvalState<'a> { use_prev_fallback: bool, } -#[cfg_attr(feature = "debug-derive", derive(Debug))] -#[derive(Clone)] -struct CompiledModuleSlice { - #[allow(dead_code)] - ident: Ident, - context: Arc, - bytecode: Arc, - part: StepPart, -} +impl CompiledSlicedSimulation { + /// Build the indexed module table from the keyed `CompiledModule` map, + /// resolving every module declaration's `(model_name, input_set)` key to a + /// child index so the hot eval loop never reconstructs or hashes a key. + fn build(modules: &HashMap, root: &ModuleKey) -> Self { + // Stable, deterministic ordering so module indices don't depend on + // HashMap iteration order. + let mut keys: Vec<&ModuleKey> = modules.keys().collect(); + keys.sort(); + + let key_to_idx: HashMap = keys + .iter() + .enumerate() + .map(|(idx, key)| ((*key).clone(), idx as u32)) + .collect(); -impl CompiledModuleSlice { - fn new(module: &CompiledModule, part: StepPart) -> Self { - CompiledModuleSlice { - ident: module.ident.clone(), - context: module.context.clone(), - bytecode: match part { - StepPart::Flows => module.compiled_flows.clone(), - StepPart::Stocks => module.compiled_stocks.clone(), - StepPart::Initials => unreachable!("initials use CompiledModuleInitials"), - }, - part, + let resolved: Vec = keys + .iter() + .map(|key| { + let m = &modules[*key]; + // Resolve each child declaration's key to its module index. 
+ let child_targets: Vec = m + .context + .modules + .iter() + .map(|decl| { + let child_key = make_module_key(&decl.model_name, &decl.input_set); + key_to_idx[&child_key] + }) + .collect(); + // 3-address fusion (R2): fold leaf operand loads into the + // binary ops of the per-timestep flows/stocks programs. Done + // on the Vm's execution copy (not the cached CompiledModule, + // which stays a pure symbolizable artifact) so the fused + // opcodes never re-enter the symbolic layer. make_mut clones + // the bytecode out of the shared Arc once per Vm; the scan is + // linear and cheap relative to a simulation run. Initials run + // once and their AssignCurr targets are read elsewhere, so they + // are left unfused. + let mut flows = m.compiled_flows.clone(); + let mut stocks = m.compiled_stocks.clone(); + Arc::make_mut(&mut flows).fuse_three_address(); + Arc::make_mut(&mut stocks).fuse_three_address(); + ResolvedModule { + ident: m.ident.clone(), + context: m.context.clone(), + initials: m.compiled_initials.clone(), + flows, + stocks, + child_targets, + } + }) + .collect(); + + let root_idx = key_to_idx[root] as usize; + CompiledSlicedSimulation { + modules: resolved, + root_idx, + key_to_idx, } } } @@ -541,36 +594,12 @@ impl Vm { }; let rk_scratch = vec![0.0; stock_offsets.len() * 2]; + let sliced_sim = CompiledSlicedSimulation::build(&sim.modules, &sim.root); + Ok(Vm { specs: sim.specs, - root: sim.root, offsets: sim.offsets, - sliced_sim: CompiledSlicedSimulation { - initial_modules: sim - .modules - .iter() - .map(|(id, m)| { - ( - id.clone(), - CompiledModuleInitials { - ident: m.ident.clone(), - context: m.context.clone(), - initials: m.compiled_initials.clone(), - }, - ) - }) - .collect(), - flow_modules: sim - .modules - .iter() - .map(|(id, m)| (id.clone(), CompiledModuleSlice::new(m, StepPart::Flows))) - .collect(), - stock_modules: sim - .modules - .iter() - .map(|(id, m)| (id.clone(), CompiledModuleSlice::new(m, StepPart::Stocks))) - .collect(), - }, + 
sliced_sim, n_slots, n_chunks, data: Some(data), @@ -613,8 +642,7 @@ impl Vm { self.stack.clear(); let mut data = self.data.take().unwrap(); - let module_flows = &self.sliced_sim.flow_modules[&self.root]; - let module_stocks = &self.sliced_sim.stock_modules[&self.root]; + let root_idx = self.sliced_sim.root_idx; self.view_stack.clear(); self.iter_stack.clear(); @@ -673,14 +701,7 @@ impl Vm { break; } - Self::eval_step( - &self.sliced_sim, - &mut state, - module_flows, - module_stocks, - curr, - next, - ); + Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next); state.prev_values.copy_from_slice(curr); state.use_prev_fallback = false; self.prev_values_valid = true; @@ -699,14 +720,7 @@ impl Vm { let saved_time = curr[TIME_OFF]; // Stage 1: evaluate at (t, y) - Self::eval_step( - &self.sliced_sim, - &mut state, - module_flows, - module_stocks, - curr, - next, - ); + Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next); for (i, &off) in stock_offsets.iter().enumerate() { let s1 = next[off] - curr[off]; saved[i] = curr[off]; @@ -716,14 +730,7 @@ impl Vm { curr[TIME_OFF] = saved_time + dt * 0.5; // Stage 2: evaluate at (t + dt/2, y + s1/2) - Self::eval_step( - &self.sliced_sim, - &mut state, - module_flows, - module_stocks, - curr, - next, - ); + Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next); for (i, &off) in stock_offsets.iter().enumerate() { let s2 = next[off] - curr[off]; accum[i] += 2.0 * s2; @@ -731,14 +738,7 @@ impl Vm { } // Stage 3: evaluate at (t + dt/2, y + s2/2) - Self::eval_step( - &self.sliced_sim, - &mut state, - module_flows, - module_stocks, - curr, - next, - ); + Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next); for (i, &off) in stock_offsets.iter().enumerate() { let s3 = next[off] - curr[off]; accum[i] += 2.0 * s3; @@ -747,14 +747,7 @@ impl Vm { curr[TIME_OFF] = saved_time + dt; // Stage 4: evaluate at (t + dt, y + s3) - Self::eval_step( - &self.sliced_sim, - &mut state, - 
module_flows, - module_stocks, - curr, - next, - ); + Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next); for (i, &off) in stock_offsets.iter().enumerate() { let s4 = next[off] - curr[off]; accum[i] += s4; @@ -776,7 +769,8 @@ impl Vm { Self::eval( &self.sliced_sim, &mut state, - module_flows, + root_idx, + StepPart::Flows, 0, &[], curr, @@ -802,14 +796,7 @@ impl Vm { let saved_time = curr[TIME_OFF]; // Stage 1: evaluate at (t, y) - Self::eval_step( - &self.sliced_sim, - &mut state, - module_flows, - module_stocks, - curr, - next, - ); + Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next); for (i, &off) in stock_offsets.iter().enumerate() { let s1 = next[off] - curr[off]; saved[i] = curr[off]; @@ -819,14 +806,7 @@ impl Vm { curr[TIME_OFF] = saved_time + dt; // Stage 2: evaluate at (t + dt, y + s1) - Self::eval_step( - &self.sliced_sim, - &mut state, - module_flows, - module_stocks, - curr, - next, - ); + Self::eval_step(&self.sliced_sim, &mut state, root_idx, curr, next); for (i, &off) in stock_offsets.iter().enumerate() { let s2 = next[off] - curr[off]; accum[i] += s2; @@ -842,7 +822,8 @@ impl Vm { Self::eval( &self.sliced_sim, &mut state, - module_flows, + root_idx, + StepPart::Flows, 0, &[], curr, @@ -914,6 +895,18 @@ impl Vm { self.constant_info.contains_key(&off) } + /// Resolve a `ModuleKey` (carried by a `BytecodeLocation` from the + /// constant-info map) to its module index. Cold path only -- used by the + /// `set_value` / `clear_values` literal-override machinery, never the hot + /// eval loop. + fn module_idx_for(&self, module_key: &ModuleKey) -> usize { + *self + .sliced_sim + .key_to_idx + .get(module_key) + .expect("module key must exist") as usize + } + /// Read the current value of a literal at a bytecode location. 
fn read_literal(&self, loc: &BytecodeLocation) -> f64 { match loc { @@ -922,32 +915,21 @@ impl Vm { part, literal_id, } => { - let module = match part { - StepPart::Flows => self - .sliced_sim - .flow_modules - .get(module_key) - .expect("module key must exist"), - StepPart::Stocks => self - .sliced_sim - .stock_modules - .get(module_key) - .expect("module key must exist"), + let module = &self.sliced_sim.modules[self.module_idx_for(module_key)]; + let bytecode = match part { + StepPart::Flows => &module.flows, + StepPart::Stocks => &module.stocks, StepPart::Initials => unreachable!(), }; - module.bytecode.literals[*literal_id as usize] + bytecode.literals[*literal_id as usize] } BytecodeLocation::Initial { module_key, initial_index, literal_id, } => { - let initials_module = self - .sliced_sim - .initial_modules - .get(module_key) - .expect("module key must exist"); - initials_module.initials[*initial_index].bytecode.literals[*literal_id as usize] + let module = &self.sliced_sim.modules[self.module_idx_for(module_key)]; + module.initials[*initial_index].bytecode.literals[*literal_id as usize] } } } @@ -961,32 +943,23 @@ impl Vm { part, literal_id, } => { - let module = match part { - StepPart::Flows => self - .sliced_sim - .flow_modules - .get_mut(module_key) - .expect("module key must exist"), - StepPart::Stocks => self - .sliced_sim - .stock_modules - .get_mut(module_key) - .expect("module key must exist"), + let idx = self.module_idx_for(module_key); + let module = &mut self.sliced_sim.modules[idx]; + let bytecode = match part { + StepPart::Flows => &mut module.flows, + StepPart::Stocks => &mut module.stocks, StepPart::Initials => unreachable!(), }; - Arc::make_mut(&mut module.bytecode).literals[*literal_id as usize] = value; + Arc::make_mut(bytecode).literals[*literal_id as usize] = value; } BytecodeLocation::Initial { module_key, initial_index, literal_id, } => { - let initials_module = self - .sliced_sim - .initial_modules - .get_mut(module_key) - 
.expect("module key must exist"); - let initials = Arc::make_mut(&mut initials_module.initials); + let idx = self.module_idx_for(module_key); + let module = &mut self.sliced_sim.modules[idx]; + let initials = Arc::make_mut(&mut module.initials); initials[*initial_index].bytecode.literals[*literal_id as usize] = value; } } @@ -1130,7 +1103,7 @@ impl Vm { Self::eval_initials( &self.sliced_sim, &mut state, - &self.root, + self.sliced_sim.root_idx, 0, module_inputs, curr, @@ -1186,48 +1159,20 @@ impl Vm { Some(series) } - /// Evaluate a submodule's initials. - #[allow(clippy::too_many_arguments)] - #[inline(never)] - fn eval_module_initials( - sliced_sim: &CompiledSlicedSimulation, - state: &mut EvalState<'_>, - parent_context: &ByteCodeContext, - parent_module_off: usize, - module_inputs: &[f64], - curr: &mut [f64], - next: &mut [f64], - id: ModuleId, - ) { - let new_module_decl = &parent_context.modules[id as usize]; - let module_key = make_module_key(&new_module_decl.model_name, &new_module_decl.input_set); - let module_off = parent_module_off + new_module_decl.off; - - Self::eval_initials( - sliced_sim, - state, - &module_key, - module_off, - module_inputs, - curr, - next, - ); - } - /// Run all per-variable initials for a module (in dependency order). 
#[allow(clippy::too_many_arguments)] fn eval_initials( sliced_sim: &CompiledSlicedSimulation, state: &mut EvalState<'_>, - module_key: &ModuleKey, + module_idx: usize, module_off: usize, module_inputs: &[f64], curr: &mut [f64], next: &mut [f64], ) { - let module_initials = &sliced_sim.initial_modules[module_key]; - let context = &module_initials.context; - for compiled_initial in module_initials.initials.iter() { + let module = &sliced_sim.modules[module_idx]; + let context = &module.context; + for compiled_initial in module.initials.iter() { Self::eval_bytecode( sliced_sim, state, @@ -1235,6 +1180,7 @@ impl Vm { &compiled_initial.bytecode, StepPart::Initials, module_off, + module_idx, module_inputs, curr, next, @@ -1244,17 +1190,35 @@ impl Vm { /// Evaluate one full integration step: compute all flows/auxes then /// update all stocks. Used by each RK stage and the Euler loop. + /// Always evaluates the root module (`module_off == 0`). #[inline(always)] fn eval_step( sliced_sim: &CompiledSlicedSimulation, state: &mut EvalState<'_>, - module_flows: &CompiledModuleSlice, - module_stocks: &CompiledModuleSlice, + module_idx: usize, curr: &mut [f64], next: &mut [f64], ) { - Self::eval(sliced_sim, state, module_flows, 0, &[], curr, next); - Self::eval(sliced_sim, state, module_stocks, 0, &[], curr, next); + Self::eval( + sliced_sim, + state, + module_idx, + StepPart::Flows, + 0, + &[], + curr, + next, + ); + Self::eval( + sliced_sim, + state, + module_idx, + StepPart::Stocks, + 0, + &[], + curr, + next, + ); } #[allow(clippy::too_many_arguments)] @@ -1262,19 +1226,27 @@ impl Vm { fn eval( sliced_sim: &CompiledSlicedSimulation, state: &mut EvalState<'_>, - module: &CompiledModuleSlice, + module_idx: usize, + part: StepPart, module_off: usize, module_inputs: &[f64], curr: &mut [f64], next: &mut [f64], ) { + let module = &sliced_sim.modules[module_idx]; + let bytecode = match part { + StepPart::Flows => &module.flows, + StepPart::Stocks => &module.stocks, + 
StepPart::Initials => unreachable!("initials are evaluated via eval_initials"), + }; Self::eval_bytecode( sliced_sim, state, &module.context, - &module.bytecode, - module.part, + bytecode, + part, module_off, + module_idx, module_inputs, curr, next, @@ -1289,6 +1261,11 @@ impl Vm { bytecode: &ByteCode, part: StepPart, module_off: usize, + // Index of the module currently executing, into + // `sliced_sim.modules`. Used to resolve `EvalModule` child targets + // without reconstructing/hashing a module key. `context` is + // `&sliced_sim.modules[module_idx].context`. + module_idx: usize, module_inputs: &[f64], curr: &mut [f64], next: &mut [f64], @@ -1416,35 +1393,29 @@ impl Vm { prev_values, use_prev_fallback, }; + // Resolve the child module by precomputed index instead of + // reconstructing + SipHashing a (model_name, input_set) key. + let child_module_off = module_off + context.modules[*id as usize].off; + let child_idx = + sliced_sim.modules[module_idx].child_targets[*id as usize] as usize; match part { StepPart::Initials => { - Self::eval_module_initials( + Self::eval_initials( sliced_sim, &mut child_state, - context, - module_off, + child_idx, + child_module_off, &module_inputs, curr, next, - *id, ); } StepPart::Flows | StepPart::Stocks => { - let new_module_decl = &context.modules[*id as usize]; - let module_key = make_module_key( - &new_module_decl.model_name, - &new_module_decl.input_set, - ); - let child_module_off = module_off + new_module_decl.off; - let child_module = match part { - StepPart::Flows => &sliced_sim.flow_modules[&module_key], - StepPart::Stocks => &sliced_sim.stock_modules[&module_key], - StepPart::Initials => unreachable!(), - }; Self::eval( sliced_sim, &mut child_state, - child_module, + child_idx, + part, child_module_off, &module_inputs, curr, @@ -1495,6 +1466,35 @@ impl Vm { next[module_off + *off as usize] = eval_op2(*op, l, r); debug_assert_eq!(0, stack.len()); } + // === 3-ADDRESS BINARY OPS (R2) === + // Operands are read straight 
from curr[]/literals; the *Stack* + // forms take the lhs from the arithmetic stack. Each pushes the + // result, replacing a Load;Load;Op2 or Load;Op2 sequence. + Opcode::BinVarVar { l, r, op } => { + let lv = curr[module_off + *l as usize]; + let rv = curr[module_off + *r as usize]; + stack.push(eval_op2(*op, lv, rv)); + } + Opcode::BinVarConst { l, r, op } => { + let lv = curr[module_off + *l as usize]; + let rv = bytecode.literals[*r as usize]; + stack.push(eval_op2(*op, lv, rv)); + } + Opcode::BinConstVar { l, r, op } => { + let lv = bytecode.literals[*l as usize]; + let rv = curr[module_off + *r as usize]; + stack.push(eval_op2(*op, lv, rv)); + } + Opcode::BinStackVar { r, op } => { + let lv = stack.pop(); + let rv = curr[module_off + *r as usize]; + stack.push(eval_op2(*op, lv, rv)); + } + Opcode::BinStackConst { r, op } => { + let lv = stack.pop(); + let rv = bytecode.literals[*r as usize]; + stack.push(eval_op2(*op, lv, rv)); + } Opcode::Apply { func } => { let time = curr[TIME_OFF]; let dt = curr[DT_OFF]; @@ -2679,16 +2679,22 @@ impl Vm { #[cfg(test)] pub fn debug_print_bytecode(&self, _model_name: &str) { - let mut module_keys: Vec<_> = self.sliced_sim.initial_modules.keys().collect(); - module_keys.sort_unstable(); - for module_key in module_keys { + // Iterate modules in key order for stable, readable output. 
+ let mut keyed: Vec<(&ModuleKey, usize)> = self + .sliced_sim + .key_to_idx + .iter() + .map(|(k, &idx)| (k, idx as usize)) + .collect(); + keyed.sort_unstable_by(|a, b| a.0.cmp(b.0)); + for (module_key, idx) in keyed { eprintln!("\n\nCOMPILED MODULE: {:?}", module_key); - let module_initials = &self.sliced_sim.initial_modules[module_key]; - let flows_bc = &self.sliced_sim.flow_modules[module_key].bytecode; - let stocks_bc = &self.sliced_sim.stock_modules[module_key].bytecode; + let module = &self.sliced_sim.modules[idx]; + let flows_bc = &module.flows; + let stocks_bc = &module.stocks; - for ci in module_initials.initials.iter() { + for ci in module.initials.iter() { eprintln!("\ninitial '{}' literals:", ci.ident); for (i, lit) in ci.bytecode.literals.iter().enumerate() { eprintln!("\t{i}: {lit}"); @@ -3457,6 +3463,48 @@ mod vm_reset_and_run_initials_tests { .expect("incremental compile should succeed") } + /// End-to-end guard for the 3-address fusion (R2), which is applied to the + /// Vm's flow/stock bytecode at construction. Uses subtraction and division + /// (non-commutative) so a swapped operand encoding in any fused form is a + /// loud failure rather than a silent miscompile. `a`, `b`, `c` are distinct + /// variables (not foldable into a literal), so each expression compiles to + /// loads + Op2 that the pass folds into BinVarVar / BinVarConst / + /// BinConstVar / BinStackVar / BinStackConst. 
+ #[test] + fn test_fused_binops_preserve_operand_order() { + let tp = TestProject::new("fusion_order") + .with_sim_time(0.0, 1.0, 1.0) + .aux("a", "20", None) + .aux("b", "5", None) + .aux("c", "2", None) + .aux("vv", "a - b", None) // BinVarVar + .aux("dvv", "a / b", None) // BinVarVar, division + .aux("vc", "a - 3", None) // BinVarConst + .aux("cv", "10 - a", None) // BinConstVar + .aux("sv", "(a - b) - c", None) // BinVarVar then BinStackVar + .aux("sc", "(a - b) - 4", None); // BinVarVar then BinStackConst + + let compiled = build_compiled(&tp); + let mut vm = Vm::new(compiled).unwrap(); + vm.run_to_end().unwrap(); + let results = vm.into_results(); + + let val = |name: &str| -> f64 { + let off = *results + .offsets + .get(&*canonicalize(name)) + .unwrap_or_else(|| panic!("missing {name}")); + results.data[off] // step 0 + }; + + assert_eq!(val("vv"), 15.0, "a - b"); + assert_eq!(val("dvv"), 4.0, "a / b"); + assert_eq!(val("vc"), 17.0, "a - 3"); + assert_eq!(val("cv"), -10.0, "10 - a"); + assert_eq!(val("sv"), 13.0, "(a - b) - c"); + assert_eq!(val("sc"), 11.0, "(a - b) - 4"); + } + #[test] fn test_vm_reset_produces_identical_results() { let tp = pop_model(); @@ -4435,13 +4483,13 @@ mod superinstruction_tests { /// Helper: collect all opcodes from the flow bytecode of the root module. fn flow_opcodes(vm: &Vm) -> Vec<&Opcode> { - let bc = &vm.sliced_sim.flow_modules[&vm.root].bytecode; + let bc = &vm.sliced_sim.modules[vm.sliced_sim.root_idx].flows; bc.code.iter().collect() } /// Helper: collect all opcodes from the stock bytecode of the root module. 
fn stock_opcodes(vm: &Vm) -> Vec<&Opcode> { - let bc = &vm.sliced_sim.stock_modules[&vm.root].bytecode; + let bc = &vm.sliced_sim.modules[vm.sliced_sim.root_idx].stocks; bc.code.iter().collect() } diff --git a/src/simlin-engine/src/vm_profile.rs b/src/simlin-engine/src/vm_profile.rs new file mode 100644 index 000000000..7674072d2 --- /dev/null +++ b/src/simlin-engine/src/vm_profile.rs @@ -0,0 +1,86 @@ +// Copyright 2026 The Simlin Authors. All rights reserved. +// Use of this source code is governed by the Apache License, +// Version 2.0, that can be found in the LICENSE file. + +//! Bytecode composition profiling for a compiled simulation. +//! +//! A diagnostics-only sibling of `vm.rs` (kept here purely for the per-file line +//! cap): `CompiledSimulation::bytecode_profile` answers "how big and what shape +//! is the compiled bytecode?" for the `clearn_profile` example and similar +//! analysis, without exposing the private `Opcode` type. + +use std::collections::BTreeMap; + +use crate::bytecode::ByteCode; +use crate::vm::CompiledSimulation; + +impl CompiledSimulation { + /// Walk every compiled module's bytecode and tables to produce an aggregate + /// composition profile. 
+ pub fn bytecode_profile(&self) -> BytecodeProfile { + let mut p = BytecodeProfile { + n_modules: self.modules.len(), + n_slots_root: self.n_slots(), + ..Default::default() + }; + + let mut tally = |bc: &ByteCode, hist: &mut BTreeMap<&'static str, usize>| { + p.total_literals += bc.literals.len(); + for op in bc.code.iter() { + *hist.entry(op.name()).or_insert(0) += 1; + } + bc.code.len() + }; + + for module in self.modules.values() { + p.flow_opcodes += tally(&module.compiled_flows, &mut p.histogram); + p.flow_opcodes_after_fusion += module.compiled_flows.estimate_fused_len(); + p.stock_opcodes += tally(&module.compiled_stocks, &mut p.histogram); + for ci in module.compiled_initials.iter() { + p.n_initials += 1; + p.initial_opcodes += tally(&ci.bytecode, &mut p.histogram); + } + + let ctx = &module.context; + p.graphical_functions += ctx.graphical_functions.len(); + p.graphical_function_points += ctx + .graphical_functions + .iter() + .map(|gf| gf.len()) + .sum::(); + p.temp_storage_slots += ctx.temp_total_size; + p.dimensions += ctx.dimensions.len(); + p.static_views += ctx.static_views.len(); + p.dim_lists += ctx.dim_lists.len(); + p.names += ctx.names.len(); + } + + p.total_opcodes = p.flow_opcodes + p.stock_opcodes + p.initial_opcodes; + p + } +} + +/// Aggregate composition of a compiled simulation's bytecode and side tables. +/// Produced by [`CompiledSimulation::bytecode_profile`]. `histogram` maps each +/// opcode variant name to its occurrence count across all modules and phases. +#[derive(Default, Clone)] +pub struct BytecodeProfile { + pub n_modules: usize, + pub n_slots_root: usize, + pub total_opcodes: usize, + pub flow_opcodes: usize, + /// Estimated flow opcode count after a 3-address fusion pass (R2 sizing). 
+ pub flow_opcodes_after_fusion: usize, + pub stock_opcodes: usize, + pub initial_opcodes: usize, + pub n_initials: usize, + pub total_literals: usize, + pub graphical_functions: usize, + pub graphical_function_points: usize, + pub temp_storage_slots: usize, + pub dimensions: usize, + pub static_views: usize, + pub dim_lists: usize, + pub names: usize, + pub histogram: BTreeMap<&'static str, usize>, +} diff --git a/src/simlin-mcp/Cargo.toml b/src/simlin-mcp/Cargo.toml index 574369189..c68722d19 100644 --- a/src/simlin-mcp/Cargo.toml +++ b/src/simlin-mcp/Cargo.toml @@ -18,6 +18,7 @@ path = "src/main.rs" simlin-engine = { version = "0.1", path = "../simlin-engine", features = ["schema"] } simlin-mcp-core = { version = "0.1", path = "../simlin-mcp-core" } rmcp = { version = "1", features = ["server", "macros", "transport-io"] } +mimalloc = "0.1" serde_json = "1" anyhow = "1" tokio = { version = "1", features = ["macros", "rt-multi-thread", "fs"] } diff --git a/src/simlin-mcp/src/main.rs b/src/simlin-mcp/src/main.rs index 3f06724e9..0c6241110 100644 --- a/src/simlin-mcp/src/main.rs +++ b/src/simlin-mcp/src/main.rs @@ -19,6 +19,12 @@ //! simlin-mcp --version # print version //! ``` +// mimalloc on native builds: the engine compile path is allocation-heavy +// (millions of small, short-lived allocations); mimalloc roughly halves the +// allocator time vs the system malloc. See docs/design/engine-performance.md. 
+#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + use rmcp::{ServiceExt, transport::stdio}; use simlin_mcp::access::FileSystemAccess; use simlin_mcp_core::server::{ResourceContent, SimlinMcpServer}; diff --git a/src/simlin-serve/Cargo.toml b/src/simlin-serve/Cargo.toml index 6c2d8b61a..ebd7cdff4 100644 --- a/src/simlin-serve/Cargo.toml +++ b/src/simlin-serve/Cargo.toml @@ -15,6 +15,7 @@ name = "simlin-serve" path = "src/main.rs" [dependencies] +mimalloc = "0.1" tokio = { version = "1", features = ["full"] } axum = { version = "0.8", features = ["ws"] } tower-http = { version = "0.6", features = ["limit", "trace"] } diff --git a/src/simlin-serve/src/main.rs b/src/simlin-serve/src/main.rs index 1600a9a6d..c38c7b9c3 100644 --- a/src/simlin-serve/src/main.rs +++ b/src/simlin-serve/src/main.rs @@ -4,6 +4,13 @@ #![deny(unsafe_code)] +// mimalloc on native builds: the engine compile path is allocation-heavy +// (millions of small, short-lived allocations); mimalloc roughly halves the +// allocator time vs the system malloc. See docs/design/engine-performance.md. +// `#[global_allocator]` is a safe item, so it stands under `deny(unsafe_code)`. +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + use std::sync::Arc; use tracing_subscriber::EnvFilter;